From f26c9ab3f40f79f274bdec3c599bb45b953dadd1 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Wed, 26 May 2021 15:18:27 +0200
Subject: [PATCH 01/26] PoC

---
 pytorch_lightning/callbacks/early_stopping.py |  2 +-
 .../callbacks/model_checkpoint.py             | 44 ++++++++++++++++---
 pytorch_lightning/trainer/evaluation_loop.py  |  3 --
 pytorch_lightning/trainer/properties.py       |  5 ---
 pytorch_lightning/trainer/trainer.py          |  2 +-
 pytorch_lightning/trainer/training_loop.py    | 29 +-----------
 6 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 242eeed808f34..f0c1a3a95819e 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -97,7 +97,7 @@ def __init__(
         check_finite: bool = True,
         stopping_threshold: Optional[float] = None,
         divergence_threshold: Optional[float] = None,
-        check_on_train_epoch_end: bool = False,
+        check_on_train_epoch_end: bool = True,
     ):
         super().__init__()
         self.monitor = monitor
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 7642ad95d08bf..1bf8046dcee5b 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -118,6 +118,7 @@ class ModelCheckpoint(Callback):
             will only save checkpoints at epochs 0 < E <= N
             where both values for ``every_n_val_epochs`` and ``check_val_every_n_epoch`` evenly divide E.
         period: Interval (number of epochs) between checkpoints.
+        save_on_train_epoch_end: TODO

             .. warning::
                This argument has been deprecated in v1.3 and will be removed in v1.5.
@@ -202,6 +203,7 @@ def __init__(
         train_time_interval: Optional[timedelta] = None,
         every_n_val_epochs: Optional[int] = None,
         period: Optional[int] = None,
+        save_on_train_epoch_end: bool = True,
     ):
         super().__init__()
         self.monitor = monitor
@@ -210,6 +212,7 @@ def __init__(
         self.save_top_k = save_top_k
         self.save_weights_only = save_weights_only
         self.auto_insert_metric_name = auto_insert_metric_name
+        self._save_on_train_epoch_end = save_on_train_epoch_end
         self._last_global_step_saved = -1
         self._last_time_checked: Optional[float] = None
         self.current_score = None
@@ -267,16 +270,47 @@ def on_train_batch_end(

         self.save_checkpoint(trainer)

+    def on_train_epoch_end(
+        self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule', unused: Optional = None
+    ) -> None:
+        """ Save a checkpoint at the end of the training epoch. """
+        if (
+            self._should_skip_saving_checkpoint(trainer) or not self._save_on_train_epoch_end
+            # TODO: should every_n_val_epochs be repurposed to work for this too?
+        ):
+            return
+        # as we advance one step at the end of the epoch, we use `global_step - 1` to avoid saving duplicates
+        trainer.train_loop.global_step -= 1
+        self.save_checkpoint(trainer)
+        trainer.train_loop.global_step += 1
+
     def on_validation_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None:
         """ Save a checkpoint at the end of the validation stage. """
-        skip = (
-            self._should_skip_saving_checkpoint(trainer) or self._every_n_val_epochs < 1
-            or (trainer.current_epoch + 1) % self._every_n_val_epochs != 0
-        )
-        if skip:
+        if (
+            self._should_skip_saving_checkpoint(trainer) or self._save_on_train_epoch_end
+            or self._every_n_val_epochs < 1 or (trainer.current_epoch + 1) % self._every_n_val_epochs != 0
+        ):
             return
         self.save_checkpoint(trainer)

+    def on_train_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None:
+        """
+        Save a checkpoint at the very end of training.
+
+        This will only save a checkpoint if `save_last` is also enabled
+        as the monitor metrics produced by training or validation steps or end of epochs
+        are not guaranteed to be available at this stage.
+        """
+        if self._should_skip_saving_checkpoint(trainer) or not trainer.checkpoint_connector.has_trained:
+            return
+        if self.save_last and self.verbose:
+            rank_zero_info("Saving last checkpoint...")
+        # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates
+        trainer.train_loop.global_step -= 1
+        monitor_candidates = self._monitor_candidates(trainer)
+        self._save_last_checkpoint(trainer, monitor_candidates)
+        trainer.train_loop.global_step += 1
+
     def on_save_checkpoint(
         self,
         trainer: 'pl.Trainer',
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index f048297892533..810efef3fa52b 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -72,9 +72,6 @@ def get_evaluation_dataloaders(self) -> Tuple[Optional[List[DataLoader]], List[U
             dataloaders = self.trainer.val_dataloaders
         return dataloaders, max_batches

-    def should_skip_evaluation(self, max_batches: List[Union[int, float]]) -> bool:
-        return sum(max_batches) == 0
-
     def on_evaluation_start(self, *args: Any, **kwargs: Any) -> None:
         self.should_track_batch_outputs_for_epoch_end: bool = self._should_track_batch_outputs_for_epoch_end()
         if self.trainer.testing:
diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py
index e469d1bc12394..440a6693aba43 100644
--- a/pytorch_lightning/trainer/properties.py
+++ b/pytorch_lightning/trainer/properties.py
@@ -258,11 +258,6 @@ def progress_bar_dict(self) -> dict:
             all_metrics.update(**logged_metrics)
         return all_metrics

-    @property
-    def disable_validation(self) -> bool:
-        """ Check if validation is disabled during training. """
-        return not self.enable_validation
-
     @property
     def enable_validation(self) -> bool:
         """ Check if we should run validation during training. """
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index b01f4fa36bd33..b24d6d7b2da48 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -940,7 +940,7 @@ def _run_evaluation(self) -> _EVALUATE_OUTPUT:
         dataloaders, max_batches = self.evaluation_loop.get_evaluation_dataloaders()

         # check if we want to skip this evaluation
-        if self.evaluation_loop.should_skip_evaluation(max_batches):
+        if sum(max_batches) == 0:
             return [], []

         # enable eval mode + no grads
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 09a32c3c96aad..ea33241b7a4af 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -27,7 +27,6 @@
 from pytorch_lightning.plugins import ParallelPlugin
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
 from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType
-from pytorch_lightning.utilities.distributed import rank_zero_info
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.finite_checks import detect_nan_parameters
 from pytorch_lightning.utilities.grads import grad_norm
@@ -107,12 +106,6 @@ def on_train_end(self):
             return
         self._teardown_already_run = True

-        # trigger checkpoint check. need to temporarily decrease the global step to avoid saving duplicates
-        # when a checkpoint was saved at the last step
-        self.global_step -= 1
-        self.check_checkpoint_callback(should_update=True, is_last=True)
-        self.global_step += 1
-
         # hook
         self.trainer.call_hook("on_train_end")

@@ -131,19 +124,6 @@ def on_train_end(self):
         # reset bookkeeping
         self.trainer.state.stage = None

-    def check_checkpoint_callback(self, should_update, is_last=False):
-        # TODO bake this logic into the ModelCheckpoint callback
-        if should_update and self.trainer.checkpoint_connector.has_trained:
-            callbacks = self.trainer.checkpoint_callbacks
-
-            if is_last and any(cb.save_last and cb.verbose for cb in callbacks):
-                rank_zero_info("Saving latest checkpoint...")
-
-            model = self.trainer.lightning_module
-
-            for cb in callbacks:
-                cb.on_validation_end(self.trainer, model)
-
     def on_train_epoch_start(self, epoch):

         # update training progress in trainer
@@ -540,6 +520,7 @@ def run_training_epoch(self):
             return

         # handle epoch_output on epoch end
+        # TODO: this can log, but ModelCheckpoint won't have access to those metrics since the logger connector is updated after
         self.on_train_epoch_end(epoch_output)

         # the global step is manually decreased here due to backwards compatibility with existing loggers
@@ -553,14 +534,6 @@
         self.update_lr_schedulers('epoch')

-        did_train_only = self.trainer.disable_validation or self.trainer.evaluation_loop.should_skip_evaluation(
-            self.trainer.num_val_batches
-        )
-        if did_train_only:
-            self.global_step -= 1
-            self.check_checkpoint_callback(True)
-            self.global_step += 1
-
     def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
         # inform logger the batch loop has finished
         self.trainer.logger_connector.on_train_epoch_end()
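For orientation, a minimal usage sketch of the two flags this PoC wires up (`check_on_train_epoch_end` on `EarlyStopping` and `save_on_train_epoch_end` on `ModelCheckpoint`, both defaulting to `True` in the signatures above); the behaviour is still subject to change in the remaining patches of this series:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

# Run both checks at `on_train_epoch_end` (the new default in this PoC).
# Setting the flags to False defers them to `on_validation_end` instead.
early_stopping = EarlyStopping(monitor="train_loss", check_on_train_epoch_end=True)
checkpointing = ModelCheckpoint(monitor="train_loss", save_on_train_epoch_end=True)

trainer = Trainer(callbacks=[early_stopping, checkpointing])
```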
From e2acb7871ec2808eec01edb1215079bfec457eb0 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Sun, 4 Jul 2021 14:28:29 +0200
Subject: [PATCH 02/26] Update code to new loops

---
 pytorch_lightning/loops/fit_loop.py | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py
index a7699eaec812c..c7207f2cf833f 100644
--- a/pytorch_lightning/loops/fit_loop.py
+++ b/pytorch_lightning/loops/fit_loop.py
@@ -21,7 +21,6 @@
 from pytorch_lightning.loops.epoch import TrainingEpochLoop
 from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
-from pytorch_lightning.utilities import rank_zero_info

 log = logging.getLogger(__name__)

@@ -229,12 +228,6 @@ def on_advance_end(self) -> None:

         self.epoch_loop.update_lr_schedulers('epoch', update_plateau_schedulers=True)

-        did_train_only = self.trainer.disable_validation or self.epoch_loop.val_loop.skip
-        if did_train_only:
-            self.global_step -= 1
-            self._check_checkpoint_callback(True)
-            self.global_step += 1
-
     def on_run_end(self) -> None:
         """Calls the ``on_train_end`` hook"""
         # NOTE: the iteration_count/current_epoch is already incremented
         # TODO: must be fixed by https://github.com/PyTorchLightning/pytorch-lightning/issues/5007
         self.current_epoch -= 1

-        # trigger checkpoint check. need to temporarily decrease the global step to avoid saving duplicates
-        # when a checkpoint was saved at the last step
-        self.epoch_loop.global_step -= 1
-        # TODO: see discussion/rework https://github.com/PyTorchLightning/pytorch-lightning/issues/7406
-        self._check_checkpoint_callback(should_update=True, is_last=True)
-        self.epoch_loop.global_step += 1
-
         # hook
         self.trainer.call_hook("on_train_end")

@@ -269,20 +255,6 @@ def should_accumulate(self) -> bool:
         """Whether the gradients should be accumulated"""
         return self.epoch_loop.batch_loop.should_accumulate()

-    def _check_checkpoint_callback(self, should_update: bool, is_last: bool = False):
-        """Checks if checkpointing needs to be done"""
-        # TODO: bake this logic into the ModelCheckpoint callback
-        if should_update and self.trainer.checkpoint_connector.has_trained:
-            callbacks = self.trainer.checkpoint_callbacks
-
-            if is_last and any(cb.save_last and cb.verbose for cb in callbacks):
-                rank_zero_info("Saving latest checkpoint...")
-
-            model = self.trainer.lightning_module
-
-            for cb in callbacks:
-                cb.on_validation_end(self.trainer, model)
-
     def state_dict(self) -> Dict:
         return {"epoch_loop": self.epoch_loop.state_dict()}

From 7b348dbd1d13cc40dcc4234e0fe25dafe6c92f00 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Sun, 4 Jul 2021 14:34:19 +0200
Subject: [PATCH 03/26] Pass through function

---
 pytorch_lightning/callbacks/model_checkpoint.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 5152aa9924a7c..87dbcfe59b6c9 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -307,10 +307,8 @@ def on_train_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None:
         if self.save_last and self.verbose:
             rank_zero_info("Saving last checkpoint...")
         # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates
-        trainer.train_loop.global_step -= 1
-        monitor_candidates = self._monitor_candidates(trainer)
+        monitor_candidates = self._monitor_candidates(trainer, trainer.current_epoch, trainer.global_step - 1)
         self._save_last_checkpoint(trainer, monitor_candidates)
-        trainer.train_loop.global_step += 1

     def on_save_checkpoint(
         self,
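Patches 01-03 keep rewinding `global_step` by one before saving because the loop advances the step counter before the end-of-training hooks run, while the callback deduplicates saves by step. A sketch of that guard, assuming `_should_skip_saving_checkpoint` compares against the `_last_global_step_saved` attribute initialised in patch 01 (the first two conditions mirror the callback's existing behaviour and are assumptions here):

```python
def should_skip_saving_checkpoint(callback, trainer) -> bool:
    """Sketch of the duplicate-save guard the `global_step - 1` dance works around.

    `callback._last_global_step_saved` starts at -1 (see patch 01) and is updated
    on every save; comparing it to `trainer.global_step` makes a second save at
    the same step a no-op, which is why the end-of-training hooks rewind the
    counter by one before saving.
    """
    return (
        trainer.fast_dev_run  # checkpointing is disabled under `fast_dev_run`
        or trainer.sanity_checking  # never save during the validation sanity check
        or callback._last_global_step_saved == trainer.global_step  # already saved at this step
    )
```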
From db2a6e54c10a57d96f17bb304278e91dd2b57850 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Sun, 4 Jul 2021 17:13:54 +0200
Subject: [PATCH 04/26] Update after loop refactor

---
 .azure-pipelines/gpu-tests.yml | 17 +-
 .azure-pipelines/ipu-tests.yml | 16 +-
 .circleci/config.yml | 8 +-
 .deepsource.toml | 26 +
 .github/BECOMING_A_CORE_CONTRIBUTOR.md | 16 +-
 .github/CODEOWNERS | 5 +
 .github/CONTRIBUTING.md | 30 +-
 .github/ISSUE_TEMPLATE/bug_report.md | 9 +-
 .github/workflows/ci_test-conda.yml | 2 +-
 .github/workflows/ci_test-full.yml | 2 +-
 .github/workflows/code-formatting.yml | 20 +
 .github/workflows/docs-checks.yml | 39 +-
 .github/workflows/release-pypi.yml | 2 +-
 .gitignore | 9 +-
 .gitmodules | 4 +
 .pre-commit-config.yaml | 38 +-
 .readthedocs.yml | 4 +
 CHANGELOG.md | 302 +-
 MANIFEST.in | 6 +-
 Makefile | 1 +
 README.md | 15 +-
 _notebooks | 1 +
 benchmarks/test_basic_parity.py | 2 +-
 dockers/base-cuda/Dockerfile | 3 +-
 dockers/nvidia/Dockerfile | 8 +-
 dockers/tpu-tests/tpu_test_cases.jsonnet | 1 +
 .../images/accelerator/ipus/profiler.png | Bin 0 -> 129635 bytes
 docs/source/_templates/layout.html | 10 +
 docs/source/_templates/theme_variables.jinja | 2 +
 docs/source/advanced/advanced_gpu.rst | 100 +-
 docs/source/advanced/amp.rst | 94 -
 docs/source/advanced/ipu.rst | 234 +
 docs/source/advanced/multi_gpu.rst | 48 +-
 docs/source/advanced/multiple_loaders.rst | 17 -
 docs/source/api_references.rst | 11 +-
 docs/source/benchmarking/performance.rst | 183 -
 docs/source/clouds/cloud_training.rst | 33 +-
 docs/source/common/fast_training.rst | 82 -
 docs/source/common/lightning_cli.rst | 181 +-
 docs/source/common/lightning_module.rst | 218 +-
 docs/source/common/loggers.rst | 13 +-
 docs/source/common/optimizers.rst | 82 -
 docs/source/common/test_set.rst | 4 +-
 docs/source/common/trainer.rst | 85 +-
 docs/source/conf.py | 36 +-
 docs/source/ecosystem/asr_nlp_tts.rst | 4 +-
 docs/source/extensions/datamodules.rst | 2 +-
 docs/source/extensions/logging.rst | 4 +
 docs/source/extensions/plugins.rst | 2 -
 docs/source/guides/speed.rst | 482 ++
 docs/source/index.rst | 14 +-
 docs/source/starter/new-project.rst | 2 +-
 notebooks/01-mnist-hello-world.ipynb | 448 -
 notebooks/02-datamodules.ipynb | 588 --
 notebooks/03-basic-gan.ipynb | 472 --
 .../04-transformers-text-classification.ipynb | 599 --
 notebooks/05-trainer-flags-overview.ipynb | 2926 -------
 notebooks/06-mnist-tpu-training.ipynb | 368 -
 notebooks/07-cifar10-baseline.ipynb | 394 -
 notebooks/08-Domain-specific-demos.ipynb | 7415 -----------------
 notebooks/README.md | 15 -
 pl_examples/basic_examples/autoencoder.py | 13 +-
 .../backbone_image_classifier.py | 11 +-
 .../basic_examples/conv_sequential_example.py | 226 -
 .../basic_examples/profiler_example.py | 4 +
 .../basic_examples/simple_image_classifier.py | 2 +-
 pl_examples/bug_report_model.py | 4 +-
 .../domain_templates/reinforce_learn_Qnet.py | 4 +-
 .../domain_templates/reinforce_learn_ppo.py | 6 +-
 pl_examples/ipu_examples/__init__.py | 0
 pl_examples/ipu_examples/mnist.py | 89 +
 pyproject.toml | 25 +
 pytorch_lightning/accelerators/__init__.py | 1 +
 pytorch_lightning/accelerators/accelerator.py | 76 +-
 pytorch_lightning/accelerators/gpu.py | 5 +-
 pytorch_lightning/accelerators/ipu.py | 35 +
 pytorch_lightning/callbacks/early_stopping.py | 18 +-
 pytorch_lightning/callbacks/finetuning.py | 26 +-
 pytorch_lightning/callbacks/lr_monitor.py | 111 +-
 .../callbacks/model_checkpoint.py | 62 +-
 .../callbacks/prediction_writer.py | 2 +-
 pytorch_lightning/callbacks/progress.py | 2 +-
 pytorch_lightning/callbacks/pruning.py | 25 +-
 .../callbacks/stochastic_weight_avg.py | 13 +-
 pytorch_lightning/callbacks/timer.py | 3 +-
 pytorch_lightning/core/datamodule.py | 65 +-
 pytorch_lightning/core/grads.py | 2 +-
 pytorch_lightning/core/hooks.py | 27 +-
 pytorch_lightning/core/lightning.py | 320 +-
 pytorch_lightning/core/memory.py | 83 +-
 pytorch_lightning/core/optimizer.py | 2 +-
 pytorch_lightning/core/saving.py | 12 +-
 pytorch_lightning/core/step_result.py | 613 --
 pytorch_lightning/loggers/base.py | 23 +-
 pytorch_lightning/loggers/comet.py | 4 +-
 pytorch_lightning/loggers/csv_logs.py | 3 +-
 pytorch_lightning/loggers/neptune.py | 6 +-
 pytorch_lightning/loggers/tensorboard.py | 15 +-
 pytorch_lightning/loggers/test_tube.py | 8 +-
 pytorch_lightning/loggers/wandb.py | 93 +-
 pytorch_lightning/loops/__init__.py | 19 +
 pytorch_lightning/loops/base.py | 158 +
 pytorch_lightning/loops/batch/__init__.py | 15 +
 .../loops/batch/training_batch_loop.py | 677 ++
 .../loops/dataloader/__init__.py | 17 +
 .../loops/dataloader/dataloader_loop.py | 53 +
 .../loops/dataloader/evaluation_loop.py | 269 +
 .../loops/dataloader/prediction_loop.py | 151 +
 pytorch_lightning/loops/epoch/__init__.py | 17 +
 .../loops/epoch/evaluation_epoch_loop.py | 255 +
 .../loops/epoch/prediction_epoch_loop.py | 151 +
 .../loops/epoch/training_epoch_loop.py | 426 +
 pytorch_lightning/loops/fit_loop.py | 265 +
 pytorch_lightning/metrics/__init__.py | 6 -
 .../metrics/classification/accuracy.py | 4 +-
 .../metrics/classification/auc.py | 4 +-
 .../metrics/classification/auroc.py | 4 +-
 .../classification/average_precision.py | 4 +-
 .../classification/confusion_matrix.py | 4 +-
 .../metrics/classification/f_beta.py | 8 +-
 .../classification/hamming_distance.py | 4 +-
 .../metrics/classification/iou.py | 8 +-
 .../classification/precision_recall.py | 10 +-
 .../classification/precision_recall_curve.py | 4 +-
 .../metrics/classification/roc.py | 4 +-
 .../metrics/classification/stat_scores.py | 8 +-
 pytorch_lightning/metrics/compositional.py | 3 +-
 .../metrics/functional/accuracy.py | 3 +-
 pytorch_lightning/metrics/functional/auc.py | 3 +-
 pytorch_lightning/metrics/functional/auroc.py | 3 +-
 .../metrics/functional/average_precision.py | 3 +-
 .../metrics/functional/confusion_matrix.py | 3 +-
 .../metrics/functional/explained_variance.py | 3 +-
 .../metrics/functional/f_beta.py | 8 +-
 .../metrics/functional/hamming_distance.py | 3 +-
 .../metrics/functional/image_gradients.py | 3 +-
 pytorch_lightning/metrics/functional/iou.py | 3 +-
 .../metrics/functional/mean_absolute_error.py | 3 +-
 .../metrics/functional/mean_relative_error.py | 3 +-
 .../metrics/functional/mean_squared_error.py | 3 +-
 .../functional/mean_squared_log_error.py | 3 +-
 pytorch_lightning/metrics/functional/nlp.py | 3 +-
 .../metrics/functional/precision_recall.py | 11 +-
 .../functional/precision_recall_curve.py | 3 +-
 pytorch_lightning/metrics/functional/psnr.py | 3 +-
 .../metrics/functional/r2score.py | 3 +-
 pytorch_lightning/metrics/functional/roc.py | 3 +-
 .../metrics/functional/self_supervised.py | 3 +-
 pytorch_lightning/metrics/functional/ssim.py | 3 +-
 .../metrics/functional/stat_scores.py | 5 +-
 pytorch_lightning/metrics/metric.py | 4 +-
 .../metrics/regression/explained_variance.py | 4 +-
 .../metrics/regression/mean_absolute_error.py | 4 +-
 .../metrics/regression/mean_squared_error.py | 4 +-
 .../regression/mean_squared_log_error.py | 4 +-
 pytorch_lightning/metrics/regression/psnr.py | 4 +-
 .../metrics/regression/r2score.py | 4 +-
 pytorch_lightning/metrics/regression/ssim.py | 4 +-
 pytorch_lightning/metrics/utils.py | 16 +-
 pytorch_lightning/overrides/base.py | 58 +-
 pytorch_lightning/overrides/data_parallel.py | 4 +-
 pytorch_lightning/overrides/distributed.py | 10 +-
 pytorch_lightning/overrides/fairscale.py | 4 +-
 pytorch_lightning/plugins/__init__.py | 8 +-
 .../plugins/precision/apex_amp.py | 18 +-
 pytorch_lightning/plugins/precision/double.py | 100 +-
 .../plugins/precision/ipu_precision.py | 60 +
 .../plugins/precision/native_amp.py | 23 +-
 .../plugins/precision/precision_plugin.py | 3 +-
 .../plugins/training_type/__init__.py | 2 -
 .../plugins/training_type/ddp.py | 153 +-
 .../plugins/training_type/ddp2.py | 20 +-
 .../plugins/training_type/ddp_spawn.py | 59 +-
 .../plugins/training_type/deepspeed.py | 320 +-
 pytorch_lightning/plugins/training_type/dp.py | 37 +-
 .../plugins/training_type/horovod.py | 5 +-
 .../plugins/training_type/ipu.py | 393 +
 .../plugins/training_type/parallel.py | 15 +-
 .../plugins/training_type/rpc.py | 85 -
 .../plugins/training_type/rpc_sequential.py | 408 -
 .../plugins/training_type/sharded.py | 7 +-
 .../plugins/training_type/sharded_spawn.py | 4 +-
 .../plugins/training_type/single_device.py | 3 +-
 .../plugins/training_type/single_tpu.py | 29 +-
 .../plugins/training_type/tpu_spawn.py | 12 +-
 .../training_type/training_type_plugin.py | 105 +-
 pytorch_lightning/profiler/__init__.py | 12 +-
 pytorch_lightning/profiler/advanced.py | 92 +
 pytorch_lightning/profiler/base.py | 219 +
 pytorch_lightning/profiler/profilers.py | 409 +-
 pytorch_lightning/profiler/pytorch.py | 33 +-
 pytorch_lightning/profiler/simple.py | 123 +
 pytorch_lightning/profiler/xla.py | 110 +
 pytorch_lightning/trainer/callback_hook.py | 20 +-
 .../trainer/configuration_validator.py | 25 +-
 .../connectors/accelerator_connector.py | 64 +-
 .../trainer/connectors/callback_connector.py | 12 +-
 .../connectors/checkpoint_connector.py | 302 +-
 .../trainer/connectors/data_connector.py | 75 +-
 .../trainer/connectors/debugging_connector.py | 13 +-
 .../trainer/connectors/env_vars_connector.py | 2 +-
 .../logger_connector/epoch_result_store.py | 493 --
 .../logger_connector/fx_validator.py | 41 +-
 .../logger_connector/logger_connector.py | 499 +-
 .../logger_connector/metrics_holder.py | 82 -
 .../connectors/logger_connector/result.py | 700 ++
 .../trainer/connectors/optimizer_connector.py | 15 +-
 .../connectors/training_trick_connector.py | 3 +-
 pytorch_lightning/trainer/data_loading.py | 38 +-
 pytorch_lightning/trainer/deprecated_api.py | 10 +-
 pytorch_lightning/trainer/evaluation_loop.py | 252 -
 pytorch_lightning/trainer/logging.py | 2 +-
 pytorch_lightning/trainer/model_hooks.py | 8 +-
 pytorch_lightning/trainer/optimizers.py | 4 +-
 pytorch_lightning/trainer/predict_loop.py | 164 -
 pytorch_lightning/trainer/progress.py | 164 +-
 pytorch_lightning/trainer/properties.py | 355 +-
 pytorch_lightning/trainer/supporters.py | 13 +-
 pytorch_lightning/trainer/trainer.py | 539 +-
 pytorch_lightning/trainer/training_loop.py | 944 ---
 pytorch_lightning/trainer/training_tricks.py | 4 +-
 pytorch_lightning/tuner/batch_size_scaling.py | 16 +-
 pytorch_lightning/tuner/lr_finder.py | 20 +-
 pytorch_lightning/tuner/tuning.py | 37 +-
 pytorch_lightning/utilities/__init__.py | 13 +-
 pytorch_lightning/utilities/apply_func.py | 129 +-
 pytorch_lightning/utilities/argparse.py | 20 +-
 pytorch_lightning/utilities/cli.py | 253 +-
 pytorch_lightning/utilities/cloud_io.py | 5 +-
 pytorch_lightning/utilities/data.py | 10 +-
 pytorch_lightning/utilities/debugging.py | 13 +-
 pytorch_lightning/utilities/device_parser.py | 50 +-
 pytorch_lightning/utilities/distributed.py | 56 +-
 pytorch_lightning/utilities/enums.py | 2 +-
 pytorch_lightning/utilities/exceptions.py | 10 +-
 pytorch_lightning/utilities/finite_checks.py | 8 +-
 pytorch_lightning/utilities/imports.py | 9 +-
 pytorch_lightning/utilities/memory.py | 15 +-
 pytorch_lightning/utilities/metrics.py | 35 +-
 pytorch_lightning/utilities/model_helpers.py | 71 +-
 pytorch_lightning/utilities/parsing.py | 28 +-
 pytorch_lightning/utilities/seed.py | 8 +-
 pytorch_lightning/utilities/types.py | 19 +-
 pytorch_lightning/utilities/warnings.py | 42 +-
 requirements.txt | 9 +-
 requirements/adjust_versions.py | 5 +-
 requirements/docs.txt | 8 +-
 requirements/extra.txt | 2 +-
 setup.cfg | 8 +
 .../test_accelerator_connector.py | 76 +-
 tests/accelerators/test_cpu.py | 110 +
 tests/accelerators/test_ddp.py | 35 +-
 tests/accelerators/test_ipu.py | 547 ++
 tests/accelerators/test_multi_nodes_gpu.py | 13 +-
 tests/base/model_train_steps.py | 31 +-
 tests/callbacks/test_callback_hook_outputs.py | 2 +-
 tests/callbacks/test_callbacks.py | 158 +-
 tests/callbacks/test_early_stopping.py | 6 +-
 tests/callbacks/test_finetuning_callback.py | 43 +-
 tests/callbacks/test_lambda_function.py | 13 +-
 tests/callbacks/test_lr_monitor.py | 173 +
 tests/callbacks/test_progress_bar.py | 76 +-
 tests/callbacks/test_pruning.py | 43 +-
 tests/callbacks/test_stochastic_weight_avg.py | 26 +-
 tests/callbacks/test_timer.py | 7 +-
 .../test_checkpoint_callback_frequency.py | 15 +-
 .../checkpointing/test_legacy_checkpoints.py | 6 +
 tests/checkpointing/test_model_checkpoint.py | 74 +-
 tests/conftest.py | 9 +
 tests/core/test_datamodules.py | 72 +-
 tests/core/test_lightning_module.py | 23 -
 tests/core/test_lightning_optimizer.py | 3 +-
 tests/core/test_memory.py | 112 +-
 tests/core/test_metric_result_integration.py | 274 +-
 tests/core/test_results.py | 70 +-
 tests/deprecated_api/test_remove_1-4.py | 13 +
 tests/deprecated_api/test_remove_1-5.py | 22 +-
 tests/deprecated_api/test_remove_1-6.py | 196 +-
 tests/helpers/advanced_models.py | 19 +-
 tests/helpers/boring_model.py | 26 +-
 tests/helpers/datasets.py | 2 +-
 tests/helpers/pipelines.py | 4 +-
 tests/helpers/runif.py | 21 +-
 tests/loggers/test_all.py | 3 +
 tests/loggers/test_base.py | 5 +
 tests/loggers/test_tensorboard.py | 15 +-
 tests/loggers/test_wandb.py | 77 +-
 tests/loops/__init__.py | 0
 tests/loops/test_loop_state_dict.py | 54 +
 tests/loops/test_loops.py | 74 +
 tests/metrics/test_metric_lightning.py | 81 +-
 tests/metrics/test_remove_1-5_metrics.py | 2 +-
 tests/metrics/utils.py | 16 +-
 .../data/horovod/train_default_model.py | 2 +-
 tests/models/test_cpu.py | 22 +-
 tests/models/test_gpu.py | 26 +
 tests/models/test_grad_norm.py | 35 +-
 tests/models/test_hooks.py | 861 +-
 tests/models/test_horovod.py | 4 +-
 tests/models/test_hparams.py | 19 +
 tests/models/test_restore.py | 24 +-
 tests/models/test_tpu.py | 21 +-
 tests/overrides/test_base.py | 44 +
 tests/overrides/test_distributed.py | 15 +-
 tests/plugins/test_amp_plugins.py | 41 +
 tests/plugins/test_cluster_integration.py | 3 +-
 tests/plugins/test_ddp_plugin.py | 30 +
 tests/plugins/test_ddp_spawn_plugin.py | 41 +-
 tests/plugins/test_deepspeed_plugin.py | 102 +-
 tests/plugins/test_double_plugin.py | 53 +-
 tests/plugins/test_plugins_registry.py | 20 +-
 tests/plugins/test_rpc_plugin.py | 89 -
 tests/plugins/test_rpc_sequential_plugin.py | 185 -
 tests/plugins/test_sharded_plugin.py | 18 +-
 tests/plugins/test_single_device_plugin.py | 2 +-
 tests/plugins/test_tpu_spawn.py | 6 +-
 tests/profiler/__init__.py | 0
 tests/{ => profiler}/test_profiler.py | 4 +-
 tests/profiler/test_xla_profiler.py | 72 +
 tests/special_tests.sh | 12 +-
 .../connectors/test_callback_connector.py | 13 +
 .../connectors/test_checkpoint_connector.py | 155 +
 tests/trainer/flags/test_fast_dev_run.py | 1 -
 .../logging_/test_distributed_logging.py | 2 +-
 .../logging_/test_eval_loop_logging.py | 630 +-
 .../trainer/logging_/test_logger_connector.py | 534 +-
 .../logging_/test_train_loop_logging.py | 669 +-
 tests/trainer/loops/test_evaluation_loop.py | 63 +-
 .../loops/test_evaluation_loop_flow.py | 35 +-
 tests/trainer/loops/test_training_loop.py | 47 +-
 .../loops/test_training_loop_flow_scalar.py | 49 +-
 .../optimization/test_manual_optimization.py | 302 +-
 .../optimization/test_multiple_optimizers.py | 17 +-
 tests/trainer/optimization/test_optimizers.py | 85 +
 tests/trainer/test_config_validator.py | 26 +-
 tests/trainer/test_data_loading.py | 10 +-
 tests/trainer/test_dataloaders.py | 23 +-
 tests/trainer/test_progress.py | 198 +-
 tests/trainer/test_states.py | 15 +-
 tests/trainer/test_supporters.py | 4 +-
 tests/trainer/test_trainer.py | 134 +-
 tests/tuner/test_auto_gpu_select.py | 2 +-
 tests/utilities/distributed.py | 5 +-
 tests/utilities/test_apply_func.py | 147 +-
 tests/utilities/test_cli.py | 303 +-
 tests/utilities/test_model_helpers.py | 67 +
 tests/utilities/test_warnings.py | 52 +
 347 files changed, 14628 insertions(+), 22518 deletions(-)
 create mode 100644 .deepsource.toml
 create mode 100644 .gitmodules
 create mode 160000 _notebooks
 create mode 100644 docs/source/_static/images/accelerator/ipus/profiler.png
 create mode 100644 docs/source/_templates/layout.html
 delete mode 100644 docs/source/advanced/amp.rst
 create mode 100644 docs/source/advanced/ipu.rst
 delete mode 100644 docs/source/benchmarking/performance.rst
 delete mode 100644 docs/source/common/fast_training.rst
 create mode 100644 docs/source/guides/speed.rst
 delete mode 100644 notebooks/01-mnist-hello-world.ipynb
 delete mode 100644 notebooks/02-datamodules.ipynb
 delete mode 100644 notebooks/03-basic-gan.ipynb
 delete mode 100644 notebooks/04-transformers-text-classification.ipynb
 delete mode 100644 notebooks/05-trainer-flags-overview.ipynb
 delete mode 100644 notebooks/06-mnist-tpu-training.ipynb
 delete mode 100644 notebooks/07-cifar10-baseline.ipynb
 delete mode 100644 notebooks/08-Domain-specific-demos.ipynb
 delete mode 100644 notebooks/README.md
 delete mode 100644 pl_examples/basic_examples/conv_sequential_example.py
 create mode 100644 pl_examples/ipu_examples/__init__.py
 create mode 100644 pl_examples/ipu_examples/mnist.py
 create mode 100644 pytorch_lightning/accelerators/ipu.py
 delete mode 100644 pytorch_lightning/core/step_result.py
 create mode 100644 pytorch_lightning/loops/__init__.py
 create mode 100644 pytorch_lightning/loops/base.py
 create mode 100644 pytorch_lightning/loops/batch/__init__.py
 create mode 100644 pytorch_lightning/loops/batch/training_batch_loop.py
 create mode 100644 pytorch_lightning/loops/dataloader/__init__.py
 create mode 100644 pytorch_lightning/loops/dataloader/dataloader_loop.py
 create mode 100644 pytorch_lightning/loops/dataloader/evaluation_loop.py
 create mode 100644 pytorch_lightning/loops/dataloader/prediction_loop.py
 create mode 100644 pytorch_lightning/loops/epoch/__init__.py
 create mode 100644 pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
 create mode 100644 pytorch_lightning/loops/epoch/prediction_epoch_loop.py
 create mode 100644 pytorch_lightning/loops/epoch/training_epoch_loop.py
 create mode 100644 pytorch_lightning/loops/fit_loop.py
 create mode 100644 pytorch_lightning/plugins/precision/ipu_precision.py
 create mode 100644 pytorch_lightning/plugins/training_type/ipu.py
 delete mode 100644 pytorch_lightning/plugins/training_type/rpc.py
 delete mode 100644 pytorch_lightning/plugins/training_type/rpc_sequential.py
 create mode 100644 pytorch_lightning/profiler/advanced.py
 create mode 100644 pytorch_lightning/profiler/base.py
 create mode 100644 pytorch_lightning/profiler/simple.py
 create mode 100644 pytorch_lightning/profiler/xla.py
 delete mode 100644 pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
 delete mode 100644 pytorch_lightning/trainer/connectors/logger_connector/metrics_holder.py
 create mode 100644 pytorch_lightning/trainer/connectors/logger_connector/result.py
 delete mode 100644 pytorch_lightning/trainer/evaluation_loop.py
 delete mode 100644 pytorch_lightning/trainer/predict_loop.py
 delete mode 100644 pytorch_lightning/trainer/training_loop.py
 create mode 100644 tests/accelerators/test_ipu.py
 create mode 100644 tests/loops/__init__.py
 create mode 100644 tests/loops/test_loop_state_dict.py
 create mode 100644 tests/loops/test_loops.py
 create mode 100644 tests/overrides/test_base.py
 delete mode 100644 tests/plugins/test_rpc_plugin.py
 delete mode 100644 tests/plugins/test_rpc_sequential_plugin.py
 create mode 100644 tests/profiler/__init__.py
 rename tests/{ => profiler}/test_profiler.py (99%)
 create mode 100644 tests/profiler/test_xla_profiler.py
 create mode 100644 tests/trainer/connectors/test_checkpoint_connector.py
 create mode 100644 tests/utilities/test_model_helpers.py
 create mode 100644 tests/utilities/test_warnings.py

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 05e8624b72630..b1fedd578bc85 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -25,20 +25,14 @@ jobs:

   pool: gridai-spot-pool

-  #strategy:
-  #  matrix:
-  #    PT16:
-  #      torch.version: '1.6'
-  #      python.version: '3.7'
-
-  # ToDo: this need to have installed docker in the base image...
-  #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6
-  #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6"
   container:
     # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-    image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6"
-    #endpoint: azureContainerRegistryConnection
-    options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all"
+    # run on torch 1.8 as it's the LTS version
+    image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+    # default shm size is 64m. Increase it to avoid:
+    # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
+    options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=256m"

   workspace:
     clean: all
@@ -57,6 +51,7 @@ jobs:
   - bash: |
       python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
       pip install fairscale>=0.3.4
+      pip install deepspeed>=0.4.0 -U
       pip install . --requirement requirements/devel.txt
       pip list
     displayName: 'Install dependencies'
diff --git a/.azure-pipelines/ipu-tests.yml b/.azure-pipelines/ipu-tests.yml
index 763549e88200b..42cee6b040ba3 100644
--- a/.azure-pipelines/ipu-tests.yml
+++ b/.azure-pipelines/ipu-tests.yml
@@ -53,12 +53,9 @@ jobs:
       export GIT_TERMINAL_PROMPT=1
       python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)"
       python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
-      python ./requirements/adjust_versions.py requirements/extra.txt
       python ./requirements/adjust_versions.py requirements/examples.txt
-
-      pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
-
+      pip install . --requirement requirements/devel.txt
       pip list
     displayName: 'Install dependencies'
@@ -84,8 +81,17 @@ jobs:
   - bash: |
       source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh
       source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh
-
+      export POPTORCH_WAIT_FOR_IPU=1
       python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
     env:
       MKL_THREADING_LAYER: "GNU"
     displayName: 'Testing: standard'
+
+  - bash: |
+      source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh
+      source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh
+      export POPTORCH_WAIT_FOR_IPU=1
+      bash tests/special_tests.sh
+    env:
+      MKL_THREADING_LAYER: "GNU"
+    displayName: 'Testing: special'
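The new IPU pipeline above exercises the IPU accelerator this PR introduces (see `pytorch_lightning/accelerators/ipu.py` and `pl_examples/ipu_examples/mnist.py` in the diffstat). A minimal sketch of what that enables, assuming the `ipus` Trainer argument that ships with the accelerator; the `LitClassifier` import is hypothetical, standing in for whatever model the new example module defines:

```python
from pytorch_lightning import Trainer
from pl_examples.ipu_examples.mnist import LitClassifier  # hypothetical name; module is new in this PR

model = LitClassifier()
# `ipus=8` requests 8 Graphcore IPUs; poptorch and the Poplar SDK must be installed
trainer = Trainer(ipus=8, max_epochs=2)
trainer.fit(model)
```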
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 660645abcbbe4..fa9753e063a3f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -91,7 +91,7 @@ jobs:
     docker:
       - image: circleci/python:3.7
     environment:
-      - XLA_VER: 1.7
+      - XLA_VER: 1.8
       - MAX_CHECKS: 240
       - CHECK_SPEEP: 5
     steps:
@@ -119,6 +119,10 @@ jobs:
       - image: readthedocs/build:latest
     steps:
       - checkout
+      - run:
+          command: |
+            git submodule update --init --recursive
+          name: Init git submodule
       - *make_docs
       - store_artifacts:
           # allows us to preview the generated html pages
@@ -127,7 +131,7 @@ jobs:

 workflows:
   version: 2
-  tpu-tests:
+  ci-tests:
     jobs:
       - build-Docs
       - TPU-tests
diff --git a/.deepsource.toml b/.deepsource.toml
new file mode 100644
index 0000000000000..49e710a55b3b2
--- /dev/null
+++ b/.deepsource.toml
@@ -0,0 +1,26 @@
+version = 1
+
+test_patterns = ["tests/**", "benchmarks/**"]
+
+[[analyzers]]
+name = "secrets"
+enabled = true
+
+[[analyzers]]
+name = "shell"
+enabled = true
+
+[[analyzers]]
+name = "docker"
+enabled = true
+
+[[analyzers]]
+name = "python"
+enabled = true
+
+  [analyzers.meta]
+    runtime_version = "3.x.x"
+
+[[analyzers]]
+name = "test-coverage"
+enabled = true
diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md
index 2b3ba3ee93235..a5e7d3830bdd9 100644
--- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md
+++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md
@@ -5,18 +5,18 @@ We're currently recruiting for a team of 5 core maintainers.
 As a core maintainer you will have a strong say in the direction of the project.
 Big changes will require a majority of maintainers to agree.

-### Code of conduct
+## Code of conduct
 First and foremost, you'll be evaluated against [these core values](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md).
 Any code we commit or feature we add needs to align with those core values.

-### The bar for joining the team
+## The bar for joining the team
 Lightning is being used to solve really hard problems at the top AI labs in the world.
 As such, the bar for adding team members is extremely high. Candidates must have solid engineering skills, have a good eye for user experience, and must be a power user of Lightning and PyTorch.
 With that said, the Lightning team will be diverse and a reflection of an inclusive AI community.
 You don't have to be an engineer to contribute! Scientists with great usability intuition and PyTorch ninja skills are welcomed!

-### Responsibilities:
+## Responsibilities:
 The responsibilities mainly revolve around 3 things.

-#### Github issues
+### Github issues
 - Here we want to help users have an amazing experience.
 These range from questions from new people getting into DL to questions from researchers about doing something esoteric with Lightning.
 Often, these issues require some sort of bug fix, document clarification or new functionality to be scoped out.
@@ -27,7 +27,7 @@ Pleasant/helpful tone.
 - Don’t make users feel like they don’t know what they’re doing.
 We’re here to help and to make everyone’s experience delightful.

-#### Pull requests
+### Pull requests
 - Here we need to ensure the code that enters Lightning is high quality.
 For each PR we need to:
 - Make sure code coverage does not decrease
@@ -43,16 +43,16 @@ Guidance
 for a sanity check.
 At the end of 10 PRs if your PR reviews are inline with expectations described above, then you can merge PRs on your own going forward, otherwise we'll do a few more until we're both comfortable :)

-#### Project directions
+### Project directions
 There are some big decisions which the project must make.
 For these I expect core contributors to have something meaningful to add if it’s their area of expertise.

-#### Diversity
+### Diversity
 Lightning should reflect the broader community it serves.
 As such we should have scientists/researchers from different fields contributing!

 The first 5 core contributors will fit this profile.
 Thus if you overlap strongly with experiences and expertise as someone else on the team, you might have to wait until the next set of contributors are added.

-#### Summary: Requirements to apply
+### Summary: Requirements to apply
 The goal is to be inline with expectations for solving issues by the last one so you can do them on your own.
 If not, I might ask you to solve a few more specific ones.

 - Solve 10+ Github issues.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 39f38bf266af0..d6fc6ce5fe64e 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -25,6 +25,7 @@
 /pytorch_lightning/core        @tchaton @SeanNaren @borda @carmocca @justusschock @kaushikb11
 /pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11
 /pytorch_lightning/loggers     @tchaton @awaelchli @borda
+/pytorch_lightning/loops       @tchaton @awaelchli @justusschock @carmocca
 /pytorch_lightning/overrides   @tchaton @SeanNaren @borda
 /pytorch_lightning/plugins     @tchaton @SeanNaren @awaelchli @justusschock
 /pytorch_lightning/profiler    @williamfalcon @tchaton @borda @carmocca
@@ -33,6 +34,10 @@
 /pytorch_lightning/tuner       @SkafteNicki @borda @awaelchli
 /pytorch_lightning/utilities   @borda @tchaton @SeanNaren @carmocca

+# Specifics
+/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca
+/pytorch_lightning/trainer/progress.py                 @tchaton @awaelchli @carmocca
+
 # Metrics
 /pytorch_lightning/metrics/ @SkafteNicki @ananyahjha93 @justusschock
 /tests/metrics/             @SkafteNicki @ananyahjha93 @justusschock
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 8414bf43f68a2..ee9706172e2ac 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -2,6 +2,8 @@

 Welcome to the PyTorch Lightning community! We're building the most advanced research platform on the planet to implement the latest, best practices that the amazing PyTorch team rolls out!

+If you are new to open source, check out [this blog to get started with your first Open Source contribution](https://devblog.pytorchlightning.ai/quick-contribution-guide-86d977171b3a).
+
 ## Main Core Value: One less thing to remember

 Simplify the API as much as possible from the user perspective.
@@ -14,18 +16,18 @@ This helps users avoid all sorts of subtle errors.

 We encourage all sorts of contributions you're interested in adding! When coding for lightning, please follow these principles.

-#### No PyTorch Interference
+### No PyTorch Interference

 We don't want to add any abstractions on top of pure PyTorch. This gives researchers all the control they need without having to learn yet another framework.

-#### Simple Internal Code
+### Simple Internal Code

 It's useful for users to look at the code and understand very quickly what's happening. Many users won't be engineers. Thus we need to value clear, simple code over condensed ninja moves. While that's super cool, this isn't the project for that :)

-#### Force User Decisions To Best Practices
+### Force User Decisions To Best Practices

 There are 1,000 ways to do something. However, eventually one popular solution becomes standard practice, and everyone follows.
 We try to find the best way to solve a particular problem, and then force our users to use it for readability and simplicity.
@@ -35,22 +37,22 @@ A bad forced decision would be to make users use a specific library to do something.

 When something becomes a best practice, we add it to the framework. This is usually something like bits of code in utils or in the model file that everyone keeps adding over and over again across projects. When this happens, bring that code inside the trainer and add a flag for it.

-#### Simple External API
+### Simple External API

 What makes sense to you may not make sense to others. When creating an issue with an API change suggestion, please validate that it makes sense for others.
 Treat code changes the way you treat a startup: validate that it's a needed feature, then add if it makes sense for many people.

-#### Backward-compatible API
+### Backward-compatible API

 We all hate updating our deep learning packages because we don't want to refactor a bunch of stuff. In Lightning, we make sure every change we make which could break an API is backward compatible with good deprecation warnings.

 **You shouldn't be afraid to upgrade Lightning :)**

-#### Gain User Trust
+### Gain User Trust

 As a researcher, you can't have any part of your code going wrong. So, make thorough tests to ensure that every implementation of a new trick or subtle change is correct.

-#### Interoperability
+### Interoperability

 Have a favorite feature from other libraries like fast.ai or transformers? Those should just work with lightning as well. Grab your favorite model or learning rate scheduler from your favorite library and run it in Lightning.

@@ -58,13 +60,13 @@ Have a favorite feature from other libraries like fast.ai or transformers? Those

 ## Contribution Types

-We are always looking for help implementing new features or fixing bugs.
+We are always open to contributions of new features or bug fixes.

 A lot of good work has already been done in project mechanics (requirements.txt, setup.py, pep8, badges, ci, etc...) so we're in a good state there thanks to all the early contributors (even pre-beta release)!

 ### Bug Fixes:

-1. If you find a bug please submit a github issue.
+1. If you find a bug please submit a GitHub issue.

    - Make sure the title explains the issue.
    - Describe your setup, what you are trying to do, expected vs. actual behaviour. Please add configs and code samples.
@@ -79,12 +81,12 @@ A lot of good work has already been done in project mechanics (requirements.txt,

 3. Submit a PR!

-_**Note**, even if you do not find the solution, sending a PR with a test covering the issue is a valid contribution and we can help you or finish it with you :]_
+_**Note**, even if you do not find the solution, sending a PR with a test covering the issue is a valid contribution, and we can help you or finish it with you :]_

 ### New Features:

-1. Submit a github issue - describe what is the motivation of such feature (adding the use case or an example is helpful).
-2. Let's discuss to determine the feature scope.
+1. Submit a GitHub issue - describe what is the motivation of such feature (adding the use case, or an example is helpful).
+2. Determine the feature scope with us.
 3. Submit a PR!
    We recommend test driven approach to adding new features as well:

    - Write a test for the functionality you want to add.
@@ -199,7 +201,7 @@ Note: if your computer does not have multi-GPU nor TPU these tests are skipped.

 **GitHub Actions:** For convenience, you can also use your own GHActions building which will be triggered with each commit. This is useful if you do not test against all required dependency versions.

-**Docker:** Another option is utilize the [pytorch lightning cuda base docker image](https://hub.docker.com/repository/docker/pytorchlightning/pytorch_lightning/tags?page=1&name=cuda). You can then run:
+**Docker:** Another option is to utilize the [pytorch lightning cuda base docker image](https://hub.docker.com/repository/docker/pytorchlightning/pytorch_lightning/tags?page=1&name=cuda). You can then run:

 ```bash
 python -m pytest pytorch_lightning tests pl_examples -v
```

@@ -230,7 +232,7 @@ We welcome any useful contribution! For your convenience here's a recommended workflow:

    - Make sure all tests are passing.
    - Make sure you add a GitHub issue to your PR.
 5. Use tags in PR name for following cases:

-   - **[blocked by #]** if you work is depending on others changes.
+   - **[blocked by #]** if your work is dependent on other PRs.
    - **[wip]** when you start to re-edit your work, mark it so no one will accidentally merge it in meantime.

 ### Question & Answer
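As a companion to the test-driven workflow the contribution guide above describes, a minimal sketch of what such a test could look like, using the `BoringModel` helper that lives under `tests/helpers/boring_model.py` in this tree (the exact import path and assertion are assumptions based on the diffstat and the usual `fast_dev_run` behaviour):

```python
from pytorch_lightning import Trainer
from tests.helpers.boring_model import BoringModel  # helper path per the diffstat above


def test_trainer_runs_a_single_fast_dev_batch(tmpdir):
    """Write the failing test first, then fix the code until it passes."""
    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    trainer.fit(model)
    assert trainer.global_step == 1  # fast_dev_run runs exactly one train batch
```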
### Question & Answer diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index cef062516b0eb..9faa2331a2f27 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -41,13 +41,14 @@ wget https://raw.githubusercontent.com/PyTorchLightning/pytorch-lightning/master python collect_env_details.py ``` - - PyTorch Version (e.g., 1.0): - - OS (e.g., Linux): - - How you installed PyTorch (`conda`, `pip`, source): - - Build command you used (if compiling from source): + - PyTorch Lightning Version (e.g., 1.3.0): + - PyTorch Version (e.g., 1.8) - Python version: + - OS (e.g., Linux): - CUDA/cuDNN version: - GPU models and configuration: + - How you installed PyTorch (`conda`, `pip`, source): + - If compiling from source, the output of `torch.__config__.show()`: - Any other relevant information: ### Additional context diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 9ed2f30e0b062..0d7dae8fa8b41 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -34,9 +34,9 @@ jobs: pip list - name: Pull checkpoints from S3 + working-directory: ./legacy run: | # enter legacy and update checkpoints from S3 - cd legacy curl https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip --output checkpoints.zip unzip -o checkpoints.zip ls -l checkpoints/ diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index bffd604c9d333..1064e603bee1f 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -96,8 +96,8 @@ jobs: ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - name: Pull checkpoints from S3 + working-directory: ./legacy run: | - cd legacy # wget is simpler but does not work on Windows python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip', 'checkpoints.zip')" ls -l . 
diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml index bc03905ab2bbd..1cb8633545995 100644 --- a/.github/workflows/code-formatting.yml +++ b/.github/workflows/code-formatting.yml @@ -74,3 +74,23 @@ jobs: - name: mypy check run: | mypy + + dead-code-vulture: + name: Python dead code checker + runs-on: ubuntu-20.04 + + # Timeout: https://stackoverflow.com/a/59076067/4521646 + timeout-minutes: 10 + steps: + - name: Checkout + uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - name: Install dependencies + run: pip install vulture && pip list + + - name: Check for dead code with Vulture + run: | + vulture pytorch_lightning diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 7613310df40de..8569bf4b0a24b 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -11,20 +11,25 @@ jobs: sphinx-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 - - uses: ammaraskar/sphinx-action@master - with: - # git is required to clone the docs theme - # before custom requirement are resolved https://github.com/ammaraskar/sphinx-action/issues/16 - pre-build-command: "apt-get update -y && apt-get install -y git && pip install -r requirements/docs.txt" - docs-folder: "docs/" - repo-token: "${{ secrets.GITHUB_TOKEN }}" + - uses: actions/checkout@v2 + with: + submodules: true + # lfs: true + - uses: ammaraskar/sphinx-action@master + with: + # git is required to clone the docs theme + # before custom requirement are resolved https://github.com/ammaraskar/sphinx-action/issues/16 + pre-build-command: "apt-get update -y && apt-get install -y git pandoc && pip install -r requirements/docs.txt" + docs-folder: "docs/" + repo-token: "${{ secrets.GITHUB_TOKEN }}" test-docs: runs-on: ubuntu-20.04 - steps: - uses: actions/checkout@v2 + with: + submodules: true + # lfs: true - uses: actions/setup-python@v2 with: python-version: 3.7 @@ -45,7 +50,8 @@ jobs: - name: Install dependencies run: | - python --version + sudo apt-get update + sudo apt-get install -y cmake pandoc pip --version # remove Horovod from requirements python .github/prune-packages.py requirements/extra.txt "horovod" @@ -60,18 +66,19 @@ jobs: - name: Test Documentation env: SPHINX_MOCK_REQUIREMENTS: 0 + working-directory: ./docs run: | # First run the same pipeline as Read-The-Docs - apt-get update && sudo apt-get install -y cmake - cd docs make doctest make coverage make-docs: runs-on: ubuntu-20.04 - steps: - uses: actions/checkout@v2 + with: + submodules: true + # lfs: true - uses: actions/setup-python@v2 with: python-version: 3.7 @@ -88,7 +95,8 @@ jobs: - name: Install dependencies run: | - python --version + sudo apt-get update + sudo apt-get install -y cmake pandoc pip --version # pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet pip install --requirement requirements/docs.txt @@ -98,10 +106,9 @@ jobs: shell: bash - name: Make Documentation + working-directory: ./docs run: | # First run the same pipeline as Read-The-Docs - cd docs - make clean make html --debug --jobs $(nproc) SPHINXOPTS="-W --keep-going" - name: Upload built docs diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 62bf1b1aa00ac..8e81ef40a0b3c 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -137,8 +137,8 @@ jobs: bash 
legacy/generate_checkpoints.sh $pl_ver - name: Push files to S3 + working-directory: ./legacy run: | aws s3 sync legacy/checkpoints/ s3://pl-public-data/legacy/checkpoints/ - cd legacy zip -r checkpoints.zip checkpoints aws s3 cp checkpoints.zip s3://pl-public-data/legacy/ --acl public-read diff --git a/.gitignore b/.gitignore index 99939ff7fce0c..59340744ce420 100644 --- a/.gitignore +++ b/.gitignore @@ -8,15 +8,14 @@ lightning_logs/ .vscode/ # Test-tube -test_tube_logs/ -test_tube_data/ -test_tube_exp/ +test_tube_*/ # Documentations docs/source/api docs/source/*.md docs/source/generated docs/source/*/generated +docs/source/notebooks # Byte-compiled / optimized / DLL files __pycache__/ @@ -154,7 +153,3 @@ cifar-10-batches-py *.pt # ctags tags -data -MNIST -runs -*trace* diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..b311352c45f4c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "_notebooks"] + path = _notebooks + url = https://github.com/PyTorchLightning/lightning-tutorials.git + branch = publication diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5814ea965d179..fe1cbced9a9a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,17 +19,47 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.0.1 hooks: - - id: trailing-whitespace - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-yaml + - id: check-docstring-first + - id: check-executables-have-shebangs + - id: check-toml + - id: check-case-conflict + - id: check-added-large-files + args: ['--maxkb=350', '--enforce-all'] + exclude: | + (?x)^( + docs/source/_static/images/general/fast_2.gif| + docs/source/_static/images/mnist_imgs/pt_to_pl.jpg| + docs/source/_static/images/lightning_module/pt_to_pl.png| + docs/source/_static/images/general/pl_quick_start_full_compressed.gif| + docs/source/_static/images/general/pl_overview_flat.jpg| + docs/source/_static/images/general/pl_overview.gif + )$ + - id: detect-private-key - repo: https://github.com/PyCQA/isort - rev: 5.8.0 + rev: 5.9.1 hooks: - id: isort - args: [--settings-path, ./pyproject.toml] + name: Format imports - repo: https://github.com/pre-commit/mirrors-yapf rev: v0.31.0 hooks: - id: yapf - args: [--parallel, --in-place] + name: Format code + language: python + + - repo: https://github.com/jendrikseipp/vulture + rev: 'v2.3' + hooks: + - id: vulture + name: Check dead code + + - repo: https://github.com/PyCQA/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + name: Check PEP8 diff --git a/.readthedocs.yml b/.readthedocs.yml index 32a5a16248b91..ef0c98ec96797 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -19,6 +19,10 @@ # Required version: 2 +submodules: + include: all + recursive: true + # Build documentation in the docs/ directory with Sphinx # reference: https://docs.readthedocs.io/en/stable/config-file/v2.html#sphinx sphinx: diff --git a/CHANGELOG.md b/CHANGELOG.md index 199aa70329e24..2256dcefeac31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added +- Add support for named parameter groups in `LearningRateMonitor` ([#7987](https://github.com/PyTorchLightning/pytorch-lightning/pull/7987)) + + +- Add `dataclass` support for `pytorch_lightning.utilities.apply_to_collection` ([#7935](https://github.com/PyTorchLightning/pytorch-lightning/pull/7935)) + + - Added support to `LightningModule.to_torchscript` for saving to custom filesystems with fsspec ([#7617](https://github.com/PyTorchLightning/pytorch-lightning/pull/7617)) @@ -24,9 +30,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for checkpointing based on a provided time interval during training ([#7515](https://github.com/PyTorchLightning/pytorch-lightning/pull/7515)) -- Added dataclasses for progress tracking ( - [#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), - [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574)) +- Progress tracking + * Added dataclasses for progress tracking ([#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574), [#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140)) + * Add `{,load_}state_dict` to the progress tracking dataclasses ([#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140)) + + +- Added support for passing a `LightningDataModule` positionally as the second argument to `trainer.{validate,test,predict}` ([#7431](https://github.com/PyTorchLightning/pytorch-lightning/pull/7431)) - Added argument `trainer.predict(ckpt_path)` ([#7430](https://github.com/PyTorchLightning/pytorch-lightning/pull/7430)) @@ -35,21 +44,94 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `clip_grad_by_value` support for TPUs ([#7025](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025)) +- Added support for passing any class to `is_overridden` ([#7918](https://github.com/PyTorchLightning/pytorch-lightning/pull/7918)) + + - Added `sub_dir` parameter to `TensorBoardLogger` ([#6195](https://github.com/PyTorchLightning/pytorch-lightning/pull/6195)) - Added correct `dataloader_idx` to batch transfer hooks ([#6241](https://github.com/PyTorchLightning/pytorch-lightning/pull/6241)) +- Added `include_none=bool` argument to `apply_to_collection` ([#7769](https://github.com/PyTorchLightning/pytorch-lightning/pull/7769)) + + +- Added `apply_to_collections` to apply a function to two zipped collections ([#7769](https://github.com/PyTorchLightning/pytorch-lightning/pull/7769)) + + - Added `ddp_fully_sharded` support ([#7487](https://github.com/PyTorchLightning/pytorch-lightning/pull/7487)) - Added `should_rank_save_checkpoint` property to Training Plugins ([#7684](https://github.com/PyTorchLightning/pytorch-lightning/pull/7684)) +- Added `log_grad_norm` hook to `LightningModule` to customize the logging of gradient norms ([#7873](https://github.com/PyTorchLightning/pytorch-lightning/pull/7873)) + + +- Added `save_config_filename` init argument to `LightningCLI` to ease resolving name conflicts ([#7741](https://github.com/PyTorchLightning/pytorch-lightning/pull/7741)) + + +- Added `save_config_overwrite` init argument to `LightningCLI` to ease overwriting existing config files ([#8059](https://github.com/PyTorchLightning/pytorch-lightning/pull/8059)) + + +- Added reset dataloader hooks to Training Plugins and Accelerators ([#7861](https://github.com/PyTorchLightning/pytorch-lightning/pull/7861)) + + +- Added trainer stage hooks for Training Plugins and Accelerators ([#7864](https://github.com/PyTorchLightning/pytorch-lightning/pull/7864)) + + +- Added IPU Accelerator ([#7867](https://github.com/PyTorchLightning/pytorch-lightning/pull/7867)) + + +- Fault-tolerant training + * Added `{,load_}state_dict` to `ResultCollection` ([#7948](https://github.com/PyTorchLightning/pytorch-lightning/pull/7948)) + * Added `{,load_}state_dict` to `Loops` ([#8197](https://github.com/PyTorchLightning/pytorch-lightning/pull/8197)) + + +- Added `rank_zero_only` to `LightningModule.log` function ([#7966](https://github.com/PyTorchLightning/pytorch-lightning/pull/7966)) + + +- Added `metric_attribute` to `LightningModule.log` function ([#7966](https://github.com/PyTorchLightning/pytorch-lightning/pull/7966)) + + +- Added a warning if `Trainer(log_every_n_steps)` is a value too high for the training dataloader ([#7734](https://github.com/PyTorchLightning/pytorch-lightning/pull/7734)) + + +- Added LightningCLI support for argument links applied on instantiation ([#7895](https://github.com/PyTorchLightning/pytorch-lightning/pull/7895)) + + +- Added LightningCLI support for configurable callbacks that should always be present ([#7964](https://github.com/PyTorchLightning/pytorch-lightning/pull/7964)) + + +- Added DeepSpeed Infinity Support, and updated to DeepSpeed 0.4.0 ([#7234](https://github.com/PyTorchLightning/pytorch-lightning/pull/7234)) + + +- Added support for `torch.nn.UninitializedParameter` in `ModelSummary` ([#7642](https://github.com/PyTorchLightning/pytorch-lightning/pull/7642)) + + +- Added support `LightningModule.save_hyperparameters` when `LightningModule` is a dataclass ([#7992](https://github.com/PyTorchLightning/pytorch-lightning/pull/7992)) + + +- Add support for 
overriding `optimizer_zero_grad` and `optimizer_step` when using accumulate_grad_batches ([#7980](https://github.com/PyTorchLightning/pytorch-lightning/pull/7980)) + + +- Add support for calling scripts using the module syntax (`python -m package.script`) ([#8073](https://github.com/PyTorchLightning/pytorch-lightning/pull/8073)) + + +- Add support for optimizers and learning rate schedulers to `LightningCLI` ([#8093](https://github.com/PyTorchLightning/pytorch-lightning/pull/8093)) + + +- Added XLA Profiler ([#8014](https://github.com/PyTorchLightning/pytorch-lightning/pull/8014)) + + +- Added `max_depth` parameter in `ModelSummary` ([#8062](https://github.com/PyTorchLightning/pytorch-lightning/pull/8062)) + + +- Added `restore` function and `restarting` attribute to base `Loop` ([#8247](https://github.com/PyTorchLightning/pytorch-lightning/pull/8247)) + + ### Changed -- Changed calling of `untoggle_optimizer(opt_idx)` out of the closure function ([#7563](https://github.com/PyTorchLightning/pytorch-lightning/pull/7563) - Changed the `Trainer`'s `checkpoint_callback` argument to allow only boolean values ([#7539](https://github.com/PyTorchLightning/pytorch-lightning/pull/7539)) @@ -57,9 +139,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Log epoch metrics before the `on_evaluation_end` hook ([#7272](https://github.com/PyTorchLightning/pytorch-lightning/pull/7272)) +- Explicitly disallow calling `self.log(on_epoch=False)` during epoch-only or single-call hooks ([#7874](https://github.com/PyTorchLightning/pytorch-lightning/pull/7874)) + + - Changed these `Trainer` methods to be protected: `call_setup_hook`, `call_configure_sharded_model`, `pre_dispatch`, `dispatch`, `post_dispatch`, `call_teardown_hook`, `run_train`, `run_sanity_check`, `run_evaluate`, `run_evaluation`, `run_predict`, `track_output_for_epoch_end` +- Changed `metrics_to_scalars` to work with any collection or value ([#7888](https://github.com/PyTorchLightning/pytorch-lightning/pull/7888)) + + - Changed `clip_grad_norm` to use `torch.nn.utils.clip_grad_norm_` ([#7025](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025)) @@ -67,10 +155,34 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
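For the `apply_to_collections` entry in the Added section above ([#7769]), a minimal sketch assuming the helper is importable from `pytorch_lightning.utilities.apply_func`; the dictionaries are invented for the example.

```python
import torch
from pytorch_lightning.utilities.apply_func import apply_to_collections

preds = {"a": torch.tensor([1.0, 2.0]), "b": torch.tensor([3.0])}
targets = {"a": torch.tensor([1.5, 2.5]), "b": torch.tensor([2.0])}

# Matching leaves from both collections are passed to the function and the
# surrounding dict structure is preserved in the result.
abs_err = apply_to_collections(preds, targets, torch.Tensor, lambda p, t: (p - t).abs())
```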
- Refactored Loops - * Moved attributes `global_step`, `current_epoch`, `max/min_steps`, `max/min_epochs`, `batch_idx`, and `total_batch_idx` to TrainLoop ([#7437](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025)) + * Moved attributes `global_step`, `current_epoch`, `max/min_steps`, `max/min_epochs`, `batch_idx`, and `total_batch_idx` to TrainLoop ([#7437](https://github.com/PyTorchLightning/pytorch-lightning/pull/7437)) * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506)) * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/PyTorchLightning/pytorch-lightning/pull/7507)) * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526)) + * Simplified "should run validation" logic ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682)) + * Simplified logic for updating the learning rate for schedulers ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682)) + * Removed the `on_epoch` guard from the "should stop" validation check ([#7701](https://github.com/PyTorchLightning/pytorch-lightning/pull/7701)) + * Refactored internal loop interface; added new classes `FitLoop`, `TrainingEpochLoop`, `TrainingBatchLoop` ([#7871](https://github.com/PyTorchLightning/pytorch-lightning/pull/7871), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077)) + * Removed `pytorch_lightning/trainer/training_loop.py` ([#7985](https://github.com/PyTorchLightning/pytorch-lightning/pull/7985)) + * Refactored evaluation loop interface; added new classes `DataLoaderLoop`, `EvaluationLoop`, `EvaluationEpochLoop` ([#7990](https://github.com/PyTorchLightning/pytorch-lightning/pull/7990), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077)) + * Removed `pytorch_lightning/trainer/evaluation_loop.py` ([#8056](https://github.com/PyTorchLightning/pytorch-lightning/pull/8056)) + * Restricted public access to several internal functions ([#8024](https://github.com/PyTorchLightning/pytorch-lightning/pull/8024)) + * Refactored trainer `_run_*` functions and separate evaluation loops ([#8065](https://github.com/PyTorchLightning/pytorch-lightning/pull/8065)) + * Refactored prediction loop interface; added new classes `PredictionLoop`, `PredictionEpochLoop` ([#7700](https://github.com/PyTorchLightning/pytorch-lightning/pull/7700), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077)) + * Removed `pytorch_lightning/trainer/predict_loop.py` ([#8094](https://github.com/PyTorchLightning/pytorch-lightning/pull/8094)) + * Moved result teardown to the loops ([#8245](https://github.com/PyTorchLightning/pytorch-lightning/pull/8245)) + + +- Refactored logging + * Renamed and moved `core/step_result.py` to `trainer/connectors/logger_connector/result.py` ([#7736](https://github.com/PyTorchLightning/pytorch-lightning/pull/7736)) + * Dramatically simplify the `LoggerConnector` ([#7882](https://github.com/PyTorchLightning/pytorch-lightning/pull/7882)) + * `trainer.{logged,progress_bar,callback}_metrics` are now updated on-demand ([#7882](https://github.com/PyTorchLightning/pytorch-lightning/pull/7882)) + * Completely overhaul the `Result` object in favor of `ResultMetric` ([#7882](https://github.com/PyTorchLightning/pytorch-lightning/pull/7882)) + * Improve epoch-level reduction time and overall memory usage 
([#7882](https://github.com/PyTorchLightning/pytorch-lightning/pull/7882))
+    * Allow passing `self.log(batch_size=...)` ([#7891](https://github.com/PyTorchLightning/pytorch-lightning/pull/7891))
+    * Each of the training loops now keeps its own results collection ([#7891](https://github.com/PyTorchLightning/pytorch-lightning/pull/7891))
+    * Remove `EpochResultStore` and `HookResultStore` in favor of `ResultCollection` ([#7909](https://github.com/PyTorchLightning/pytorch-lightning/pull/7909))
+    * Remove `MetricsHolder` ([#7909](https://github.com/PyTorchLightning/pytorch-lightning/pull/7909))


- Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/))


@@ -79,6 +191,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Changed the behaviour when logging evaluation step metrics to no longer append `/epoch_*` to the metric name ([#7351](https://github.com/PyTorchLightning/pytorch-lightning/pull/7351))

+- Raise `ValueError` when a `None` value is `self.log`-ed ([#7771](https://github.com/PyTorchLightning/pytorch-lightning/pull/7771))
+
+
- Changed `resolve_training_type_plugins` to allow setting `num_nodes` and `sync_batchnorm` from `Trainer` setting ([#7026](https://github.com/PyTorchLightning/pytorch-lightning/pull/7026))

@@ -91,21 +206,78 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- MLflowLogger now uses the env variable `MLFLOW_TRACKING_URI` as default tracking uri ([#7457](https://github.com/PyTorchLightning/pytorch-lightning/pull/7457))

+- Changed `WandbLogger(log_model={True/'all'})` to log models as artifacts ([#6231](https://github.com/PyTorchLightning/pytorch-lightning/pull/6231))
+
+
- MLFlowLogger now accepts `run_name` as a constructor argument ([#7622](https://github.com/PyTorchLightning/pytorch-lightning/issues/7622))


- Changed `teardown()` in `Accelerator` to allow `training_type_plugin` to customize `teardown` logic ([#7579](https://github.com/PyTorchLightning/pytorch-lightning/pull/7579))

+- `Trainer.fit` now raises an error when using manual optimization with unsupported features such as `gradient_clip_val` or `accumulate_grad_batches` ([#7788](https://github.com/PyTorchLightning/pytorch-lightning/pull/7788))
+
+
+- Accelerator hooks are called regardless of whether the `LightningModule` overrides the same hooks ([#7826](https://github.com/PyTorchLightning/pytorch-lightning/pull/7826))
+
+
+- Moved profilers to their own file ([#7822](https://github.com/PyTorchLightning/pytorch-lightning/pull/7822))
+
+
+- Added `on_load_checkpoint` and `on_save_checkpoint` hooks to the `PrecisionPlugin` base class ([#7831](https://github.com/PyTorchLightning/pytorch-lightning/pull/7831))
+
+
+- `LightningCLI` now aborts with a clearer message if the config already exists, and disables saving the config during `fast_dev_run` ([#7963](https://github.com/PyTorchLightning/pytorch-lightning/pull/7963))
+
+
+- `Trainer(resume_from_checkpoint=...)` now restores the model directly after `LightningModule.setup()`, which is before `LightningModule.configure_sharded_model()` ([#7652](https://github.com/PyTorchLightning/pytorch-lightning/pull/7652))
+
+
### Deprecated

+- Deprecated `LightningModule.loaded_optimizer_states_dict` ([#8229](https://github.com/PyTorchLightning/pytorch-lightning/pull/8229))
+
+
+- Standardized the dataloaders arguments of `trainer.{fit,validate,test,tune}` ([#7431](https://github.com/PyTorchLightning/pytorch-lightning/pull/7431))
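To make the standardized signatures concrete, a hedged sketch of `trainer.validate` with the unified `dataloaders` keyword ([#7431]); `TinyModel` and the random data are invented for the example, and the exact keyword spelling should be checked against the released API.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl


class TinyModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        self.log("val_loss", torch.nn.functional.mse_loss(self.layer(x), y))


val_loader = DataLoader(TensorDataset(torch.randn(8, 4), torch.randn(8, 1)), batch_size=4)
# The standardized keyword replaces the old per-stage names such as `val_dataloaders=`.
pl.Trainer(logger=False).validate(TinyModel(), dataloaders=val_loader)
```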
+
+
+- Deprecated `DataModule` properties: `has_prepared_data`, `has_setup_fit`, `has_setup_validate`, `has_setup_test`, `has_setup_predict`, `has_teardown_fit`, `has_teardown_validate`, `has_teardown_test`, `has_teardown_predict` ([#7657](https://github.com/PyTorchLightning/pytorch-lightning/pull/7657/))
+
+
- Deprecated `TrainerModelHooksMixin` in favor of `pytorch_lightning.utilities.signature_utils` ([#7422](https://github.com/PyTorchLightning/pytorch-lightning/pull/7422))


- Deprecated `num_nodes` and `sync_batchnorm` arguments in `DDPPlugin` and `DDPSpawnPlugin` ([#7026](https://github.com/PyTorchLightning/pytorch-lightning/pull/7026))

+- Deprecated `self.log(sync_dist_op)` in favor of `self.log(reduce_fx)` ([#7891](https://github.com/PyTorchLightning/pytorch-lightning/pull/7891))
+
+
+- Deprecated `is_overridden(model=...)` in favor of `is_overridden(instance=...)` ([#7918](https://github.com/PyTorchLightning/pytorch-lightning/pull/7918))
+
+
+- Deprecated automatically detaching returned extras with grads ([#7994](https://github.com/PyTorchLightning/pytorch-lightning/pull/7994))
+
+
+- Deprecated the default value of the `monitor` argument in the `EarlyStopping` callback to enforce `monitor` as a required argument ([#7907](https://github.com/PyTorchLightning/pytorch-lightning/pull/7907))
+
+
+- Deprecated importing `rank_zero_{warn,deprecation}` directly from `pytorch_lightning.utilities.distributed` ([#8085](https://github.com/PyTorchLightning/pytorch-lightning/pull/8085))
+
+
+- Deprecated the use of `CheckpointConnector.hpc_load()` in favor of `CheckpointConnector.restore()` ([#7652](https://github.com/PyTorchLightning/pytorch-lightning/pull/7652))
+
+
+- Deprecated `DDPPlugin.task_idx` in favor of `DDPPlugin.local_rank` ([#8203](https://github.com/PyTorchLightning/pytorch-lightning/pull/8203))
+
+
+- Deprecated the `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#8025](https://github.com/PyTorchLightning/pytorch-lightning/pull/8025))
+
+
+- Deprecated the `mode` parameter in `ModelSummary` in favor of `max_depth` ([#8062](https://github.com/PyTorchLightning/pytorch-lightning/pull/8062))
+
+
### Removed

- Removed `ProfilerConnector` ([#7654](https://github.com/PyTorchLightning/pytorch-lightning/pull/7654))

@@ -126,29 +298,139 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Removed deprecated utils modules `model_utils`, `warning_utils`, `xla_device_utils` and partially `argparse_utils` ([#7503](https://github.com/PyTorchLightning/pytorch-lightning/pull/7503))

+- Removed `RPCPlugin` and `RPCSequentialPlugin`.
If you were successfully using these plugins, please open a GitHub discussion about your use case ([#8101](https://github.com/PyTorchLightning/pytorch-lightning/pull/8101))
+
+
- Removed deprecated trainer attributes - `on_cpu`, `on_tpu`, `use_tpu`, `on_gpu`, `use_dp`, `use_ddp`, `use_ddp2`, `use_horovod`, `use_single_gpu` ([#7501](https://github.com/PyTorchLightning/pytorch-lightning/pull/7501))


### Fixed

+- Fixed `lr_scheduler` checkpointed state by calling `update_lr_schedulers` before saving checkpoints ([#7877](https://github.com/PyTorchLightning/pytorch-lightning/pull/7877))
+
+
- Fixed ambiguous warning when both overfit and train dataloader shuffling are enabled ([#7685](https://github.com/PyTorchLightning/pytorch-lightning/pull/7685))


-- Fixed dataloaders are not reset when tuning the model ([#7566](https://github.com/PyTorchLightning/pytorch-lightning/pull/7566))
+- Fixed dev debugger memory growing due to tracking events even when disabled ([#7875](https://github.com/PyTorchLightning/pytorch-lightning/pull/7875))


-- Fixed global step update when the epoch is skipped ([#7677](https://github.com/PyTorchLightning/pytorch-lightning/pull/7677))
+- Fixed `None` loss keys getting added in `training_epoch_end` when using manual optimization and not returning a loss ([#7772](https://github.com/PyTorchLightning/pytorch-lightning/pull/7772))


-- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692))
+- Fixed a bug where `precision=64` with `accelerator='ddp_spawn'` would throw a pickle error ([#6924](https://github.com/PyTorchLightning/pytorch-lightning/pull/6924))


-- Fixed broadcasting in multi-node, multi-gpu DDP using torch 1.7 ([#7592](https://github.com/PyTorchLightning/pytorch-lightning/pull/7592))
+- Do not override the existing `epoch` value in `logged_metrics` when already logged by the user ([#7982](https://github.com/PyTorchLightning/pytorch-lightning/issues/7982))

-- Fixed `ProgressBar` pickling after calling `trainer.predict` ([#7608](https://github.com/PyTorchLightning/pytorch-lightning/pull/7608))
+
+- Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970))
+
+
+- Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941))
+
+
+- Pass the `stage` argument of `Callback.{setup,teardown}` as a keyword ([#7973](https://github.com/PyTorchLightning/pytorch-lightning/pull/7973))
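A small sketch of what the `stage` keyword change above means for user code ([#7973]): callbacks overriding `setup`/`teardown` should accept `stage` by name. The `StageReporter` callback is invented for illustration.

```python
import pytorch_lightning as pl


class StageReporter(pl.Callback):
    """Hypothetical callback that just reports the current stage."""

    def setup(self, trainer, pl_module, stage=None):
        print(f"setup: stage={stage}")

    def teardown(self, trainer, pl_module, stage=None):
        print(f"teardown: stage={stage}")
```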
+
+
+- Fixed metrics generated during `validation sanity checking` not being cleaned up at the end ([#8171](https://github.com/PyTorchLightning/pytorch-lightning/pull/8171))
+
+
+- Fixed `log_gpu_memory` metrics not being added to `logging` when nothing else is logged ([#8174](https://github.com/PyTorchLightning/pytorch-lightning/pull/8174))
+
+
+- Fixed a bug where calling `log` with a `Metric` instance would raise an error if it was a nested attribute of the model ([#8181](https://github.com/PyTorchLightning/pytorch-lightning/pull/8181))
+
+
+- Fixed a bug where using `precision=64` would cause buffers with complex dtype to be cast to real ([#8208](https://github.com/PyTorchLightning/pytorch-lightning/pull/8208))
+
+
+
+## [1.3.8] - 2021-07-01
+
+### Fixed
+
+- Fixed a sync deadlock when checkpointing a `LightningModule` that uses a torchmetrics 0.4 `Metric` ([#8218](https://github.com/PyTorchLightning/pytorch-lightning/pull/8218))
+- Fixed compatibility with TorchMetrics v0.4 ([#8206](https://github.com/PyTorchLightning/pytorch-lightning/pull/8206))
+- Added torchelastic check when sanitizing GPUs ([#8095](https://github.com/PyTorchLightning/pytorch-lightning/pull/8095))
+- Fixed a DDP info message that was never shown ([#8111](https://github.com/PyTorchLightning/pytorch-lightning/pull/8111))
+- Fixed metrics deprecation message at module import level ([#8163](https://github.com/PyTorchLightning/pytorch-lightning/pull/8163))
+- Fixed a bug where an infinite recursion would be triggered when using the `BaseFinetuning` callback on a model that contains a `ModuleDict` ([#8170](https://github.com/PyTorchLightning/pytorch-lightning/pull/8170))
+- Added a mechanism to detect a deadlock for `DDP` when only one process triggers an `Exception`. The mechanism will kill the processes when it happens ([#8167](https://github.com/PyTorchLightning/pytorch-lightning/pull/8167))
+- Fixed NCCL error when selecting non-consecutive device ids ([#8165](https://github.com/PyTorchLightning/pytorch-lightning/pull/8165))
+- Fixed SWA to also work with `IterableDataset` ([#8172](https://github.com/PyTorchLightning/pytorch-lightning/pull/8172))
+
+- Fixed a bug where `truncated_bptt_steps` would throw an AttributeError when the target RNN has multiple hidden states ([#8145](https://github.com/PyTorchLightning/pytorch-lightning/pull/8145))
+
+
+- Fixed passing a custom `DDPPlugin` when choosing `accelerator="ddp_cpu"` for the accelerator ([#6208](https://github.com/PyTorchLightning/pytorch-lightning/pull/6208))
+
+
+## [1.3.7] - 2021-06-22
+
+### Fixed
+
+- Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975))
+- Fixed deprecation messages not showing due to incorrect stacklevel ([#8002](https://github.com/PyTorchLightning/pytorch-lightning/pull/8002), [#8005](https://github.com/PyTorchLightning/pytorch-lightning/pull/8005))
+- Fixed setting a `DistributedSampler` when using a distributed plugin in a custom accelerator ([#7814](https://github.com/PyTorchLightning/pytorch-lightning/pull/7814))
+- Improved `PyTorchProfiler` chrome trace names ([#8009](https://github.com/PyTorchLightning/pytorch-lightning/pull/8009))
+- Fixed moving the best score to device in `EarlyStopping` callback for TPU devices ([#7959](https://github.com/PyTorchLightning/pytorch-lightning/pull/7959))
+
+
+## [1.3.6] - 2021-06-15
+
+### Fixed
+
+- Fixed logs overwriting issue for remote filesystems ([#7889](https://github.com/PyTorchLightning/pytorch-lightning/pull/7889))
+- Fixed a bug where `DataModule.prepare_data` could only be called on the global rank 0 process ([#7945](https://github.com/PyTorchLightning/pytorch-lightning/pull/7945))
+- Fixed setting `worker_init_fn` to seed dataloaders correctly when using DDP ([#7942](https://github.com/PyTorchLightning/pytorch-lightning/pull/7942))
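The worker seeding fixed above ([#7942]) is driven by `seed_everything(workers=True)`; a one-line sketch, assuming the `workers` flag behaves as documented for this release line:

```python
import pytorch_lightning as pl

# With `workers=True`, Lightning installs a `worker_init_fn` that derives a
# distinct, reproducible seed for each DataLoader worker, including under DDP.
pl.seed_everything(42, workers=True)
```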
+- Fixed `BaseFinetuning` callback to properly handle parent modules with parameters ([#7931](https://github.com/PyTorchLightning/pytorch-lightning/pull/7931))
+- Fixed access to `callback_metrics` in ddp_spawn ([#7916](https://github.com/PyTorchLightning/pytorch-lightning/pull/7916))
+
+
+## [1.3.5] - 2021-06-08
+
+### Added
+
+- Added warning to Training Step output ([#7779](https://github.com/PyTorchLightning/pytorch-lightning/pull/7779))
+
+### Fixed
+
+- Fixed `LearningRateMonitor` and `BackboneFinetuning` ([#7835](https://github.com/PyTorchLightning/pytorch-lightning/pull/7835))
+- Minor improvements to `apply_to_collection` and type signature of `log_dict` ([#7851](https://github.com/PyTorchLightning/pytorch-lightning/pull/7851))
+- Fixed docker versions ([#7834](https://github.com/PyTorchLightning/pytorch-lightning/pull/7834))
+- Fixed sharded training check for fp16 precision ([#7825](https://github.com/PyTorchLightning/pytorch-lightning/pull/7825))
+- Fixed support for torch Module type hints in LightningCLI ([#7807](https://github.com/PyTorchLightning/pytorch-lightning/pull/7807))
+
+### Changed
+
+- Moved `training_output` validation to after `train_step_end` ([#7868](https://github.com/PyTorchLightning/pytorch-lightning/pull/7868))
+
+
+## [1.3.4] - 2021-06-01
+
+### Fixed
+
+- Fixed info message when max training time reached ([#7780](https://github.com/PyTorchLightning/pytorch-lightning/pull/7780))
+- Added missing `__len__` method to `IndexBatchSamplerWrapper` ([#7681](https://github.com/PyTorchLightning/pytorch-lightning/pull/7681))
+
+
+## [1.3.3] - 2021-05-27
+
+### Changed
+
+- Changed calling of `untoggle_optimizer(opt_idx)` out of the closure function ([#7563](https://github.com/PyTorchLightning/pytorch-lightning/pull/7563))
+
+### Fixed
+
+- Fixed `ProgressBar` pickling after calling `trainer.predict` ([#7608](https://github.com/PyTorchLightning/pytorch-lightning/pull/7608))
+- Fixed broadcasting in multi-node, multi-gpu DDP using torch 1.7 ([#7592](https://github.com/PyTorchLightning/pytorch-lightning/pull/7592))
+- Fixed dataloaders not being reset when tuning the model ([#7566](https://github.com/PyTorchLightning/pytorch-lightning/pull/7566))
- Fixed print errors in `ProgressBar` when `trainer.fit` is not called ([#7674](https://github.com/PyTorchLightning/pytorch-lightning/pull/7674))
+- Fixed global step update when the epoch is skipped ([#7677](https://github.com/PyTorchLightning/pytorch-lightning/pull/7677))
+- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692))

## [1.3.2] - 2021-05-18

diff --git a/MANIFEST.in b/MANIFEST.in
index b1e7613831fe8..1b97e27a98abe 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -53,19 +53,23 @@ include pyproject.toml
# Exclude build configs
exclude *.yml
exclude *.yaml
+exclude *.toml
exclude *.jsonnet
exclude .yapfignore

# Exclude pyright config
exclude .pyrightconfig.json

+# Exclude submodules
+exclude .gitmodules
+exclude _notebooks
+
# Exclude Makefile
exclude Makefile

prune .git
prune .github
prune .circleci
-prune notebook*
prune temp*
prune test*
prune benchmark*
diff --git a/Makefile b/Makefile
index 04b08fa2d27d1..34b67fc458131 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,7 @@ clean:
	rm -rf .mypy_cache
	rm -rf .pytest_cache
	rm -rf ./docs/build
+	rm -rf ./docs/source/notebooks
	rm -rf ./docs/source/generated
	rm -rf ./docs/source/*/generated
	rm -rf ./docs/source/api
diff --git a/README.md b/README.md
index 8da7836fb689e..c0e5c87cbb2b7 100644
--- a/README.md
+++ b/README.md
@@ -74,10 +74,10 @@ Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major
- | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (latest) | 1.9 (nightly) | + | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (LTS) | 1.9 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | Linux py3.7 [GPUs**] | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | - | + | Linux py3.7 [GPUs**] | - | - | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | | Linux py3.{6,7} [TPUs***] | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.{6,7,8,9} | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | | OSX py3.{6,7,8,9} | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | @@ -338,8 +338,7 @@ class LitAutoEncoder(pl.LightningModule): ## Examples ###### Hello world -- [MNIST hello world](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/01-mnist-hello-world.ipynb) -- [MNIST on TPUs](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/06-mnist-tpu-training.ipynb) +- [MNIST hello world](https://pytorch-lightning.readthedocs.io/en/latest/notebooks/lightning_examples/mnist-hello-world.html) ###### Contrastive Learning - [BYOL](https://lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#byol) @@ -348,8 +347,8 @@ class LitAutoEncoder(pl.LightningModule): - [SIMCLR](https://lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#simclr) ###### NLP -- [BERT](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb) - [GPT-2](https://lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) +- [BERT](https://pytorch-lightning.readthedocs.io/en/latest/notebooks/lightning_examples/text-transformers.html) ###### Reinforcement Learning @@ -358,7 +357,7 @@ class LitAutoEncoder(pl.LightningModule): - [Reinforce](https://lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#reinforce) ###### Vision -- [GAN](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/03-basic-gan.ipynb) +- [GAN](https://pytorch-lightning.readthedocs.io/en/latest/notebooks/lightning_examples/basic-gan.html) ###### Classic ML - [Logistic Regression](https://lightning-bolts.readthedocs.io/en/latest/classic_ml.html#logistic-regression) @@ -370,7 +369,9 @@ class LitAutoEncoder(pl.LightningModule): The lightning community is maintained by - [10+ core contributors](https://pytorch-lightning.readthedocs.io/en/latest/governance.html) who are all a mix of professional engineers, Research Scientists, and Ph.D. students from top AI labs. -- 400+ community contributors. +- 480+ active community contributors. + +Want to help us build Lightning and reduce boilerplate for thousands of researchers? [Learn how to make your first contribution here](https://devblog.pytorchlightning.ai/quick-contribution-guide-86d977171b3a) Lightning is also part of the [PyTorch ecosystem](https://pytorch.org/ecosystem/) which requires projects to have solid testing, documentation and support. 
diff --git a/_notebooks b/_notebooks
new file mode 160000
index 0000000000000..29aea106edefc
--- /dev/null
+++ b/_notebooks
@@ -0,0 +1 @@
+Subproject commit 29aea106edefc9d1904c0c17223a8ac2b15c48e7
diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py
index 53f303693ffdb..bf2ddae2c0084 100644
--- a/benchmarks/test_basic_parity.py
+++ b/benchmarks/test_basic_parity.py
@@ -174,4 +174,4 @@ def lightning_loop(cls_model, idx, device_type: str = 'cuda', num_epochs=10):
    )
    trainer.fit(model)

-    return trainer.train_loop.running_loss.last().item(), _hook_memory()
+    return trainer.fit_loop.running_loss.last().item(), _hook_memory()
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index e16971bdc2a1a..5c15e096cfb4b 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -118,8 +118,7 @@ RUN \

RUN \
    # install DeepSpeed
-    # TODO(@SeanNaren): CI failing with `>=0.3.15` - skipping to unblock
-    pip install deepspeed==0.3.14
+    pip install deepspeed==0.4.0

RUN \
    # Show what we have
diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile
index fbfd2224a66a9..c0fad8fba5124 100644
--- a/dockers/nvidia/Dockerfile
+++ b/dockers/nvidia/Dockerfile
@@ -13,7 +13,7 @@
# limitations under the License.

# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes
-FROM nvcr.io/nvidia/pytorch:21.04-py3
+FROM nvcr.io/nvidia/pytorch:21.06-py3

LABEL maintainer="PyTorchLightning "

@@ -39,14 +39,16 @@ RUN \
    # Installations
    python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
+    pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir --upgrade-strategy only-if-needed && \
    pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \
    pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \
    pip install ./pytorch-lightning --no-cache-dir && \
-    pip install "Pillow>=8.1" --no-cache-dir --upgrade-strategy only-if-needed && \
    rm -rf pytorch-lightning && \
+    pip install jupyterlab[all] -U && \
    pip list

-RUN pip install lightning-grid -U
+RUN pip install lightning-grid -U && \
+    pip install "py>=1.10" "protobuf>=3.15.6" --upgrade-strategy only-if-needed

ENV PYTHONPATH="/workspace"
diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet
index 13f70deed43ca..e4b3db9cac53e 100644
--- a/dockers/tpu-tests/tpu_test_cases.jsonnet
+++ b/dockers/tpu-tests/tpu_test_cases.jsonnet
@@ -22,6 +22,7 @@ local tputests = base.BaseTest {
  |||
    cd pytorch-lightning
    coverage run --source=pytorch_lightning -m pytest -v --capture=no \
+        tests/profiler/test_xla_profiler.py \
        pytorch_lightning/utilities/xla_device.py \
        tests/accelerators/test_tpu_backend.py \
        tests/models/test_tpu.py
diff --git a/docs/source/_static/images/accelerator/ipus/profiler.png b/docs/source/_static/images/accelerator/ipus/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..cbed276a4f515449111e6957e2526ee6a0ae14b0
GIT binary patch
literal 129635
[... base85-encoded binary image data omitted ...]
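The TPU test list above now includes `tests/profiler/test_xla_profiler.py`, matching the "Added XLA Profiler" changelog entry ([#8014]). A construction-only sketch, assuming the class is exposed as `pytorch_lightning.profiler.XLAProfiler` and that a TPU environment with `torch_xla` is available:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.profiler import XLAProfiler

# Profile TPU execution during fit/validate; this configuration will not run
# on machines without TPU support.
trainer = Trainer(tpu_cores=8, profiler=XLAProfiler())
```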
zdXF~;bx895-YsjeGotKD$mv3a=Qbf{AE%&Fs#uY!p|;)ypL;^`eij;(!RT9u`gZuW zH*}0wn8#a2(=v?9EY4V_X|p%Q{S@b_G$&5EkI6@Es^{Z{4H;5409h~0#SLZsB`pp+@4QUnwX6?2!q`38)Kxp~ z7ap1ZmvxU$Vwj_2#Z|dA!eRh}-BIBvi#*-N;vOQ$M_HLB8H_;r+msDsG|8X~?ls_y z`}ja^vN%#I**c^F4uqR~M*(>mT~U?~i_H9r z4zVE{H~}i5w?+J5__|{q%)qZY6+;i12vg;uqGZVt4Ep`Kj7sV6OXNPCK_!o=C(rZW#ym26gN(AD${_wn0gkf(*!jn; zqgCU3F}&%zjN~l1(qcKX>PK~+?IrNkjzQ^Tc+fBF+G48|Rp)+%07ZNs@5 z<0M~i3QNS35|SU}JJQ4WltIIBOTUh0l}4+5tMecHUzWLz+*UKMe~m0(4_Wo)Z0um| zKDrZAWDh+~gA>S>u{!dO;>l7ZKj|kKPn7)NsEX07g)}B-GH5Z9YR?A5ELc|v<)4f; zbxr*h={oy&9eIBtHa_5Bxu>tpkQeHh1yp=b-IM>by25p^J~wUlmy9u=K!p4yX)^pS z>T;vjSyx5X=+B8;jqGN#ZsJ6bQFx-4FEJq~3dV$FJdDGZE?Gbak#OMTzNCrUcrkFG zx0V%yB?*ql$l&-Ogd~}D^sA=s6sKv!N{#4-mK6X!vk~R>aBDxO%-^@YS((OcZyH06 z-PkRwElvnk2krE4!xBd#3`CI|DBrUcG5N&d=q>k9s0W0VMl$@2NZLbOqK5i8eTQ)L zr?3ax>FsrjCykoiI!r9~zDjp5PBM3LpHmp_goQ0~%!5j(c7I`~oCOguO^8pPd zM@RbP{?zVCml47N9l=RH&P|Jkf2}cZGwlBfs*%8&vB_8H-7&;P)~QzMI1a6Liu%Or|zMucIss5em3)~ zj*SONjDo-MauC&rsW7I|$NgjtHF1oalw3dK6yKMLP*mjnga7ON1NA0;Pq%VAi#%Lz z=+?nLsQO-1?bJ8h#(T3(6(sPJe;~EKJI5WN6#+bK`UgY+zh8oh4PWt1U|fT4ejSg) zy56<}o@Pkq18x;TP&+tPpVQ_8UVO!H`YdX$-6-%n$htw%m7X_z;o{@QLpV3t#h+!9 zx0Jp2|I98f+y4;nocROz$iz3k@RQZGeaXz)& zTg@o@Pc3jiM_e&-1s(6Now`j42KR2Get8zic4A>~!e5MI`ldJ4B#b8wFw*?R+0JBd z^9|3+*3d=U=VZdY)h^69guFKA%&<9d?D^B6$R4&rP`2tQW*OZik!RUD&0!+Sk1SXj z^V*--Fnb^fE;J{?OahB3Z@zv6dvi%?ll$eeHqDiqFD1Ex$TQ-x zbAz3{P3T{v*G16(Ovh@>gq!YsLPqxANKPJ1$Jz89qTN?znc6^RfDsFC8i^a~`2UqY z)mO9e^{TH9Cj&?lUf1e)N;zz_AnoYn2lf?hlAZkkqvaZ%2sq$_u)emsrrNRPhh%yM zj&bz;7-g~1WE=0?Lm#i@(L!tI(*yPSArTlwx<`k7^soH#7H&UHUkdp#u_-M8pJ*))TJeZitSUwey*cU+{xJa^|O6m(IjVedtPS&sI9vs2FB&y#2$@8e1) zCz1x0%PgS_awsf#^}IPG2qgUf^5l-p{(GpCuF{o(Zo`of?cr zX~(O^jNw_ZZ<10Wy-rDpyl0?1w(7p}(O6!GLB(<}pso{f)C`-L42V|n`Ba>P3U3BHyK&&83E!g$?59T6QVa2GlAd5{T+$3MGpQ%%Ddr~I`)kYbKhS&Ko^SlRm%WohBg z%3R0KSXLmp^7$S3Bs6*aB272A`K>2(H9tPuz7t&v{erS03IJ z&dT>+ZUcs&ojOHlrS)!SZq5gLYZ!0f_6v?gAEF^gCB451mfylR@3!VQ9YCX$zt(jv zXHzfKN`HNEBE@^6wdt&&gp85G4rBAdZV4DFp4EeH9lEC7PXDh*S9&f(Pxv85jbzo! 
z^sCKd^j@keflGSO%YIX38@(4K%@U4gwXXFozs@(YJm_7=_OJCGTl;aoO+q{;9Hvk7 zmHP_Mj<8NZAh9Jm-;mD6tG{EUEL-A^)xhb?PZY5ynmQsMVrA=AQAG*29qO5i z%_=hS`*F^j;?YC|(_Z$sFZ#z9V)-bqvG$>Ps~o3Wbj8bg5vBboi_ec0#eemt18(Qw zP)nj^;Q0Oip=y_HGl#pK0DbS4w93%bDseV*UDdBmc`)s_7(c`kIP*T4&ZOO_FOJ5fLq@Yu{?Dk;f zCqGCa!{>is!xR!arr8L!gdF|IFglX=P->A zxwy}Km!P4bSwcf2RZ`_BbHY?E?sZ6VQ}bK_eCwAv3>lJ~e2$STZrJOd1||nE4tY>E zXGLQTxsS@9^RKXKUMt$Vo zTv*bO<_09df9d?@T0%s$kpjvMS%k;O0|LxNd*(+c0K*otyLydMtWWPX>&<|ro&d_(NtSnGTN%WU0Im^~-;p}lW@LqJyZ5whavXyZez)I6kp*npxTNycmkP<)06I@i9 z%@bfDGmS5(fSTWWqrKnmQDS{3{BL(OLG`LF*-=fjX5YrHUF#hW0P(px$l&(`k~0)1lA5m=zy=av}q@?JtplxG7#;}}`KC3f@e83^einl9k5r_ms- z>@NaLxDXB#{0PxXp!J_ae_9fpESyXAChU>*cqg*$UZT`mO1?%S{SIz`&*uO^KA1fW zdDqKMmSnb8jepY)g2ZW2z$r{sPyUJEDA&!4gJYgl2c~}X zPus>iyB!I?;U?74ih+VVO{!a=1mvGXdQji^#3`qDE;J$U4^Oc(S?tuE^t9&9Is*na z@1S!59C^qh=LlVx*Kof$&}v0+gRlFg zk5W<-0Ps||Q9~7od=k~X#1SE#IrIm3p5i)Q7J1U7!zR?lNd)*4Lit)p8pnC9mR|6e zq9wMocsM4V-yUalznMVp(7UA~wlJOMso&G!XOrn>si6ru?_*D$$qlp~o^xuSuam4d z2~WF3(5ZZ{oL$$xNrL?V8&=}i0r|mHa9ST~%lj0V?J$oh9sS9))P?6K;J{vz!`85q z`^0?yv1N9v42lgp)J%wr>#jVPqKUt1|@Xl{vkH7-hRL3eRB1a8!v>hze{ zfCYr0#Z8ePVozd}3EJQDz}Dj?F8fJt3-X1%{{9%DT{8<^D^%HWc%J<9Y_g*9=ZFPd z{Ep`JCbi7(F2y>?F}t2yK9txdhG$7B{y?HB_C=C6WpFxslt44XJW%z@T;)`zQg6b< zg;@9AO_VLH(4>-6VV4~FHW><6NBs07vNnN;0L@!nWmew9X{lp``}w~6 zneK_dZ7ryIeuQyU5?oaokknFDIGE*|<}~}bc5`6R4gY$8b3d`lnSf2X$c=oy`vf|lr%_3Ztu6!1fYg0%90c$TI0c%=zU&_9kR0F zGsb)7lo*uYLJ9N;`h%Q6dhMRkaOd+Pkvaxu1%d3$B=xL1ng?l2ZW=yU4wsYrg_J~{ zS)D*dI1~xuBE90sAQC_(Z1xpQoZXH=yNKgkI>0Q%ECx`?EjXbE~vHE%k`I}^Xxb&v^E?1ZDoh<>D4XWIbf;PX{E1BU2JHLjPYERZ1%xPR%&iV z#O@j))I)Hf({Npn{A)f&RbnDqD~V+VQ_~39{HDFld)x7VvaV~OMW^>_k;4qMv0{?pe<+pa2W#1n#%wP({jAE|{$Ad-&swX1~1lnlP6uQz-@L3CXL zeuNs}K`{fX3NEXJqK23hiS-fRC#q86sq%ZHyTC0O_DQBjtAMRwLLLC+{6Sak#W4;0 zi+L^mf~;M!x*|uBOJ3S;*oaILi<aaqPXfRo-cr*XN-*J3<3?;q zR`;;NF+%-4gDI2id@ub~$DZiUx97St_k}?At_#bRPC#pH0 zs}V?erUA`Jp?4l|p>ZF4F7axG8l;Kgu&2-7!J9>HRZE91d{u(tQmKbs6;GuM8E`d4 zYh@iBA}`Tact7ix!%_s^W^iN@1RV7%%rABH9-r7L&DpOJ0}D%L+KJ5S4P8neL}rf7 z8G}C#QME=kJEoJ+(}xqn>!yiU7{dKR^{%7`NNX6PzwUgVzT>rHT>g6h*0cAjMwE(0 z3{!2M|E1dOSnkQMko{Q57CpkiMR~GE>e+J++mWDwb9T6Vjmqa=hAa4NeU^=op4VY- zwc9~Lc(Kq-t8xbX2>*6@3|o)GE>s!-+ZwPz9IW~BK9i`BDErH-H8?mGKt2-+u(sLx zDWw#0of&~HELi&f7ywEJGbF&#A zV9%B-oRHsCgMlmNKbm*UQra@f#s6)3#QjE_oTnx8?*m`e(`I(kqgYG5Y& z1F7sUocDU5=w{=_TIav%fpw9JRd@u*(RT+WSQ2DS(5O+$gd?Z!1UbGndoYc%E@u@U z{N(hW03--gcTS-n9{jRa@H0r=9EDi*SO09LX9wYXj}X7Gw0h4PV0TWS*%mNo4Q;)V zHT5llUU%XYTegM8M2}sZJX7D(DE7Cad5E79RU8-3N-GNm`h^B$NgFkijMPdobNh0M zGwcP9iYTaYmH^;7y1|)kjQ;M_{sdXFN8K(#_|h}cm|40KCjW8@|1bGBWdRJ3UXoOf zRsWT6=tID6ZE9C(o{&U~0EiW+j+E9l&N!FcP_v&LtnmiWepbY1U;)3j?CFXQ5Z zI}X%&HCH8NBBvI|hv+aP6`M1(p2OAR7@1cxD}RPAJOm-!_xRsrL9ixy_e2WCU1+2R zlln&yz4NA1FYkG4(d3fXC;E<4SNYRC2sLN;BPe*wAkE)h^>ymth6Je?lMG~pa0F5MqZ=661?MD`#dEZaI2^ag zQ97!;(Vxg{&~jANm;#lT*>n{XE5)cc3_1EIp4S68Z_g?Z>9Wc#p~f*$We~E^%(+X4 z@e2cwfI5(H(oFsh+Zq>v(s+m^!U!P=d|h;)P7=U;WX3@KV^=QJm$iYR=oM)i`ERKE z;#4O@p!|^vwi@x1hAl>7W%J|lv9C!2rI6KGKch=?f&A1?YIw(N>q{U$+&F+A1p}!P z;PY=2pm%HUQH)Cn(v8TEo^3Itq)~a+JBN&lSk2CXVVuqxj}h{c6P58&0#a%je0Ej- zl6O|YG9O*RvZpT|(QP~#)b|GaU6WbcL5}4>S#?>p(b2TpI1>1G{zkNYh;iGK?0Vm% zZ~P@-Nz?!A8$5O;D;*kIpX zjq;9Uui{DkRE*}No%PL5Q7%=sZcu}QYU&0nV~2T-m7g0E4zD{Q^Ezd9j{~+Xsq&58t4FSkNoIm;)qn+w(WveykRo%$CxhR zZyE1LMZ(&Pu9qLZIj5j8f=~{2B+?s=kj(5O1*Zf>@K>cnhg@_=tfMeLNhAr|Vs_0qT1eO4M5`UUWB`46Bfi>9ugP3c>;>+V{k zQG3gPM;e>%2vU0K&$KBFjQ7+ZC8(%Nq!cO4C~PRq+LbTq`i;2vE$_Ui5@X&8vJ7ki zN5v1;hvy-+<|bBzx0k zF;-WjB%7=69?0>xz!9@!o?AwsX{pggx;T4t8k>DE_cZhn65Fvn0%z{!wK|0Rl^l_c z_39zb;Uvx!5H!`4m+yU2BCeVn)HmA?mi!jR(t+26x9X;q<*VK_3K{Ch%-pG-_HSfd 
zC3hb>9QzcC-c-O!UwL70>74z2<=+_uXdWml(xfip5@Z0SD_caUKiTPkP%shzk}Fl+ zW3+qmXvgsJa)=3Hdf^$)MG$wo1Ppc}RzGd-v`oQ%*kMt=fd;(Hs>t1kC9jyGHypU-5>Hx3^66GCR zcb)Wlu@CWB?&-v`psaft5sCq{2Tk5ke0Pbb`Eb>Fq2ot~J0|_1xV!T0ku<7#4J zGI%ldX$_$kMd=^mQOI~1?Fbogc@0hv5w?RpImcEzv#MNd47K6DZVPXuq zlcG~(E9`OA_;swMF2*2h|I<8vaByt8Jzz+1u5@{F26CHENRl6w3m>nj9C+ipK;2Pe zYsE@TO(QHbW$eAC<}vt81MwJiFc+Eau^Gfuc{Uw4X0-?HhPt>8LE*vJnGNJIf}=UJ zlZ}gBUJBY}8P@f1l(LwXm~o0aOg{o>Zm0CK{Sa)VjPU1r6_&w?k?)X69Y=C3DUxnd z94`OFVW-DbEQs|XTxaZ%^$sTP?t;}U+)6Ju1vBqcPH*w?N-a-skpl*JgAm!@0}s9o zYUSu_`PlzHaX#xRYdd+YaiU`Mrm3Y!dWz4;eTM~4`6>NyLqkO4(<%q722#w#dL?DV z9sGLY;<3rv-*_HT>Fix3baUVMY%`%&5to;m+2Q;}YGKp1pfbM7!s!QrOeY>2KT=;7|#I+5FYDnKJu% zL4G3QsQI^uyA1=UmwFq0+A;B|8^D{Dm*v^l@B4&ZIb)2$GK#)V5hrL(xC8bTzM*#q zPTGkJCmlIz80GzO0-3|6Ym(Nx$| z4Xy5JYx3%4i_MgV#LKi#yvKHge>l;>DN2X9ggQ2s1m9y&xV!C5rk&`1KmHd#<&Q-7^zrvmRa-k;5$RM~D7^`TL zTpcxv3a0ze=Y5`9$FjvzhzuszW0-^;BjgK`diME}4c5C-iMwWXRlrDrmW_<%+oOVVAt%f?*RglI0b?#`Sosl4I-KRM0 zh-GWE08>{5;J2Z|4o5X7WdAV%_%m z-UPiZZ2&^F9DgI|`PCeBLc+vtyZd;c4+oaa;*X9sEhARD6L6Jmj!gM?&qo^J)2ON< z^2y{OTbA2_+C>`7V}&qV|7Ys0H?KcO|KaDK?BFvF(WCqyDvv+Z{lhkoILlBBqy9m6 z>e{_^Aoha*CmEgVnyJA7gT@UFKujUN7RyZJCWB>~x`I_R?<3_~QKV9y*g}|5iPdu#ypp&~X$x5RIhdRiu0^%X{z3k@P|t^9X~(%zz_0_F-9%}^!&mz*f??wdek z=$yDFv~<6yYf}-1QreTKWms$vn2uu1y8A8rA|uwUdyX9x30Uu?0)GiyyCo-ERCQLz z3Mv%ewkZ*zZhp&()ZJ6u`l1L{@4J+*ePl&r^zY=h$>L7Ls+o>&7))h}Js@w3zart4 z!!_+O96lX&4>|0yFnKZ~|c{}e1vR?QmddmztY({F?^?q9< zIF+1~oe9ZOq06cTX5GD>gh8&o9ylbN*_(Y3Kq5wf|iTDz?MmgoSwOVF4|L2u?#n*E_!P~c}kvQIiFIAHv|3crI#}Sa_VC}*O9?%?2idjsg?m%aI zC5c47m}Yv{?NXE{E_BYE^((1@>%&nt8ZZ+7pjdE(%{Dv+yN38L;7qylRLfYlw8ePx zLKxvFJS==XlJ~J*XUnQ&vsS<)h(fm1GI9p}-8D&?M?oQ>^3F$tZ2L(+N{GzR4R-JmIta z%4)#vNIdrHm1lM1>GWw&uuakuZ9PaGHU3N(fQJRt$aXM>ZrzVUhrU>AUAh*Nd%veD zWU8yltni{gaPg@GmevAQ3jU|2X650%+{-A;PdT@w+s_&Nh7nP&u~_4LlDo>RH=v%~ zy8F`wkwprvAg5ntz4h~}R^n)wB9|xX+2Zqk%tu_v_C};*nQHggq5t%w2!02~VP>Mh zp0$_NmOMWv-Go#kXismeorEnmD<`T+`2Z$Xmfn`*w5G~OUMw8z5Eq)u$1D@@kKC`{ zS`RvOCU#?PJ^_CLnM2})=U{=7pkDzahgjAF3p}dgGx_UKCNe?RqZSB@bSO6k)_9Sn zEP15?;+=c1XHx`q1kKV04gufSrE#vVFMcs{#7ubX8M0Vs8(|=AnhY~nM?}540&6{c zpFH^AnkTD=Lr6Nfr51HNLh|TsRFzcMM)fSyZe>2*ucwe%MMY)KqtHjNbR^478Fyb+ zT5fHpc2~2IGF+xbL9$Jo%)&iqzZ^)M_cWFqdTCaXxtpi$ja>D4rjyA^pHK2ANDm#w zvSyrz*Kf$@>0`-9TMU!zC6>7@UIB-e774C?Cw~C;K9}#Mj{iprYHnR!&=JmmbCscq zaX!DSE0&>o$W6hK_PiN#OrRk34SRBV}<=Gw+6b{ktXY{z?Df6c%leTeWPfA3s{WAxJ2 zvjeRs-FbK=VAOKhS3wZDmK=0Q44YNOg{_6KD{5dmPW1oMaY)oX>9be$ngF4!>kGvZ zoNBIa`{x4Lg1x2B+2Mqt6~iJlR;Tgwaljyt@UF&kf?u|EI32dul#Et!vOMa`1O ziR%v?nCwJ}@FEOo^Qot~X|iaY_GE=0UfAbS3vAqC@;LksCZBq?{gSKvX;+cSa~zSM zHVNWUILfD~F&(nS1$~-w^HblxSs1VxlS4Z#nwaN8B@g~)cF~h{3Q;-K*=*pO>T93r zU_fg`bxes-vcah=HftfC8$i1#mYd_`_IO#Rc3RPM$l6sGC8_P8ka<8W<4$!I6w$hk zebwH4JG&R*mqhkB6ux@#`K<~X{K)Jy*YZ?CXQC^==job=ux%IBYndsOrG_aUwe@eO(bFMj7jg)wHvDW7 z2<&Pc`ueOmOq$3#ivYSUput)|0>=n3R&fm1#tbQ^3*0OlN+as?ShFZbE(PWMn!k$V zC$9bcqHS*0&)JjBZa==!dFq&}3c~t!){_8KuI^Fs)%nd4o@Jca?pD4kswlsQ;u=5) zWv`-?^7vw!OeV)Hih(a8g6H1xGRu!hha%NGtk@d9c~qlHKM z?7|%1{;=djc0Fx2t+SJ?ES{FSQFYps450#bk`{l?>(Sr~^pmpW*0lUeX-i!xb~#v^ zALt#m$i6E;1nxdKRS^ulVnm!Bv8M%Ui~UJ^#hVs*2|^o=2EP> zNAd57*L8q~{y9V+))=#j%)aOF^|(6^$os3!jj%yUw<4F!-Xw?2-V27iB;axT9OgsB zwPh~d7Hm{?4gY&owwodhw?ZZ~yM95+vKQd;L@!qb*_qX(#uGl_G zrcR_ORE!RW{aQqKv8*|AfkR8|{WADtGAn0mYxyV*je?Dh#r_lwkG+T#&rLDtGeg_) zn#&cionQTjjt7H6C95*?s?7Uxj|<`6mJjg5#fsw=GcCYow;Z}@(1dyZoW0dO<_jEb z&YPC~wSN%j%CfJx*Bug=4uli*)K3hSyJOQ)#1K^gp(PCX&8_+<8rkeIQH${jE`X{( z{yZs-s6ia1@1V~mp+J0CRTTbZE&Ze{Z>>p$00K{si0LoWSu_qD-EfEO3|mU?`1C9% zK)h6J`avI;Mw=y1rbt)BzY*KE_K6kr$7CgYk~kG#p&`3KM=&?Mj%cQVykx~Ky$bm2 z0Qit%PFd~bCJ$bN1k??->UyOsN`*O0> 
zBpQ$f`t2JmM-U+;VE}LMNq)~b4d28Dais0WAzX)4(?SW?mdH+c%xS4pDEfhj6dLu zAEBuCnRHtv0cs+3AP1cmw#a3;|19fyWKz4n>`&>MR}C9LlYgU66hOk!uD@8e;MZbE=^zfSQbKko^aI8dY(x zSo0n8s)e^vIVJEie8mDCwnRRFc{0EWXD&v&1i?1zWgI)ZPDthG-v^_OfXU7Z8LwQT zvEO5zVw{}J_#aB33sm)EiJ@&j?Lu5kXh{|Jy5q0$rX%l77ragHsgtpLA?cr`j_3lC zq+n1~GP)gHxo2+vTY=G*FmTo<@}GXB{6ExvbzBr}_cwx+ARQtpDc!w*G)PH@lyoD# zD-F^DBHbY=C0$D?-6fq%H!CR!yo28H{GR*yJ)ige^W6_SGwjaHbcdwdoU+Vln2QAE2yc&A zWPyahysDQ7JkwQYpr?4GXh|!}2?2)c5hp122U6q}89Jl3B}xvmy^z3D3Eia$W9aau zLEvyS_PkiSf6*n=-|5@cCCHKn{s1OU7=2=TIye=q&(&T9sm>g&%AbD@Gtl8;a_%1W z*v_D!g$qf9?J7ct=KI~QQVwZ`Vu8V{kOUPzmC5E^5dRmkqfyF?NPKmmbn44RUqeTQ z(nO{g&mpzp1-TnYEf_gRSZB(_JIF%yJ0Vl0ZrB0{IxjOsq-+WLvMo?I7@vfBL^1FC z-NjFNSo(VGV65Ar&@lJ{wQOTWl|&xO54{B?$`7mrqH78kHY^v)8x_KfS~PvTcYVMM zdc-g*oMl=kAU<=b05XE;Itu7RQY3zc*m=o7NdSqYA&L^2^HB%9FSpx8mk&3JVzB+$ z(11DxKoYw5I|;3g8=%42oep$%EX!yYZ+m)Qs9sqg*eWS^O|KjV+$f9Q7m~&+$B(NY z>ewA<)~_r7AqoTO0=BmNVRme`@NKARSxA&aQ%rVo?Zyy`_qGm_f$T{Vz0h;g?e!UP z8Z1r(3=u8p$+lqlkJ@Sr9|r6?e<=Ex;ibwdT)VLe-PENM%9^{au^igfwUU;L3`rC@ z0YJDD;`mMqbqOyU=Z`cUnR~Mp;jkA3U2xANRDqiE*OF9`@2P9#!mIBogylYvVXc{% z9I{9W#Wc+>;V`aS22F^zS-4sn5LxgyeOz1}JP&+Tef5Il=Smwxjq^dYZu4A`ntB0- zrg$8#ewWKLS=&oF55EwQNNo&xPLI`m^P@H#wBs!Xkns-l^@Vgt zWQmN9aBz^r7y=N5@l^(z)AnpzJkvtl1SU#yEsj4A%|cM7Sic55wHGNI^O^Pp@gVXV z8!{7ue*PT9PX|}F&iba+6-0NSWARG2XXl4P9=r!@>df9d7r@^AG`(?h;6MR9_lKSt z;x;+N-N;6-w+64xo*>?sFruG(+wgHiC1wsS2CTLrElHd4R1SoDOfa1cevmaZPjnB# z>x0UK&*@k>P3v%pkr1SITdc+%xr&WXU?T3zw%sPG8{Sjmkd%D4GeFH}cWMgBfe`B7 zg^s837F5M46!-2*RVjZbY?1>#ac5WZU3wuq`a}YjoFFX+x_8%%+QGV+(ec7MsO0Lf z1K($s+AIACqGP!aTHXB_cIf-+U`&}EG&pXVLe%8WIgkpp!ZkcKE3VZK(eQ`OK2d!pLYol>5#814 zj$%UY4c?T4&Pl?0Qw!PvS|GRuODZAMr%}zbZVwjA9==fm0?UeX(Yi>XJKrLh3|U}; zXUDa0`^SBhCvXBlQ;9oGbF<_fbI8+wvchH1+Xqj6DBUtnr6t^U%YkCwgC(AR=7#6a z_xHAJH1$JtLL>+Okc`3HAk~N4_2wj%f9nH+nkdwEIQ6wP*Z$5ZTr)sYsj5#>X*4WE zQt27&`b!^H{oS!0#eY!2WXgP$y%jRZiZ{FreNwd{>ug!QF@_W7;UWiu{0ZIbLUe?w zc z&Dcg(Qve{+$guPD<*XK)e;FqTv(%cjh7w&M4oJXsQ~w1sp!^*jZ|WVd@S6_23Q6oK zk~6i`H!9vFcF_7wV;_Dw4hZ-no%%dV?0JUBU7PT_1fn#sz{GvpMfQJ(i@|+C5Kd%T zuqwq)&{d??elN`93c{Tu(u{7sbP@v&j)bhQ%RznpI%1X8V2pl3U$f~E^rr+wDa~uo zCk%0Fw0(|kXVI>IxTr|Mz6GnSWh?;=@f7#Jii2TrV1$KqAbrNg!$v487JeZ{pq`^{ z0yaFb9&G$63G=o*jMAJ#`S>&{v;2*pu;(_C99nhgn-RMXek)cm4|+y8i=|>W_*(*< zLICGJPPSkB%V*wxq8jGHZt3r9t!cWazW)#pMWX*X>i&gp5nma4O&ZrVUl4p2hF@NT zuhQR~c3|5QIABVS&1#jz}hr- zVFl}k%ICaGb8G85Rm6{XZ#P=(xrz2 zZ~-U4kM4N{w5|+VD0ttvo6>)Lk07r{QEEPUg6U_|#n0!#fedc`%ymv^wB#kaF&XzM zQZ~#)8bGJ!Ow>r+$Ek1Jjm#NI6-W?U7;1EPlp%#hi3gy`8a@?|^zWbrH{pI4W`HX3 zXX1(-Usbvj3f&CF9X~9-9PpN4q+8R0O9QaBOx=4G`(+*fq}&bl9a?u(RZB&LlU5tK9C_`^Tc-Gd@6ozsFsbv##| zGf}J09380lI}+kjCA( z&FwOvoRsaZ&u5SGM)a4j9{CLy7`6$!^{`G?u62G0;eMHbTAl#X)5&=6`v@MY&Bx9{ zWwzq~fHv1oZe302M8ZZAUru5|V{n+!O9fA2FZL@bwZGxI`U@`twS8AmVG(s1SRX!w z`Gmr0O?{-RN4wi z#NBNb$>oJJOb>INWuuMs>yNn!eV>FIhjrKu?5h|swaA@&8rmE?EFeP@KICk*P=;X$g@W$BL&_~Da8)qxfofk zEK;J!B*qfu2Xe#(2~MYusM$Z-JT(jqM*m>YI8OGETHCkX-cGK+rtGU+YHU|lNYqo5VlJ)0 z;(q3$H8y$eMUZ`q+qCbl%w@|oc-HZ5y%ma}WR2qtorMc?ER z;M8eqWE}KD>f!tRCo!(uYkucz)ZM~dJ0+YQL3yU67CJf`7PYbry{NdjGYE<&uBiPt z;9wd{Dxb`cZDBu5m`~2Y?uOjYKM;&;+dZPKx3NYeCCmoh3sRfW^<3U&>4so!l(D4(0d(`*`7#0M#JFYkl`zfhOfhu6bIl56E+9zLX& zC#HQfV@*^UYXgTrBeJ=!7@z4Fz2pX)sL4+G9pgR38V$GOirU<}kpXJg6=;$q6wHBX z2H6aAvv`#D_nG#$B?=awkLkJ^!kr-jiC=6}4a2u)OhDPKy;J=WZx%M+vJuhEuGW(9 zZ8)nT6rsBwTKT<>mgU5=)Kav(O=OU+8D|8FbS>C{?lx5RU5$v!dGJG+7tgPj;-1!w zJ9VnZJSSB7Olh9YpF8zMNtp7!ypRCdtQGe$jcFu(+EfA(=8g$=Rq396F`;JjjD2vg z9UlpH;cApW3lqKlJ6FTuS7WvmZYalEf!pkORLVcy5vWoVBTa3#^3Y`3?aT)Iy%3W! 
zT0U9$a6Ao0a9`n&AOC76z%^yAk1(Tf^hBUyW8hd+un zB40+-I91-)eDLKvw3M~?w8)ZS<1J|FrM%s59*xG9&}2ys@HH!;3J2 zY12(DiAN$tVkQlMxbvdgP-f+kgd&PIb?_u$_rJjA#sdHq}rjJDj!* zw&HPSCmKPPm))%qZ#KG8k!&2I`UE)W&`o_WhY_Uvx}Z_Ph)o3=mzzF#wgRY#Dr`-@;gV|K_wO%hF$oSA;b3I%4p6N_Yn&U36=o4EfP z(YXHJcf65wJ84l(B>7<3ZoJ~%_RkZf#$-_qceHh>hgEnoOxtS#w?`xuI>S9s15-F- z;!X&C@}fDmEPp=F9864o?qHun>oPa&wk;n>3!!M{fY;(<_T3W#6+&Ds|3{>Xhr5en z+uV?SlYR0{xjQh0Nv#rA%0bxI5FLIVb|V(Oo0&QEFQ4!0li^4x{Wz)WgIurLNuji0{E{L!2DTay)}IF(U;Vi>=xXd=1- zEOL-$tPj5lLWN@~aFl_??4qN338Oi?4#d zEtD!sk78al+FMYFIMyITIl5hXLsN+T)X)&w5v?z*oeCvT2C60zN41!vE3X(Co^AaJ z!@3QFHC#g6jL7|!LQ$!JWJJ(G_&D}*$O{~P2mMgnSobuNnng#aji2Cg1!sD1dPx7# z2)>?-`6sXxMjvwZmk}zP**Naht+DJvl*roT$sozOdOxAX0#=Dftykl1f#e8G*GiaK z3UjRVCr!O!DYq`g_pL48rLe2`0hLONi0Zf!a@8YqWA)&VrB-j|(fxgba!~fiNxZQ_ zu+W9ML$?MRT{OezkZ!Up>`JW$AFcFdv(m9k*OwZN@}c6sZ*vTGmQUmOqZxoPIeyioB-<;lu~$KY94zZ&bxkbSAM z^M-ycBXognf8Wt&!Q?q9+ckQGQ4v@{Q1y5!tw@ZjcG6VX!nsW=o_3}5dHMh_G< zQa|{@)U95;eVD(mRk9t65+nSG<#82`#omWGT7_v%w@-Xw^8)U%+td!)yO-5XWU2U( za$;vOY1o-P#V>}kTI6l3@}$N1U!O-{k--qf#<%_4CCq3;B{r~@4(_*r3=HX^Q!JT`ng8KcyGA&9sbp* zRK{~Ka%GM*Qw>L5T)feobg6d<`2K?3BmPfg+)8J>Nh-9CBXC8(+{v@UnqstMg) zd~6vys-WE2g?KJrK(h`G&{33+);=VIJ}1QPW7G|2UdF;Z{iDx3+t(C6Sme z97cN)Jc|A@-^#MzhmEpzj)bO&B9jC?Z2p8FG-_;6CzJIh8Ph~sD;Ai&u3k zLrt1r2LhP%I6?0ht!gS%3_>v984D~lJbxLS210nkKf?L%xZ(X%xA1B?pTk*@ox6Ph(DoRW{06B4GlGA z8%xrJ!R(S&>R2J=jYMaJHM1q!Y~M?(YJ;z~8PanX2I|a`oy_NZ@9=V~cGi$1vr6E7 z-bXjBaI;2y((?K3=M=!=rWZ^B!z*AuuejRn6D6#i-(Hg%LYfJfD4Nz0bzM@a3ji#*)NYCdJ_vs zKH(^`iL<2w%0Y>&to`vAwCRubT7C+10&jx%300;TxN|B>zF#g+4l&Q#Ov<5mJ1e?jN z`T$p|G1YG(HsdCl)gnUqqKc3p_ zUzB{`gNJ+U#`YCc;y1fC=YUQk2oHlyxHi*Om?!~FagUxvMkm$e_lZMw7b=nkZH3a* zV@fa_LiK|7ZFV(6dzlZZWT$R_?18WRfEmXNRJ3rp|70J-6?y6pjJuIR^wW0ZO zqWEx0gFu#)M+;CG-A=M`G*4;sl2W(EwDz&@=a&kf-fM2F@#~XKLXAOiXo{?Dt%@0* zH6L$iV?-XG^V5<&T+NaP>mX{|%us}rE<)l;SbTc8^+O5;z>N#yyFR!!#+*=ULN^K4 z8T-=D-#Js-g9`O;gPrH#c;MSXjf)qM@7G3U;GmE=24>fH&Ww(hha6Qq@RUx$;kAJ9 z3XUWQXoHvFTr)nISg%LWw1uL}sjZ<$iDN-Jm}FVpj7&onnZ6WlTHb(MQbjfa?NLO- zIKoK8r?1uAf;2B8?@4`p=B7b~6kfW)^&CNaOQ(G^D_%y~an@a!ZJkNkqhL{&*X!GT zlsDab=A%|c82u$eK5-{GkY#CRY}T4YdL?3-oC7e7%`#TwwIpT$1Ul!jI@+`4{?syk z#=s&&cd#c-MME;Ab~UET5l%`U?Eywd4HefH_@@6P{n5@PN8}$BO^t$@$%@jx^2&bnN>dN>Q!n|{#;p$|mhccu8gl8O3F3n^k<@o}XFbwMcWPge zJOhR>AvH;a$jMq`OPth4>La3MjTe3M7t*`V!tN#ufAaoY%#$KXXEkt3gRJm~z z1;i?BDqoHT&1`2+0kVy&tss0~W70JVgx(kLFqNoq5p6~1u_lF45Uaq_{`%GTHJ(om z2j}!DA$(Ng$>Qa8hi-pn3zqnXP!oPy9}{Z*t}w(5y<>io_nvVaK}?!RD6T8#yPak? 
z8B9q^UA1VlPG)X_${2 z(JVQF@QY-KNd<%7eK}Y1#!dCf;_Y!BL`CIYD;dpqvkL0R&4h^63vWhjUt@3SHn}&) z?>y`O!u4QRo+9*{+3aB!ay3J&zo;S}rVLMC)uQ|MQJjR zjPRMWc@8pKNy0a$=8Pv&X`%G=$y6E4C}Y4*9xl*)xGGWoa@!xItpGKL=X zZ7qm-gNQ#>q<jFD$`!`6CRpKH;C(cp7An)yj#DMT?3AgrLTZ`pc>Z)gi!~dbsi06@ppzF4h_7UOj+IRbSQz{*?mN(5qmuLBorRY4KXYE)Vm>F3zmp_v)PI`0!iMsx;+YJco8?_`ow5;w=wz=FR&^tBb(h<&(lpZ#OB6!SVfZmK51a zox(?LKdRUDpuX=5xY)}EiXZg1Z|%9OG`R&lgei57u!Zc7u~{a`WJ#;N z+=3wNc1L=t7_!k_5_9Gy=f=eSs!@$e-}PdBOHl3`ezBR`o%>NmRBpX@HXiI#9X;5t zK4ZF}`OQtkp?Cmmlb#F?3?7;SYOENL$|SGeb^XQuN7p|dl%aTD5Pxk`2lN$y{YEK^ zz~^fZ56SvO@}8FCk9LQDzffUX43UjxH`0o8iPD5J$1@yK3H>BEOji+8i&LS<10sMn z80EI-9luHW*SVRWG>>l|n3@Z+4hk~)h;cwlJAr##Vg+t2i%id84Vv!yZu%1l`yBD( zD-^fyG~d1u+)ROvt}#0H0C9TK<*p|R^qpO)w}C(-ZS5I8xu~^BFLbWAE`BNB*<=Wb z#R6r4#Ny$nBJml}~}) zMY^r0x0ooUHf{zQmZ8_u+=J1H*r#Eol)evMMn+zo?3H_mzX{zRI0Ku*?jayNi+#m| z?>KlJTr+HG)+0hqXv#z}8mw0Yt2aC5g^qvre+x23HxE7L}bLC1dcW>1x{!?|ieIYDfLb_z|+(B{oajC_X zxtMBgE$YE=$$q{{mzb(c{+6d{I=` zR#MheoNxl*e5UIkP5TG352xR(hk7=s@e1GcwzoR!fQd7~wL0J5Y)q&_Q#!$K+sBi@ z2!N8>iqIG(P`@WUftRIzsWxHS6V8Wqpl=@~m{}7%_({9mT@iG(lM$f;O|Deof(M6& zp#p~~1>-6~uX7SNr!uD&TFr+Pp_1n%ysg(m9vhVn;fX%Fn;lhYS96mcSArrU%^=J%Mjlf&vZZ)OI% z7@&ETRs{@(`3EgTSBB69;;xP+H2OvJLj2+JGW_9>Y}259A^5NN&IkF?(6KU87~^w3 zxE(09d@7NT<-prv;ju*m5!d92bBoXW;i#7b?gtgT`Y(qDlMRM>y?e`2NPxLGzIno$ zFL40S5hS}iNKZP{+dHlf4T(!a_+R1zA9A=VLX8$8;{rMQoxDZ(SM8o`Ikn& zv^rG+ej#p(l2npbgc_=yjCFKi)F^{gnv_A&jmF2-yB)vYg$nf2G|!u&!V2W9VbYI4 zk-u!}+Sy?W%Vt*u0S#Jly%mI4Q?A%fITe!F{R#7bq!1_kjr^zrK0f=zpLPfsyfe*j z^CS~QKHV3h&j6-UBz|?oLr(#me1s4`IxM7x0}gCw_2O)FjQhMTH|Wl``0NHuc4qpm zw*4vQ*V~2w?oA@v2e&3i%es%Sb&*LT-!kBKKVIe3tupU5QCAeArN!COvSKT(CwOed zk?=fC<@=R>djv{*2o~P&#>?vdHcU;v&x0bMjeKhQMKif%=y^0z--V(xDo zQ?^Hbi{qN_a4BA(@n032a*DQDf zDy0yZ*i-w{T+M&mp^W!WkK=^9Z;upPw}vGWM?eHyUjao@{{(39Up!PXL?+Ii9A717 ztb^O@v&~Nb=@{yYK{+uBU^6nZJNqSkBF>zhjW{Z!<_A7oUQPlxr-NrRk7{<$`ipZ^ z?4clx)&+uByOtu5pB^ys@vnb@@?Y)pqbZ(+zv-GAE<>UGg+rfSnnEIp11@F#N96cl z&7}3t$hoSW#jcqR)b;sslW*jTS)2BZ{9CB?y!WV zzH)c>P&Xt6#DD!-5xV%7DI@J{A5E=GyB(a1o}y+Fc~}-J!BU0t&!#v<{zmD|zc{e% zPQm+GH!>S06WIP(wzhg8bQA>trrA&dPNm)KbiLf~bL5Uj$5;8_{ySd%D>BJge=*cj z05jTfEn=W=cMr;9G-#t@d64n=w{#$Siks-fneUgC)tkL`x?S@ur{W~UzkJ0D1(=o+ za83rmIb|Sj@pRUgUk5T&h~@41GE~Yw^`V}1;r?IB{SoT6jr4LxEB@WUUT@1NK$tI*HZxd8$WSF^?Rs1-q@T^d3fE znjX)(!J+O3Ae99{(#S&h^e}CW6mWzm#M^M!yxFr6>dS6>eR=MC525Wk8B9;Hbn5AS zdR_@|=|&M+fnR?8cB$;jA1byhS$+_US{$)ivkQalbZy+N_JKS0jj7*Mp3J#n@kS^@ zZTT*FYmO%(2W_W1|6BpBgWNXlcyeal=nYk^VXVNJ?~&g{{WbzrZU)0E^i~7nVRe4K z^1oNV4qqtoc`#GvQmAVNGpHIZFG??Purq98%lKn8RTi;XBy9N+_Zu)yHT*fgBM>&R zx7zL;GG8D`5I(DN@k_jl-%|cs{W|=>;#9rIJza8;8vigQ(no0@{~@RZ$&t<7c;Y3eU2f3JSoZD__y zssTjzK{%!egfbaW4At}=Ii}A!FcZyW~R(9r2iC^~g7>SJuA|M=zqvB&D zAm}(vr-Ap`fBs^{A4dIU|6hg|(Gm-?n-ZL0;erFjnydjm{@Jvr;a`AW1I~>t^b64N zuNxhs$GWtNy0FT^Kbl@R&iND1LmjkMq4 zsYS*I@C4&GPkx*9w?w~Mi2j>@%iX{igdb=>M?nFY3Wxx{Rl$_!=`S2v;8;geuMhkLrnknqP~cXeD{pTe>B~+DNXaqz!F9nPrnGT z?jkGju7^<~V5bZ{Blh{_%71wMFNuCR2*n>xbn_QD?|sJ-@8O->VE+c^r5Q{_@PA>Y z2i9ZNeIfkvLLRuo9n5TZk=eMj^S`mu@oznP#s4!an_WgiZGdN;r1Qc&06J2-zWU{x zZ2ygwzJCV;6|Db-mFaJvX!d^39FzGV=LnXh`BXC+nFf=6?6fEegZ4j1puF z{~uunVx_j>2@jAo{>d;A_P@~dxzmIEH=3_pU^T5rbAJVwKZy4$+@0(EWuB(*IR$q) z;~xw=fQ=Q9mRxQ=8Wp`Y$SPjvOmk7jK`v{ zGBW?;izHMaXPNxX{@+>qUz2y!-@((C?-xz~xzF6+$vYhg3rKpulXpb&U#aPzdG6p} za>&dtZ~f;!4pn~v;em*NfMxZ|t<+g8{tgfSNcc^E^S{aB<1c{!xzC`?UttI6c?x6& z-Gv>+in~t1A5rq3odSoy!$3;pF9-PNK74=opBRAf;zvZP1nsVixI23i_;%J7CgKIn z9NwHN=sxY9>MuVW-i(kO{J8dKH{j~kCYDytgH?`qzORCzQV{_}=+eymrC&7h%r}^b z6R2ni(2z`U_w}lKZMd_Tokj`ok2yGG*+#s|?V=ZWiV4~j9!ByVS-^ga#O{)21F<7d 
z>z-_f?68qvYs%FQ5%Y!(y2AFcVsFI7NbC|@$!VJnKVU8j!-c-6YqLg|1|Cmn_M7f5 zgKDvc)IBQncygwN7$_=qm7xt^PbSL~ixsXhHyIcfo76UHwch+#iAb@yduf%B;~y%0 zuXr~rpOKLX{>V3$>H?eU4Nc2f>UDE=$!_J@1oJtQm9R1OZ+qdu2LIS_q zH4wTies7n~ZiAP)((zI*s;@q8t#_iMDS1E`-q`-%q$5#=viJIsoIjcQ;Q?C^7z(?M zu>2)fryeJ~g(1~^e$~v{1m^wHU_ocGfA;HW`HykfLML;^Cgj(j+kSWr@U=2UT-Aq{ z`1pq}v5{={k(Mr2lQ>zm$)DT00a5 z+=a-W<%AzFWcJl-_YN0@X&ahN=WGi7vmI1QRl9WH5iQF|K?tsLH;-l zK%XYO)CDF8FDXh;gf?dE0vswR)1&xX5Ib@&sR}n`5JLv3?amFKkM`+x2xu z<1Z@yaheLZBiznOLEsUl$vFJ-hN$g-6X9ymhaCSN!q;b*tHX^rm6t7eh~*AboeV%X z0nS1}9N}?Qa@WKEx1f#xt*i=66dlpos{FdStiGhuGSLuX|ZshBX_{61Y zIfDfCk&r@Qnc%X3dse7k5LD?p)7suhY(}tk>BF?|pOaiR&w2>tY^$ zM_Te^Ss(FEt}OWpUNj$sbu}3xPsCfAq>fdnO`TMXBL9perbAF;Uvm^U?p(vFWqbP*5wm)aymMqSWp|nX4aQ|Db8rxsE0+UzGxbM z)Q;>#Jy_3r4fhu;)0c~{ZpmD5F){SoLuiL1bMwxFA6cE=yOf0)&#}FB@ATyL&%%FJ z&AO@5hG{fajra|o-9*jV`Q(Be`4gSHV5DD6M+u_p{dq!5sWMIfaRCYWB)NeFsXdY8 zu_%iv1N(4twzpXlKxtbNbyOkv9J!OEF*hRNJxXb_h_?AKFK&m#W zns+~17VUqYo%8s`I6vi-_!=ou|Wq-Od_vm-Bf-i?(>=sVpxyNDw3i_XMy1m zDxJ~38f*v;55H0Ssmo{r3zy4GwY@BkrGM_T?6#%&aPU;M|570^xI;f6V|tmuu)91n z|G@5If-XPjk?@nxToG-oTb;L9v`>!9hBKPpj;sEBtQRbp?2sYyFks=rjTFX1T6NrH z#W`R7RV}`Fgd4DmJh7XSiSm)GNeJh;P;dD8GrV^=6<$@_WSr365-AJ~2)y9VA}`Se z6Ox3k_hddBK{PLaT%1y5Cw>AEbJ49}iF2AZ&fYnU(xiDX25h#<5iD~AX~F&FFw-I) z8`7f2PdOb|4xnVj6ex{v6X}TXREaI;-2H8t7vEdSXQH^S&C3t&NIWyhOu4td-GTX> zYdxvWVxc8MdA-IE-ttWHLqV##tW@xo8j?$=XL?5*C?t{EP|B!vILNgVB{SIyW2LA^ zWICRfn3)aD>p6!#!tf?X4a|J$bVfb1^McEZW1!ddcu#j-iju%XjX49CH9u8L$8uSG zILDp>Q{=iy=~;Td|wM!4nM(EU_iXWD%wcRGvL3CtNYd1sdT*eCgXHqaqEfDTgu z31+^qM6x{sQes;II10#ea-RT-6X#v^{^DqVF?+W=>pc_}6dR5Gju z)mP;q@$fz)V7JAXH16kM7$2;%{84!WL7rC8f_^yfzr()0#!tyVgRz_EknNomx7ky^ zr!x9+ncC2t76%$BmZg#y8f(W&X86D;v*C7hSMW87w#xiP>>cuTY7q{fzLWLDkraGR z7Btuo4rf$hXAIhAS`Mhk6;fn6^7%t~r{PFQ1(NOMN&Lurmm6#(U%|%q8j(V zZqD3MKlQVoCkI;bdV(2$z;z#~x=)P|8bkNPghDVbb?C`sgUhpF4SJ zp%}2P3vCf=Y&48u2v%+%|JhJhUJOh4mqfOYD;#GMPbBQ$J)uGhprV$FsVz+qfA`2f z9&enAfWX2^-;LkQRF5CM@%?-0Rr;o9ueE*KmiV-qKXrT`8c0<#r1*Np_qFZWP@3mb z!=Y9IbNOlB{J60p=w4Irleec@Ll^8w2^gL@4VnhgrQ&sjhr};PFx^fs0i4nM>5 z@Ic`WC`4Oi^%OTDe2jkOV9~Q+%Yu*5J?-JBZTFTZL$@3rDcD-!u_C$979#Mf$<~I6 zs-jFkHXM8K7`=M7Cjy)BOpuwjyavxE=p}0V15TG*hfgcHBj=+f;rFn>6n?xEtOH8O zC0{)Wm?>hv4(?K`aK7}D{%F}Lz5Xp6wI0jwjGzKX)nLXoio-ECUEs}k1Q2msf~#F& zrW4x(3TlTnRB`W3p1#NZdn#?O~`{?#Z$9N{duiM>x%~hID_6IYC?&35AoS6`4Vua zOQrH`+mi^Qjg75dp4TcSouFEIAe_4Q;-3ngLR4 zhNujvtuNJX(LFNxuHZ|Oa|*w@(^Hth1^b;ze)!pz+i4y8MLPG6ff2W{_8`3J6;pf>MMRm?R^_X@D!?AGoj5O;vb`NJu z!HI2*=v-}lk)&4)L1g#nazuEH7HEdI9~3_9)(9hL(*{{yv?I@N({LHWEAz?;#t<#k zRVEaU!rcABR!9kaVz}J>9>TBRqlPy~YE$~NhT8wEaD5L_5=)rUL*K+cH=vp}+mKt( z_`3SFI}O9vKa>cY(X6kV{Fg}@#E-q9FiydFD1}ft+iQ}?f=0{wnA*_Ugm|xb>!_g6 zoxXgRw-B2Bj?pTb%2$h$+FX1S!}UrCDsXM$#|e8>g@+*a7HmCxff?6L>ko`8mgjfY zum`N+$~fEG_qC1(C$aY{^M0lm`tkwSF^^n)TuZg*Y9?{C=m><3Zv_G0aoy@QNfOYB z+c6ZLs4Ls9;I*oQee+{N-|&|l3&IkT;^UAH5IRaG@{`8Xd6oaw7FM}87Y_$(U8eY8fcswarGyoT#j+u;CNQRY&TXIVKf~VMvRlDmPRCmW>-oR&HO_}wpW)D zmEgmoLuyaK_5+$-bd@8R)~ji1>n`9rsA8@r4wP=#K!;di*#ic-Vd7@WuQ6 z+Rz~mdqjd$TtOZ9SYikYdPkkV&DY12;kV>;0|>=Ls&B;IRVSs52PMTM84=e2>dRt7 z?sy-x*RC+-nej`*6DJQGL>3A~ZovT%>1V^d;hvemR)Rj;FQQ2b*Y_>cgF<0_m1Ods zG>1}dKhbpnMSZ${$e;Qk?p6k7wH(j(Jbbh3*?8>Q8)qp%0?eSR`It#RgNm^P1vXnx z-hy~u`kUXg&G6v}Vyzuv?&A~mdrc&}P9*hK`z%_856HG4b`LL0!E4xhrl>t*Nk#0f zA4Ay?b6lw4g&|E7sjt`xy;Y*h!L8VfGEL}qyEJMgVEm=T)`u<6zYEDTkjI7Q>(665 zCgp}I5D#mqP0>Qi{ESl2K;K6QaYdS;X-Y$#pQW-a^~ z^^D3M-t7C`*`Y%pM(9l+l3a(i$LwV)hg^xnelt2|0m|&AVGJj*egz;6gnRc;?|DbeO;dd275L{fLFDI?UnZ zeuF(->q0A`9*REQaKdpeVhe1rY5EZ~dsTum#-ua#goD|n2cvXAC?;D%Gbl9tsEUAV z-8gdM^8psO1!bsdGaAFkkT!hOpx*YcgR5&c6;Db}dx{VX0>33C)3~{;c_Fx?^4mvf 
zjVezF5#4!uXvOuTHaLbN7WeRj5mHdiwfIL1_ou_h!V`%3v72)YEK&s83F~ObGGk#h z1a!&S3|whODXpS7z0orkTX3y;Vi{%cu&Ga3hqb(I4alDQ$DtD|ol+VM@aFsSmC$ zND9a5b%Tx{-BhtWu0@F};Rruiw!<4W!*8#CwtRo!>eHhtfB`UW&oXP-}fT>+eg^+IOllCb8TQohHqBZx;Vm_~Mrz zEP!!(FSevquzVYza=Dl}Fl9v=v)PEE)=8Cu2lf!@h=5FKj;4}CX0b@!f!S)6#VOv0 z<4f#_ZbHlc!>S6%lDY{%VvuKuN@LB!o3hjTMU#RH)E;GW*WA0Lz}m6&Y1%GFFlGlt z&Sie#SvI!)NkF3>nh7V>(+0V^;b@~ItakOc@~VVr$Zsn&6tqLFKBl9auZ<3H2(Gqb zj!>ipy%=WtEb*H$6s(9o5Q{Umy`+k&wKhg0u@l_PZR(N5q#sr2)gQ!goS$Mux1O97*CwvDVWHA$=!JB)xY;Ra%3n_w4wlvSsH>vgJn4yvDO{LAOP2 zI^UOXXe(!5Pg!71`E_~zGFPbHUF__Ax$N%Qb+#fJBh(QtOPAFO)^%Sl`~0sKv%@zm z(qu;!TSY_ob8q$xpc_zxr5d_u3~{yzwtO)g(ju+n`9@8MrYechqWtjlK&+ zo6CW#+f$pcD?EwiqfR42clVZBY~(jb32A{beDM}6v&$pAaA-jWzP|cbg(W^=Qy2Bf<)BJ3)%NG zh>od~uf=q1wwy@h)XEg^GDto%g_8Z|o1iQ8v4EmUoAQ^B-PgY;)g{g?0wsoHRA%Emuvm9+|*>eeLHpkX^PsX&o$a?0c5pU5nFWU^Z zq7PdNO$EJBofKC0h&dM)Ue^Xp=xycH;6mDm{O-xD-Ed2eN0=yq?Z$VjDj3OM+kx#R z;6FbPY(Mh-`ak|{{o~vBoKua}Tdk%Bb!=NpJ$vjFLos-Zb!FTq*b{aChqku>itE|> zMF$9wph1JXOK=G8t|7rC1Shx;?lQP*fP_FuAO!cp2MZ3teS$-P!JRvhobS~+_1^v8 z@7DLGsHt6>p6*`Sd;NO#+5t%qz1|57JPZ;ZV7YaHj-wUmc)ZjQaR;wk-2`akV_xAY zm%3|{f1zxCKZ8#AS+`bOlDA|%Fl?<)z6!!jx=vqlB+)}S|1joeM=9!fdVuQ=M2}8Y z9)FH1*@*rX#$#AY-dUHzqTMVT3cg8VZZQmYUd}NQboUAXFOtn*=2_;A_meCM?^;UioAq^B75yXb;^Xa z0PbpW8l^>q^EQNzMzi~MKDkHf{Aj@JTh8=JH{{4i^p(z zAIaRyyNcd8Jp}nYA8YC$bROk^*(@_HYbecKLbFN*-1V@ ze4Z?cF&a~wSfX;8GEh1SKKdNLn*nh>Hv!;pn(QVU`#*=XOzxg#JoJk+Give3B@q*I zBz+Veh8}OX;r6~JG*Gz`27k_LGyIY`OKnCCLV;NPDiA)Tb5qJkcM_Mxlr%rW&k zlImy6P6To(Y&R)D%lIZm)?mUNFOR{tnGD}geU}Au9fW=8M9Qn9($IEO_e;4!!lU|U zN??LYBpjSXuY3p|PT~yiaC3ZJ&LK*iYbxxM&hw{9#3!!=b;2FfeIqDPaUMPIEqOzq z+cTTs_2GxZ__cA!@QOh>BgpErAd#`xGc&zLPKVx3MQF`*@JlXGx+QH?o1_m#i0*4e`053fIs$}kk7|{)0o^234Uu63T2EVY;O9^o`xRmfLy7^$ zu2MM3F1-RESuQJcr2uVL5q$gKlj$>{AK1_CzD)w2z|I3#D+LcxT4X}HG@uQ;l5oC! 
zf8b6nB>K45@X(=Daf3M@Q^-+UB>hxAluZ|mQm$UhF}Z_T3s~**zhE*O6@wR?B`baX z*5b8(bshjG+HFO7{JG=aD!j~)Ot$au-}5i**i6`=-d z?4$PO=zH#KbcpGj!+@m6m;yZC93yf&pvwK6xX|+&x>7DGwcGJnxxaFgE;}6l%6_6Yvo&Lj4BN*!7mfoO>rtp-%8@hB1=BywJo>dCWj)mvXjm*Jr?>x(7BYPDBc(G@o-*+u8R%QZ&vuV!Pl`Iad8SQjRK%B{#X#)6_H zha4kv&fnGNx&~BwCg(z5Sve+T_ZX*+*;L2XimTb*kPBGO##HS2i9QA}MMIEG8q>DJ zh%Y);8^sRt7yyNdK!D2Bo|hkSl?m+IBD_ruyv+|REA!m*FaBGP0QhoH1WxvcTql)`c|&WJyZ}($E|-!tKR8>xB{+Ir#-L|Evz+ zhhdd$BG({YUCh_^5Oq0X-S@7Tdt}@c(omyJ2qIc3cOb2Qr|D3J7sfnMa)eK2q}H@= zdFI|BwlNUCc!gB5m&08+H{6k{;K_r|+GIjidP6>f>J%7kDpe;}j9VzacQC9I&u2Ow zj0M2!C%o*U2-zKh_|V03Jx#20+Rrp7pTQ(iD0-UaaYJVuB{@T=3-=nCaNqG6^a;Y_ ztdo7RSb?+){wYUKA-u_Y=|xS+89rf~5hiUFbIoWhxW`g4Ah&s3XK>+XM)GTU z(Gi9QNj`&BKL^fOKYub45=sV9b_iN#ccdyS9Xe7W<8hcD zMK*Yq-7L5*6sValQ37fC(T?BC2p!vfxwKBeVn*YZc%<`g6~FPd=xO2}Z;D<-9s=Z1 zKQO^uR;&X@7=UFiNQ6wi^zEe_)f_!}`+Dvr+fK^TSh&7sUy}M#3}Wx;s^AHEHJ8PJjxgWR;(h|5Lwc=OJ@{3>)WHWIq*&jp zaXq1$U7hboB}$JI!tMEleW0S>>`e3B!=y~v7=zM8b)5J2?@_qg&C(GEK-q;?>0iJ5;4m3!&um)w?mniCZ=Y^-6Lf-h;quXvtCN>nqB20N9HF6HB+#*Xn}?~+#)8X)cScBnrbtxBKg}m<$+P^; z%D#)@ce7mT+FedmszFH7%WY@!Tz>SjRN%@e@|vcVR(O)NKSNFytAt-eK8TP5RPr?zW@(>R8bU(eI)MmY3 zGLbwNp;7f>-Ok}`eHLXL5G-WR73T)wTL?Nf3%J9N`a75-6L>p2&GvvCSBF39d@#D?(E(2IJ1aq8@30TrA~^Fl;Kxil7ab{FZI0^aIPPzy*cX z0iCt!veg~=FBaJ4wpYK)SNY>xYU$Aiq<1vv%e9srW>t6kpl1|D8qZ(qgxklZR~mNs zjGDTVv{i_A3$HBC=tiwli|kLJ@>=|6QN5#2KRdi4e9M%qEMGtHmGT(CgfjiLgC?;MmKboG`l`Zg;V?C8ecJlJW68hOwO*h`uEi zKF{8y@l!{H^$@mm&G4q=0MN!o%I#{Wz#7b>5q!7gBK_T?f^DjWJz#kgw#9A?5Z$A3 zQa!4{GD^QKKqH5o{8 z(}7m;>_D|?VA6gGxob?691{s zz>|9~@WCw;^>AQ0G2Q%Kyk?u|Z=o`TS2w2!5`9IRlL{xhV*Uhi{i1n^l3Pg`CR8rF?_y;Y9A zi=axeJmUzX!O}@snGCjWE0@7&BN*JjO~u`*Ov%sYQ(}R&NOEN&d5i^{(_<_0kJRIF zhBkvMjd%@D|v~6xyqbuOy}Y# zZQ%oC8=vx!W!;uG!FcRasqJEdFTDa z#~L!JcSk6qtWn{SOC?@(dh*L@%uM`OfiEqNT->cJJJrErDRJ8knbJ*i>(^F`aevkG zd@#TGXCkVuUoHI-G!ludC{ikm@O`52Q=zsj_|l$T@bGxReF|Z%LV0r@?%U~BYKh^{ zYyaHL1cK=WdYFU|yGVz%#4rBHRpc~yOq-JmvE{EFSV!0;VZNRka)OkB$4k?ojXx$FwaM{dWM z4Or^cJ^d??rhHkvAkW~QgiCO(CZKqr1ih(`b%I{B^Nq?=F+pw1)ww79LAb9D=FdSJ z;-XL&(l!0g{ZID2sYcD(&@lODDbaTHE0EXl71mS+iWzIc1hZscsYgM~HVjE2P>XvL zX#e@u;a3X}*z2jKZ1+lRyOHpoV?1@X4p%kS@HKr(Y*VRT_pzvdNyy9NWxngZ0}(5& z7pk)K|B0Z6A(v)JGaa-Hu!YOWG5OEa*EAls>n{7(vO;MA^XXOSFHEn-9z+VeWY13y zy>)KN02&+qm=%b0h)4&Xma&vjVON1+Y!~(-yt0g&T*yA{;*sH^Db=jG{oc3N_(uFe zW+!t+O|w+6c_Y9mp9i3Gg*woz9T`yK7OSHK9@&&yJJjRE)O!B0nJ5wXvEmElRnjV> z!pc%gtu=_qM3<1XIK}bcr#u{_h&Ooh(t`3_>p6)*hq_2@T(cs7JyuzP2jKme`rAzV zzkafOSC>^H9P?KfQVj1L7r;5)^%K^Cm2$6{7wxyJ_Hm@I z+!HEo_5=>UbUA9EB(qDiBp97=)&!%(bD7ulp$MYC zY%8#J)F!_z9vVHI#&tY~KPJWSubI+_8n^le{!*R9%{Ge8+3ZE~g_e)*3v9lZ_}w;w zFv+VrzQ~jV%=)aF6hIAgN`jXp@`yOVB#tdpcj)b9U`zTcB*#b*M-uFIwf|HNeD=QQ zm9y0dErabrJ+Ic9out>4r{d^~j)aA4js_mGN;DJd6;bwv9Y$l?tgS?CQ#rp#G{vG9 z=v4YEd$#n()^HP?QEKP7H$Ww@3EK*!q$-v=`xiAiCk{{#ZSe>wMb|2`(Fz;!m3?27sL|ujWA=Ni0*frM&oC6U zS~aRRoSFP((yD;fmDyn_CJ4V&z@tCW?t;(vxcpa5Y<~XRR9aJkc;>k_7zR!`^dfbt zc%N!!f~aKWH4kXZMg_5U!XAdm zqIDkP-*{gzZ5^Nl2*GHWUv}Ivz`#h~ZrXFnp}bE~-V)_Ls!&9dq)JHLsUR5PRqfK| z{fPLe810ojd+L1PoD;uhf30ejNtO{9y5_;`-9Mc9dJsu!ndl z?^8(n=SC5jL)uYq9&p0al`0CK2&2z1)9UZtpY`Oii_Q=3?-+9{a}Ovn8NRm`zc7@AFnE)zCRc~ z1}~fs0NC6EF@d}f=nDzF#yo%u^Zw99!vA~ogZ_U9TiV0#`?F($<2Y^q^Oul+AI$#$ zr(ymzeVir_69m9x*_Un+r1XE0s(kvZ458~Ra+?hY|g*;8rb*m zH2?R|tbc^N{-eQvFc{(g5H$ObUj7fUw*MUL|E`yRhV&l#g)H7w^JDfnliiuV6Oe4i z;AaZ;BV^$w>NS6C^R@9KzH{e|bMOFnd^U!<>q!3>VMR|OYSfv+I*I9!X{2dM?T_hW z`b@`@sBg302nqlGj8`?dchs1|Z+Y4xD@<;H>m}w&3L}MD2r50v) zdN;EeY4y9PfzB{=6KZ)sukjja`yATdxU4{dRQB*5G&f%mD7OW_H}%uW_%rlI0~xu)f;M5tg^iYnF}IMmB1BPb^AM}tD;EZX5p-bstBn4 zd=yX%uNy)Z9whptKEn+9z(sP;M9?$)P0W`qX;>tKjuG3nnW{#Plka3xeHMe|s&Kz) 
zmDJ#I02i&v@IT>(3$$f04RbNG-(+UIA~Jd@dN9e`Q$67)Z~n7dgLAOw_U24pmN-+X zsoO;=S>iddCu1&FH114>6Z+OfT;*eSSarOmrtcG;$^F(t?>VPmePNVcRdu^Bwp3N= zh+I3If6Ziz-}3*{0Ax8Wjl++^XV)r)RUD$JqLJA}FgP zL`pbov42_b(?1YoDfq75VWn#0k0f}InN=N@0^H8fhE!2t9QOte&)Pg2#`67zqow9` zUc2b=<~i#-WLq)9UF)|Zz#bkOFG@QTwT=YQ#uHUs?3?jdkJo3C3Gud!Nb4C75qsS! zt;eO5{f~f9d)#CKt^3>aDx|YoC9pj{ina_cNFz9NsC@K2tf~4VD}=kCM?~6DBYpo0u|WO*is8!M}H?T&A%I)`0NPVgdL>N8tZ0y zGb2{cL`{UW7&+Ya`O@xp(9G+hz#L7Gz@Bc!K$a5tJ5tvQ*q<+r3E-Lo5c)_xR&`6&>|F-TP1&%NzIWJYFqXR|A`s>2XMFI6h9OEKYmsH+_~uYtC7j zr%gXUvC|jvFU!up|8Y#!NfwpffZWmY_)TC$0-CW5?c;_M+1-I76^4qqg&NK#<)b^Y zF|1_s^_LuPEZ4of$8+#FY_f_P=hZr7alVU2d)iDUAMg%N4nlj#c{)Z<7S2vw(>F=?n{+X`E` zdcXJ=E)`GA7VOI>(u|MEW5$AtRMb!fioa8v-en9@pI=fWR5e<$&YVi&h2OPjAyBhnXK3ej}^-skt7(6j{`074(PP4EkYpUNIs}bwZ_O?gGMLvs#gJ= z%EeEm9#e-t^*mUjr`TRE^tP9>%`BS2Eh82Q^EA!sBL(T`lquxq^&|nqJCWdLHkCr% zXN5J;sxec6tP*a}iHxArpw8`VF^Q5(Vp#a=*os%@F+}FmwZvlD^>_17ZKrTACOsGF zH-#aZH1dFJ=K}OSu}o>mZTdG7jaqkTGj%{n<>wY3(epO9w@A+CA=-$!7Ge|F1B3Lz z$JyApWs#gP8dP(2DZ%Q7c|~x#8zxg;2Ey`Y9V2=eTj;~wHx@mRIXU_~t8%z%j<19JI!jtG&-@!Lc{&*rG zj1m71^T+~tW%D6p=_P#?5mpJj^1V2XX=|tGqvO~ti&R6USPPSoYilYo;Z@cfX^szb zuZFfDt|N-zUWDddU%GKcFxryjoGEe}=eXG)mgcA5jKpTK%*P?YU*KR5HS!g-=jWis zRUhmhdzYI>%|Obs#@YGCefaff#8sYDASv9ZF5(h-%VDY8@TJB|7l`s5(}X>CX3HAU zp=CunD7?5#<9l@388MG_w#5GZ`>bL7be>k(?0caR+|sXsU9xQ%G&x1TF0LZ`Xw?8r zquUW_EHNCd#-&?s*}t50pt4xjkAu9lJ0Qx5dnxNl%vADYMl9g4KXYx~9s+rO{A?h| z*JNk}M}gd5fs{nvlV!J*J&O!mIUwgWJ)#!jZ`$1==5tg8JL_+Hv|-isGwEzrh{sAq z3&P6PzDrdI-U@)yUBAjQ7=zeo4kXE~!;Q271?%?S?~#xybam1(p9!K`*iC*Wq+1;S z4v8ikd5&mt=r0UY+U3VIp;cYl<^L;CRZsX4TI2q&Q*u25xQtild* z7P4ksBxrt@6?MxIZ3LFK9epOoO|yvn3tt1$V;^Zo4eDlaE z)Nw%*gYl~(LrwOGB3`xUjd=S%?IZ=`6~NtFHdnSSG=y?t@l~sIzWv|mJ`!DApOr!B zd!4QG^jtM9d?o4P?`$Gp+k z0=FeNgb;*t#2-yYEOrV=5aL90m$uW3jpo=nqeqdZ0uzynNiw#v&y8zwXU zkd@fvnH`YMFH=wGTaY%P*ssLdqpVcP9Abj!2u*H#s9KtGRNW9C+Y2k>5nECf{r3?` zw5SdY^Kw^c<`-!YgQ^U%P^+cdiJ&Ac3Q6X#(lIEp6Y|fg2$A^F3E(L?OSd(4+$?Z! zQgDatPg^Gp(xEE)I538cHJ9E0KJbqnh;HRE8XrQhtM>?u^Nl>%PcRlOH0*WOEpUal zhHh?(630uc2h3_Tyce#6f%ttSb0I<>{7e9#_k*;C>R{r@N=_lrl+N4VqJcT-Zj*ol zvc_xx7UOFHm082zs0R&3l#x8J7z`fxMw#F9)&%8{K-diQS7hi8Z90I+;FKK{5v*#v zXTm(VBIx%)L@*IC#zv=b$Ewl}aQ1AAD6)smb|t;(GN{{^of}Ta>U4ZEz=OR-juFNw zy+II|C|bPamAbV7SbSj^@h^HP8H^2X9t9<*)wvrAxQ`;JI##)VTk`ew^0naq1dnQe zfZzGqdpSxPzN4G^Krf!)A!MKIu$>L@k)73Q5?2vX(NSdlHCw&dsZ*SosuH&PT;!_Y zWcf^A1uw~bx}qozEiaU@gffRGO>ypIc&#^5slKN`n9E=#)Lj|+cCN6;SSbdDU9Z+! 
zr-*G#Q#|oyz_)m^Lju!EEml> zpQm?Oc3Xmh!kA1A8oTnLkYf)@scu~x)W80KjROW5b_j2ko})SXBYvstYC4#Ao0SU_ zLfdo=@X;TOgbXX=mOW!j(zbm5irV%tCF@UPaUMmmv5AQ5mg_0*zkUv(gWZ~L#K9|4 zyaj|aOAsL*gdAUN;y&|hgm|4@CBTzRa_-TO{|eIoTM_*~HV%OopNItjf&k&oRw$=$ z@M5LjdiN3XAp-*cw^+eGWG;^lj=l6#-B$#PlDtP!2tw}X$%4{E0LKg-(1qCl{luN7 z%aDlSZ3885C(q7*&;SDAc_{$Ew8lNE^FMtVI9(>q68M(sad0)(_S)5#JS9g(r{J@} z1X?Q-g#QLc@{b(-cUiq~9UQpX=Q(otzSETkXUDSHK9=@uWIh0ca->+C*T{|5ZAvyF z@F@+oiEOrlY%nuREb{1w{t9#*S!decc&nn9Db1NgeE6k`nG!2^gs$}ienFI2hBk8R zgh>sfHUzrLRcNI!R%{ynlGJFTg|Vef;C(?;%TJ0fS9VZ8vjHz~*la%N%!MHre!C!3 zmo51=4|NPKTHx$d%kp)`qOyEx&tdgG!sbsCdgQX6J|UF zfxk~8>2>{R?)sH2&X$<;o5v!e9eN9bq)gKT|>Dn;jPcHK6!jrbC30 zhb{%kmwCz4Ny_VtRF_IMnYjM+B0+k(PIK9IwuM*rGw{||#_eMZ8H_e|s&lOAF=oaC z{ZUiFsk~T$zqarkkta9gIW{eD>dJBYEYiqB|8gQf^@mgD`Zb~3XLT@|q3j-7F;3Bz zrw2u&74)7a7Gu(FbkEZ6i)zqWF-Fr)gXk9-yGg}UkID&+zkC1_^75mww87vl5_{g^ zIC~9hxKs0*{Q0^5*8$ob8$3s1-3upDE#$}PA9Lc4#(G4IYYiM&3j|z{6g~XQ?j^s#dapJFgEhk;_5|ohFVo!ShFE@#FFr;6Qw7%}KA1{(phd{z<^- z%2;fJbX5$Y6jyK|dkNgbog2V${&(}Brq9ORC33%Qa68veDq5$e_$+v0v-SME{Hvnm zue_fb&NPKg@&-IoFexdh$}Gdv}A8)=UA zc|_4wpW|1xJo;v~=tfbY_IXPd8RQwF;^o3{VF9RWCX;}eg3e#XFmV|dNo_KCVa58| zwaQ;_Q9(Lm==V$qw6&V6f}UxYOlW9sQQ1IaJ2;iHfD2KyVg4eOv2k#8AKnIVD{iX< zuFXU?;1GUi&!8eQCt6+r+8|em;ID)Icu0l>+~u9n=kgR=zh{HJZ&ek6zx5UVy8}vn zN2s2G`@={8e-2fUz{rFMEGo}Cx%(uXqimftyH6n7f~VgfoO!&Lo1N?$5B_ksWm?2F zU6Z+7K9V&rvT&0OA>}RV2W`}`5Oim^DuXv`xf;B(>F;*|00nbdc>BppRgh>F7l!n` zoNdbi(j68~ADz^?{StK{u8qwqu#=tIclt7yYdFo9u=>;mQLmgW4!nKXD=%S#2Y5U1 zwSgls8WgKmDHo_xHhl9h9G*!Q(c+7^Z z78q2(P}BC`%}Ky^8P#m!KZ4Y>R|TCJy&d<3)(~o~kruC&Y%6}VCiJ}u5>zJkC>rJ$ z5Sppg-GACT}4t_DCWt0EnultH~l-j78=q>EpYJb!Ds zB%7#jPxi!9a4;IHbL=uUbgnFSMJqxf!dmW*-#?G1LVVcQ?BW9c2yHei-rzzNYSGxp z-!wAvo-sss$*sHH585T(GpW}^!{H3AsVRW(O-Xm18o@Jl;Vek7#7+i!ln(vMhrS;) z&)<=g?e8fy+?q^_ZDhfeM{Qv26aRVrA^KBTX)0t+C!?YP>#qzbm|%wlC+hB2%9V3y zJkgMn5);?1U$g>CbDTv9FJRr<)5Wq-SFdUNt;R(K7keZB@~St!CAcs1iR{!Vx`V^nckF=_H` zs(Sngj9{uXjV~s=^j*~Cc1k%cjh|oCMk;SeMGW1}B~@EfyD6k6#P5(Sr5_#UNzMLLhSUGhF z7bjE)XC03(P#!1g6!GN&Prl}X#hR4wp!)YpT4Dz>0GC{T$~0u`+ekmj0KQ5eXX?ML zw|pM1C?_tAo@bDGV;`S%yzlt#Iqb6-_&5E+?~r?)Y*Wf+SmYL=!#*)TI+QJ1U)a4l zx!gT1xLq3ht@!Hhr|--h3tTJ&b>ARO!yn)hvX+M0pKw2iU(F@}dx-7v;aBzw;2U3# zKeq;lhP>eu8hO~+`k-__x%+2SfWG?uW%a{t>xXT^k%u+l10UQ8v`&|W&;Z_20Zhq( z%R9U)0oxxCYeCTf0P%eycEUqW_W$f9yi`M`*VL%M5eLAe$`iE|^vRj5C%~@yGGw-M zNODtFB=Qj&E^{yePbm|Pez7)cL6m4MH?TO>INTd=edNy-(Paq_Nv%U z;sI)jTG1UH&c157DG^3$iFF-y6N*acO}Djs>RsSG!{L}Ya*7|BaOVrnNisuf zp?p1YY+#{x?TNP^$F|Zr#OOBt3H_m~$1tC7Y?c+>1^W;q(kroccTSh^T{RIf|0%)p zw)(m0SVYhHWM!{qX?)&h(m-dbP>lkiQlBcGemItGi?s}Ob&i6_>59RwDO7d`+>YAi zO4pO86k)zj_$T1$}zdRb3$3?~tMH`u5PevqV)lz6u!OfIwy(06ENI{!d_k=*r|ERp68 zL)A_}#6(F<+WPJWvcoGF*#E`#0Dh8Z@v@{v<@PVSa#!i_5!06wmP9zG0sR!QF$k*5 zTX~X#LX4VF?u9*NoWm2!KHWTy;7R_?zp2{nBS;lgm%btxT8R$wT!JOrD)H>=jNiWV z&@IKle36v;qpRKI-rHrx(cEsuICT7}qiJ-&Y+w=nqC#r@<@NPTO}0>WlAj*vxa z83a9J^wH2>MS!RB;*L0ED4HxJSREjKSN$B!!!SMqfuBqoqzJ7=0zJEC5wF-_T%5@G zL01S3zrM_gGd-Ls=yce@_szG(52NiFb5Fhw=fBV^!t=sc5^3&p1O32_8m;=0u~q}I zO@7HluC}c3UX$5qJBQCJg<@l>q}=X>P5be$G~A_ zU``L?mW)57YX@S>HRZoI+)F`*RQdC{_#M&Qga*`Ix5r?qH^Kv2!%R_DsxQm+?j!>|3ig8_mk3cZy=h?fS)FCqvo{I4NQr6KBl0Xf{F9lf#tq zbx;p6^0u#o(t%Wa8I)9~d(?({jxfbowj7n%e^07~#gpB+ScuP0%K9-gEFl<1G9MC0 zzU=MRX@*~cdSFBAX64$VhweFs{35hXl{QQ9g|>IosF_DV7EqgnBPKsv!>`%Gkg$F> z2}nCswqkwc4&A~Wn1)2#D(58#>n7BUNOBxZzNde|FtGhABmD&AaEjnIZ+fGm_7|in zqtES%nW6J4o9b4pi&dP=2b}iZ##AflH#!O zMs5X;8*MnIRMp`HVNiY^x2c;lm0axo{96G^?0$LxP4MG@k=&ww@vtE-?%8W*WiQcq zM}|;8P>!k-;W424i8m$03~Qe48>zS|7%K%q^=u*lZ7MQa6;zZ9Bm6$L1Id=S2yr`u z@O~)Z{s<{Kdh%Un#vMQ>=BLzNvFe!vG-3+x16{V`#`y~%|DnU59cH=pIN{ya5sF^z 
z;gy7)oX7n5lO>)23gS`UXfP(daaHys8l-4Eq__~`R82Go(o?Bn#Jo}HlS#zq@;k(F zGC2dyIgic1#9AQS3#3yZt9YR&JRL|NUWdC{xLxrzT^`%`>%{98J)8i|!ByP|e&{@) zd|A{#3uwQ8@(zh9*{|MICjef*nc5!dd(RYT3U;>28pz-?qECuDTu>6!_Cqv~Ec;gd z_4;B}!e89865Z0n@1nbK=y>4&xM`W+NGd=+QXFd5TC zDQ#IPkrCKW#DlurVai^-w_$D6>i20~R0L#pii1|~mips-XjRqP^w$}bKsRK>g9fGn z#TfmiGNvw)ZR>4-Th>h?!anqxBz2h?hOt>5>Pfd4$iJ8jELCxJKMig9;5sVE!6zi>AK_8W2QxJncrieN$hd+RuCprk)V890V5#fO z$BxFA>`gB|8La1Jv^nYDrOpK!#*sT-%!cr!lH7Y|#(OLapxTEpeRg$^ z@i>03Y_-rT}cyV=!q^Mcpp4vvs%JEw>}M2hZgqX z8;<%f!RK3yh~cm1Z-%GuV+%!|T5q=w$K4-;J`O_I*;dG!5(2nGGe;T+WR|`78~!el zlSSGr=H@m@&9I>v}^lCo?);|DuCfjExUN$e63}Yr!qfBDh zbk(Xp0-G0>5qqIG08-d~VZJ&Ov9p zcG@;yt#(nEGZu1zJhK)YP~FZftWtEKDY6*gg$3b{US;qApCL9IqR-jco*9MBU)}0j zB(J8!?S>}+jegp~{;7=mmZOs*+(IHD@oihaO;RE?%QqFtcOzzsyI)ls*>>zdhV)F= zEteQtk-2)@nE{q^$Jk4b$UuA% zh&09h9HU+n7X)%ETOf{KS{u&ON0u0s+g@n&a}IBdhewZij8(ui%);2+lkk9t!7IWG zdP3$fyo9Km03@m`v&=@cU)hH!lP6iOtAZi|wJmFXG&t44s-n;92tTn}=4r2r?L^~o zICi~~dQP6~PKWPHqOKfnaVZG6n$M!pu>$pQClDb_uX*`;qW)PnO}|3==ONirt521% zE&HDdM5frRA=o95M$W&Q;(W;r=uIYKrw+Xh(wwm@a**Exv|^~eztdgHL?NuVOcF~RNZf^>Kx_w(&!BZ2Fe?MIOD-e>^^3Ng}x zeQ$nZ~;de{fA_naX=1twt;ra1*C8WVkceRhYRYF_PR zdtBHwrP^$(F2ZyrVpiz>Bp2tVZO8?BrAZh+Z9!J# zYMXTZXf)MwAk<%MiNH}uY%ImrFeY zJ<0pxZv?@X3N*_S4JEONE-+#>W%1icb_=(M{G1q}R^2!E#$Te?=H8W9cp;|}i>XF5 zg~k(qh2>R$M39JsdE(Mmb17ugWVT!biJmqt#l*Qf2h~NstK|VEs|ZItW6nd>mHXHn zz2Rcd9$PTj{3aWys@PA(zf7B}C?MdC_N1X%6Zl?1tHvi;Qp)+4_G0T_H91P^>~)M% zVDWWep<^!Pz3P1O8^iSL8y{x6!kT8@+eQndikLpaTiFGEYHi0PpzTJC8bXO?kuYUN zU62I?#Q4Yc-#N8QlI*XzknX#7u}5M%kmEV#y(#CmS~VgGBr#x<##MhZ`zf=5=bMw9u$BV^)&hSZ z26=7~6@0|=!9{X?1Pnw;{1|1K>#gbpG-mSXZorl0HQ~VoBEW8~4brsJJn7wXIyGMj z7KW#ZBHg1gf%izxV|X?%0ATu%j7s!?`aT5Ykq;@ULRaoo0Ex!{3AWGjz3cRv3W6bk z?}MiQvw`=22J|pX*lk1ig235w`F$Av{yq%fwT5EjVmF~uKkIv&5LC`hBSq6FhmV7o zFNc&rNrtBC!IGS{VB9$r9zV%o}keM`UZIwhR67zn>~X0gv;-L#`b7!si+*(wEjdE0|`Ky=tDuc=mk7I-$m;LGDz5VR0(U zz2}Ku0|fom>G<>HYcE0ASaKhoYGqQ0#-c2genn`bH3?`@Z_EXIdbih+ofKA&LgkjT zuCe%c57klHHdDk{4ZN9g$(8`b6sre4<9jh(SLq}{i?&3NIf$Sy+!=#sI<@Ne_%UCv zEhB7f?m1>|Z{X55=nlL{q;bYr`YMK4juu;}4`14^#|I}OjVTk*_I5C*-Z+B?-=@Kd}^1&3G(45MPEg`oA6r|5>44kRT5 ziNPOMKAT4EeplO?ffS+rcH<8IaDdj+e}wUuZ3Y=>6gUbZ zw8WBBFiX4J-Baxlz8CMV>r;>h_S>9gZphl=hG?gY&CUIwsbD8%*A}5d$06VILde{h z7us7NY>?;pGWEQWfTl6@_dt(Np{aTZy*68}tZ-GQIUi!EI$mLNgmS&+*;?n}AgX6G zfkEWL@Os8GQjO1DP41u~)#$VG1p>;S+32wyCZ|p~3gGN~{nsB&VbR`!@@8fnGah*R z(iNLxV@eWp$UvUo55rS?wyZ>9f}HB|_~Q?7KHFAx(`7r&$$M zaO&c*fy}T3D|9I@RM5sh+)0+1tMJdonp&BjXyQ6$zry0 zqn-Q*0-i1|nvL%C@Q603U2_F&HI}s3UTHs)meP&)U^+{Z^treyU%;kU4_LgB$e$m?~f`YUR0@BjbB|VgYGzcgm-AXq@D^f#u zhmz9WLkL56cMT=Y(Cr!Ecj~>a_c~wB|M@i6^UUno&yKZM-0NPPMfUG&G;&XLBNvG9 z+21dUPb1dwuhz_~zs2=?^m=mU`RR>TBxHVRc3h(x`sdni-hyiqLW@8Xx?uk@@#BKV z6pO#Wk<@h;t3=#m=`SIq4MO(**p%Z6T-K~@ToHRQ%2>&@lr=4H6$tt0^fdD9-6Kf} z_&w4=3>#id-h3a2XjKi}y~|UQ9SCJ0Z?rms_ejDfjZ3sv%H<%w!b7^U3y{{)Q>_AV zH_Aud5~2=M6!2ft{`{SXc~1mO({Iz}vaAo4LG#tRz~Zwu%B0Jm0Cqj4m}(Oo_Gw1P zfH+1kbP^CJ98l!{n(TlYzCZq*nU($*K$%-}a@SXAO`0L~2u<^^N{-T?)15WPaR(wa zwEB`X>=lb6j}8BTnL)4N6kdQuSp|>X6&5C6zkZ#qo~Kp@=rh>A?DOIf{?L4vuS<7! zB?Uu5**LKb1lNR=xkUDQk>(UlgR1Wg_=|;mPVD1mQYFGi^>}2KMuCX>t=P}wT?#9? 
zQip-QrBYW(JQ|`0cy89n?-@bE z4di#15%{*7=R@+|O9=5(g`urEZ9Jz4VH$Y#@oQ2txf0(#AFF&>+zgssN#!%?AD7k) z@Uu^0`8=b>QkV)`f`>SNx7J5J+uCvlnfcE^jUsUO@rkDPICJpkVueKe=hM3k)o93b z=zJ0h)n-{b{3gadoe7EPN1=9F(qwDIxP)d9Ps-!Vzz6`!LMwR4O%o*}szMll#>`35 z>D}_#nhp9oKf#jH9I-z)b)K;K)^Pxl)bpH#RU(Atfu=74?QS-~fzz9G2pT2(SYa-Z>>B$@a`fk(yWF0Ex3@8;^&Q`M>!cKP83^HkHAd*~uB_4K{_rxt^ zf>F1Ng+^@C4oE?SiNZzfUv_g24$Qb|yGGvuu;*1;c8n9uLz(Rzil%#PyZExm{}bpW;Mx>3(!Qp+pNj$OZEGkxtXx=A zsaJ${W(Dj7UuBaMGG?ng5d1@!y$hXC=IcsnWN>ZqWNqOP zGT|)4$B(enpE@zx-&}-O-iefBKRR*l++blz9S*j8+mOIji><5}=cDBrduFu_nnjIl zfToM&`!c4!-rm>}53`y{T8iW~5x;db8Q{&;%voU#75sg_bb6wH0to+XNH_``!Foca zgR&?p4|z#E#YA>13zMREBAQBUZ;1sV3dI*UaKdxn{`C9+&NJc17Ab0;uovuW0$G`N z-TutYh-Ni27-_DmAU;YVW1@Z7b%@U8bZ<20DCr#>pwP55Hmp7GeKHeJ*iHP6D&p5m zkT2ccvwq(K6Dw!A6m>&N#H7lJh--TLK{0R7+*WE|@Y1I9E{|~Tc*Ekf{A_h$^$V-| z$BT+U4KP>`Z8n2eFRGn|LFjonk){3xn9qqwXHaXSOX1yR9VfhC+;y2BgMJZZfDE{T zRr-W=H8(o}g#KnM=L*ZY`#(7AMIbnj7;D&W8&QIPaz+y2+qF;3O6KKIi zna`TpE4g}J+}NeD$UywP zWSL#E$-8$%V;^5Qo_+y%SRB_Zv;H0CJY&6Jnj)4S=>J;mk9C*vLp=0OeB;nlB}4jt zIa4Y_(z%Tz1-exJMy+74va<9$Q; z7GnLq>Y zdRWnu4>*)Bb-)02L^!l7GXR|@d9l`duooWm9USPfy}9^mPDcqi-m`?E_BF)$UrLUh z?ht;2bfxb^Oe=wWqg|Zaf2*8V31VWp{`YvDvJ?C37d!tm0jT5!Dn6V=jjHZQfbfQA z2Xyy&3JEKHz;(Q6(EOH`^w19rM_t<8NH7Ai=Y<*PdX=rEVJls@kSLH$6hh<%5)Q;8 zR|RwkKc!pbi}|s5GTxzJLr*ijkMd0+QUcr#^Xw8#LfrPFIxk&Zr96d_6W;2`nvZh{kdt}$AtVZS`q z*CJ3$W!CLpJZxEbIJELq$quIbmy8@qX7qKP;y@XW>bIIZyywgnn3sTospI|?U^-0u{u+Q;_JZRps zeCf&c#MgQ}7v-l-Q2xEPj|m)FA>W`@#FnpG1D_d9ka?NLU(s7m?;*gS?0N5*!-#-4%N7H&VlXQotw^!fMKNZ(RCy93ws zGbBf!y#b=i5;=*F`4MuE^(hdkRihD>+Md_xY6YIX4ZDwWi?LE@LToRko{7wHxhm?w z*%cAyda9m20w;hV5JQC01?~}=G7|Sz{OAKt^Xf)taqWL@5jM z|L+ja|0gswsOj2GmNs;i|ISdl5O-Qn@TqQg`-^w9+(8D>>px|(vu}}Qr@{&3^Sp=Y zYP8D&!~5Fw&gyQ3uhEK+fmuRvb>(H%9h=TCD8`dYyZDCqX{e9P?s@y}ufST&|&Z}2VEeX=0+5loK-KKaVQGVWL_v|PF32UVW5I^1?|tlKo00hEck zaZh-PxNiJwqd_?#AtBh&`}>scJKO=DBJ!{}hx%0c3NA1MV+GcI4i#P$(YY@;K`NtC z+i>5&69#8S_1%<(G}%?mqyZyhKp5uADF}+R`~Rf`jX=3`9d#(@w4b@)$;M54q$6rS zS3G%aMq_myA9qUU)tt6aKZ8NBWARNWI1sB5!5@ma+<5MOKnnv#Y5-3u4KM!`;~x~u zMqq0?+uSMvTdLK6*Wehipb3G9Otx7mTP+$t|1J>O{j%gjG zV;!~o&&7t;pvr9pK{g=wwpRuGp5xi-V2S))QkvO!%jY>ZU!7xua*u>Pj?^ChB`EYH z7f>ob7&sy6ivNos;rbl^fV@8FLjo%npj6?&)7znR$6=xK;}A37SfgnJs~%bLzH1M; zpWxbN@sEy9vP(uei2R`L!V@&M1JpRTAUtsGUU^uv1lNvoZp&I$(ws?o{n2TOM!~#1 z#3W#G<*sVUK`Ww-Tdja^^jd@Bo*EW$5pP-3E}r)E^yICxG|`;USy=r!`-pmuk4tbp zDVZMf18O;ZiYI7f(*b>HJF@w*yUpW7w}NnC!1NF2?I#mfQ~U^hGQZ#=nzgYs@+gpp zW0B$${?klk1rIA^{zH)e@U`WlJcRJL@a&rksaa1EzOxz0!|I^Uy7>oAxT=47%*}&H zzgQ#;W!{K6TiNKt#kWl(#Yd{SAXb41@VFS7D)^Xg(cuX)q&)YcZ68Z7kN4x$HR_|+ z!;4q;_PbO#yJIXAfANvqWU=;_y1jvp!$1-Dh~Qo)zU|yPP7$Gej7tHdUn-#X_2Z(P zwv++(vvu@`&v)#koqpQv?#*Z9h>%8d(UKEZfSmJ27VKgHY*Xd~t6N*4qR*Zx-PO4g@r-8T``?`sn%I@xb$`eux8cy8Vfof%dSxpz9EmFI zhtZ8v(G!QQw2ux4mjCOJpQ=Y>lGf%1BC{=v!=%uwVH?pupx2Um3B~D`pW~%oxtFP45aX{wKE981-afblxndw=&DhV2yQ!s=keqkJH3)Ao58Utu7AuS+_M- z7((-{3+)!b1Vw_zC`fe|jU-_!0sEd#g|RfyiGcbID=5BgpqnJU4sYy(;5iCu5;C)< zz8AFZ@~f2bs0zecAr=m4QJ{;HBvEc`H<7)}B$MkSKv`$L7mCSACDir2WR zs5Ny$sdU=Uxyk134(;X=0yqC5Vb(E%^XAX}43p8p=iQVr0}{~Ose&rZbcX`8=?XKw zWo}e*S4Ln(#(o)z_u&3h04hQ>&I_T?#swF~&;7%!9lY*MVuH@AU~1`fMM%MdT9IW& z%mYf5&|VFoe7VIja0xzNq+oXVa_t`tQ1()0u;%r7Kk=5kK0W zY8XP6a^SgsEdcL5ljH`w=B+kWWXHsZIJ&}}wtLlIFWh}R!Mu$FSfKJeQX`*!dfLT!%Nc~3^~yaqCKh+^N~`CLR!`yA9%h{P z#)%wk%d3mRFCn7f5t|#LnE(xWD>t_wrk27JQ7R9yk<|bw&)l^VQJlXOg=fRd4EdTR zZP6P+Ub+b&!7mpd6I|q^=JsyI%6~%sZCsSRbmvL|%_fFg;a;WuHy~!L0QR16q3`jL z9ItLVK1_cs(wQC}@76yJ4AT&}*9clOC;8xVci9Bje{vYq@WsrF$)*u6kI!5r_yGAblH`QG-b_OsQM;HP%*F5xKBh z`O`EyN#Fl2k^~{G8KhdqgX30!d??b6{{y`ATlGT%3sNhB$>{!}f%T0+3y#VCN7PI0rB&vT 
z2SU{ct%vu2;^VzGqc|6V*krjQ+n83kbH9c|K+gd$XjPHZ5@&w~eM$N(^ybrOR{V(C zCx}cFfy;FHuKvYc+|ou&B9!;~?$wTIGev@;?BZEu+vZH-g?v}_|^AQHSK9uLt)`TY%frWyyFuOoyWIe(>Soz;8p zj>Dznpyb>Zdax_utL{)pdFk_x-4C#d?~XGZ90~u-pU6K_7K~&lk+LEE?fmG8`Yywl zRm4p@aR6;*QfTr?9i=ajh5AG#)@lUVz-KOob$0yO&jfkE`3$N$fd9e<$u4b*GQZuJ zZE~+F(zX_W`$JzD!Zhec_`p(e=Twba_M2C`l_f0|G=pleKrGmw4*g=5_PZWYlRkgETelKg z^<`44{F)~nln}GM)u0*Gy5cy-u`R(<@HZ5i;E>vTR`+FdzXA%SK^%FR=c8g;$V>JK zu^^w~=AcqC{rK<^@PyvBHEjKxGQe&mAxwH%A=V-ml#m;Tjc*%fJi{Ltl?XC2e)Dqr z7c>UL4nezAe{dK!2K%~lNz!}SY{i(lPuVpQj2^L*_Cn{6D;2wq&KuXufueMrk49j= z>RDQP|bX zLQYJX1fTE)#czaLjh`4O)co+S64!2nR&peS@j9_G!o$BwQ<>HRhJ=pJjKK-K21+(K zOkU)=149*Ty-k!F+y)mhQ10D*sXH^%0({?`F6!N`F6o18Cn@PHjYjfUc4K!%Nef%b z+$u1|Zw_=ZJ~cNjVJ3nc0iqT^;v2L!3G8U(h)@0Or^Ko?m~~TUvFR!9^`52(o^608 z^7Y-GdN6CN-cmQAb+dD0oPdI&C(5!M>(9U`wx5*B9WkSvQQyAX0zN-!k8SYkv!_qB zEAj^?pya|!okeKr^zhPR?>E;$W!R1;?9?#r8Q&Is$(D*=#_aD6E&Riz@7T~~u_gR1 zilULoDe>)ekp@m&rY4+NXpg?EWLy4yliT4s z{Z6V4$_y<=>)3Omo|{9NziR*g9K6MFA&qAa>~n*;x$Pw?w{=JB5v!OBimx~)VR8! z6K}H^f{QoaFe##)XE?r8PspF}vA>+IhRW9``h)p=lV+O>l`(G*Pp8(PmO(?Ur|K|) z-_n(A6GatPbb6REhhgzh&286ZtiL@W^u6e-gmv*qEx%>&jm!g6oN?y*c|8KlWJ1NnR1DqBR-c;sp5R@;2wCo-HaEDhe#M0|EK&`JHvsAo@U3-x1$J z^8BEr3c@r0qCuFU>=HA6(8gzMnXIbpjoV+skEH8U*C~^DP7nPZS@1jKjC@Moa6gX2 zLYO_9y@0_Rd%ER!fRmq=fGM$0dQIigVk9g7c<{ZViv06@N5#uO@W1@JBFmdAi>vtj z-68ugBB4akXkgqazU^~)T6eO$IcUg@2aIP$hL#HcmQ)Qq!g7*j3BRE%-*R5mYvWgx zPMLz?-3~0?LUOTD4aj_^wexU)6XCY8!B(V@pQ-p7NUJf&zqa1Gl7CY?y~>o6XJeR9 zv4|lAT5Kt6aqx=KWa#Vs`}fs_C(cNB%2Y_(HBTa97`sJ{>B&&c_#sl6?-!8{TvKKS zY7f7o5V^PWs`fOvuEcu=eguWACvqZN@#emA`=$-CNh&z^N(U*}yni}(o-~Y1f%9q!t&1S=iB1ML*>OUibOT@`t zW`4M#?l}EuH<;x~##Skn3d3Atw0PszeJ{-tAQ9@1Y=X|W}75JrcHB=Gmobbc2Nrb_4o|=4l`ke`+1gr1Cuw%>c=HvgxYD$~# zG5yXe(Doaz)(=M>Wr#!T3dI3eo)AqbUg6(d^m`NR=-pWl4Zr`BlpuLlmbM-#`^t3` zk=LAAtg9+>)_gsjWZcFVB&{Q3SJkpDPk;U~z*kyOR&S_Asomqg2-f@o>Aw6&b+{<>Ia49XB*m<>7c)r@)Rf+ZVlBo4tDu%A0ACacA8C23ly_8&fo1hDMO2N-&Ps5 zceXCU_IP533h65jUuuic{bnt%pt?e#St`KX^fQ6 z7^?GWLo_O|TLHqFvGNrsw?@&!FBHmMYutw>V}Y1X5oO@K-&*!2uQiYz-cLd7zn@PW z^JlS23I}5btg#oB<`J>hzqb_i<4y_M{LiUaBU+7LRGBZhyKGZy(dz@E8Vq7ZVG8U= zrhQj_uN~xt^M&CYZ|i`@MP_#2Whxq*pVWk0%q9X05^0sQY9qmQ1DnZn5UMaa>5QK9 z=f6xWN{}WWU2;6;QFMVBF3=Qj#@u1YYhE=*u&n;*&>DbB7^j1}hp3}t?1_gIfL8mF zp5Mmq0FhWj!QFB|^S@frKW;%&sk;s5A`<55AbMK9;*DWo zHXk#U^YDqR%hjjs_rmAz${ew~oMyfm2jEAmtP`l#NBkmn6@6#X1c)r&>YB1MJHbkY zdGHM`)_yO7I`#&l$h%zcXqpl4U|rG382JMZ5{|N{@8n8czQqbqti!b}Pd^hoZY(!^ zQip=|ugd>K6jWb&H+PJ62!+qrE%i>E z>Nkr1LXAo-o z1p;M-xPipcy>vyPul4kQlsE5%$N6!z%qv9Ag7)xS%5I!Sw1qtC2IVZ;tv@E>U zgw!43DJ&knacsY84RFbg|M}lC*Se+Kk5AtAP}J-l3-Syc^Z0IZ*fKESMg0qS9308D z7B~XME%@Bw!b4Tu?%Dki;zjw)iKc-DlYfZDQ7OaEAyi9$UaU(XWfe(4Qv0d@XVJPh#D!A=`T8`5SmB?wYxb0yyt+oofA36GY9_ z+leC3oFX$a>m4M}(iCuj7I15Sx(kCAItF($O_AUp}uk;BLG5UUpuSsh-N zo$m|&;>(p3Ppx11pWWGHc|9>^=NCV7ac@Zk?VM)~xDPuT2&8G8{yE8+*=E>~3QbvV zO%-mSa4#L?HJDyqcyFutC?S<4*h1>2uB@m=Y4G6*Jc&?k;n8ms%PXY}df!u7JQu_f zbjF%-px;MNjf88Vb5ymxZ*Ab;0)!L(jvX!~`Sn$Lj!@eY0uDUIj1wi^=tiKnRVJ8> z2QXe!yvuo?CGb6JCQoIb=^+@fwiV>gs*D>&WJ2`5VSPgZR!r98qQ;R*eQXc=R8a^R zs{br)5>fIaiO`ngI3B2<1k!Rq=1ez+kzdvN&zpP^vgBff+i-C{)5!W~&wN@#xfctx zO!sm9=avq7mT_T<`j_QK@Um_sQ!JF)jD$oz*hCVM_dT}A;TDo zLu_D{LZME-$g7PW1Si8ET53ZvPq5XF7&p0u-b;6tu3ycLf?% zS@}#6giDi&4kq3P+&@0FN_5$bWBP^XKTLmAUi(?{o}gvQu2k)WTs|VH9Zz1?QFpJQ z$VR;fX2J;ftXQjG+sC1Z74WO}jAS|5Re|aHTpa##bm)#|D>EMAfX{La{$-_0A2Xqh z?{ejZ*OVoK&`v5j1UJY-$27!|emK0W;R6ff!t z?+d2LD%=eIH41MkczEV1@s+tQ+Ms<%M?Y*!odJ_}&=ZFo$vGZxZsS!HyHjNZfSs#b zoO*~89&S#W-w+GZJp$^HCPFQZ7~#nKLoFj6Iy3VxoIYF4xmXFd(iNg>>b-ImRwvhd zCi|ia4ZyY`733Ul8Kw0mSBPP6Z|}+R$P$*2 
zzlRVnkKv=-hb?;|M7x8qb`@!*$g{$45MjGKs!tyzifkL!`_@!t-W?0RP+gJYw8_Ub;wohvL@?U~;QnJhP7WUVaO~4jvbb98PpI9xo)wJ# z;j%cs<*9f9XT2wMpi*TRd&mJp&(?C;fqfrh6)X#jzwquuzEK*{r)=PR3>xaS9457Wut_s)H^`WS1t#7uMi>B3g+?ds8cXXHaUiHGEhD< zEF57tS-dgj;*0MHxd>l)x-$=5bR7FPh$M@QEcUlAj=x)Ne!qt`bJnHiX68}~z&JMh z7VvLaC0icjB8aTDZkhWfnDNodgDU*4eJ7+<87Tq!M&2lN_KPta(*Gxv1#9{_YOh&u zgigG%z{Hk52>-FYRTd5VKwo6<+0w)jBe^C;ePv5cp%X9eh^Z~Tf@s=|F~%6Xi@tj! zkGJt|wIK24aH3Vf2Qce=(RE3nM_Un3pIf1xA#I*Rwvlp>G-=DsqDvcXAVh%=`kTOO z<-4^GUKY*&1=ZM!WHSA}Oq z&xLDQRwO5qmX;SwYbz(x)|V0xBTMTGXQ+Nqu(UHFGV8??sfeZAh=s)%V~nvo#NP9h zjmK}syG??``+!_3`=m(WW*zQ|i0lDUHc1%(PiV_%LwWR+wCe^Wrxm8Hk8onN(MOiH zE`cF!HTC4fApr*)gE+AbPP|X--A_v!K^TDXrCXD{Fz&@e!HaZ2N+m8!~Zi8GL@Ptbb5XviRiPP4W}(F#e<38cDok zk(1%b;nBgw=w-B17k z;B}cW@+t|ocpi%r??`0Lw%2S!dZAflNVom&?ZVMYPoy2hHn(N#sJ;4gJ6?dCZONDY z)x*n%TrCB$9ccx*iFc4ryrnH^1%~Ysl>Yvto`9D7_>i&(fTU#CKw9qa%|Hh&ti=S_>07IQFJ}Jd*P{A&V`v(004mU6kf0y8TpOJ z8V~RmiC6v>uQF6zcw-iLgYz)MFbu;mOnEa}d8$7yjmH`f@C}Lg{Cn{n3NJF!ftOj| zvBU!a0A)%lFHH?i(0Huz0MC$k9+sc&zaNuaS6VZ1LO+&z@S7z~dTC008ik zYDwzlM&9_wV~q!Rg~WUAy?9dUVdPh6uJzqS8KInK}$Ae|5cp3Q-ufK?QhR=TIcqRbQDK2@TzmS(Xs=3+mnCJ!@ WoB-be#xzg>0000 + +{% block footer %} +{{ super() }} + + +{% endblock %} diff --git a/docs/source/_templates/theme_variables.jinja b/docs/source/_templates/theme_variables.jinja index d2f00702fb655..333edd766fc1d 100644 --- a/docs/source/_templates/theme_variables.jinja +++ b/docs/source/_templates/theme_variables.jinja @@ -14,5 +14,7 @@ 'blog': 'https://www.pytorchlightning.ai/blog', 'resources': 'https://pytorch-lightning.readthedocs.io/en/latest/#community-examples', 'support': 'https://pytorch-lightning.rtfd.io/en/latest/', + 'community': 'https://pytorch-lightning.slack.com', + 'forums': 'https://pytorch-lightning.slack.com', } -%} diff --git a/docs/source/advanced/advanced_gpu.rst b/docs/source/advanced/advanced_gpu.rst index 8146744b521db..0e43d4bff4626 100644 --- a/docs/source/advanced/advanced_gpu.rst +++ b/docs/source/advanced/advanced_gpu.rst @@ -23,7 +23,7 @@ This means we cannot sacrifice throughput as much as if we were fine-tuning, bec Overall: * When **fine-tuning** a model, use advanced memory efficient plugins such as :ref:`deepspeed-zero-stage-3` or :ref:`deepspeed-zero-stage-3-offload`, allowing you to fine-tune larger models if you are limited on compute -* When **pre-training** a model, use simpler optimizations such :ref:`sharded`, :ref:`deepspeed-zero-stage-2`, scaling the number of GPUs to reach larger parameter sizes +* When **pre-training** a model, use simpler optimizations such :ref:`sharded`, :ref:`deepspeed-zero-stage-2` or :ref:`fully-sharded`, scaling the number of GPUs to reach larger parameter sizes * For both fine-tuning and pre-training, use :ref:`deepspeed-activation-checkpointing` or :ref:`fairscale-activation-checkpointing` as the throughput degradation is not significant For example when using 128 GPUs, you can **pre-train** large 10 to 20 Billion parameter models using :ref:`deepspeed-zero-stage-2` without having to take a performance hit with more advanced optimized multi-gpu plugins. @@ -73,6 +73,104 @@ Sharded Training can work across all DDP variants by adding the additional ``--p Internally we re-initialize your optimizers and shard them across your machines and processes. We handle all communication using PyTorch distributed, so no code changes are required. +---------- + +.. _fully-sharded: + +Fully Sharded Training +^^^^^^^^^^^^^^^^^^^^^^ + +.. warning:: + Fully Sharded Training is in beta and the API is subject to change. 
+    Please create an `issue `_ if you run into any issues.
+
+`Fully Sharded `__ shards optimizer state, gradients and parameters across data parallel workers. This allows you to fit much larger models into memory across multiple GPUs.
+
+Fully Sharded Training alleviates the need to worry about balancing layers onto specific devices using some form of pipe parallelism, and optimizes for distributed communication with minimal effort.
+
+Shard Parameters to Reach 10+ Billion Parameters
+""""""""""""""""""""""""""""""""""""""""""""""""
+
+To reach larger parameter sizes and be memory efficient, we have to shard parameters. There are various ways to enable this.
+
+.. note::
+    Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``.
+    This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``.
+    This is a limitation of Fully Sharded Training that will be resolved in the future.
+
+Wrap the Model
+""""""""""""""
+
+To activate parameter sharding, you must wrap your model using the provided ``wrap`` or ``auto_wrap`` functions as described below. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` and ``auto_wrap`` parameters are passed correctly.
+
+When not using Fully Sharded these wrap functions are a no-op. This means once the changes have been made, there is no need to remove the changes for other plugins.
+
+This is a requirement for really large models and also saves on instantiation time as modules are sharded instantly, rather than after the entire model is created in memory.
+
+``auto_wrap`` will recursively wrap `torch.nn.Modules` within the ``LightningModule`` with nested Fully Sharded Wrappers,
+signalling that we'd like to partition these modules across data parallel devices, discarding the full weights when not required (information `here `__).
+
+``auto_wrap`` can have varying levels of success based on the complexity of your model. **Auto Wrap does not support models with shared parameters**.
+
+``wrap`` will simply wrap the module with a Fully Sharded Parallel class with the correct parameters from the Lightning context manager.
+
+Below is an example of using both ``wrap`` and ``auto_wrap`` to create your model.
+
+.. code-block:: python
+
+    import torch
+    import torch.nn as nn
+    import pytorch_lightning as pl
+    from pytorch_lightning import Trainer
+    from fairscale.nn import checkpoint_wrapper, auto_wrap, wrap
+
+    class MyModel(pl.LightningModule):
+        ...
+ def configure_sharded_model(self): + # Created within sharded model context, modules are instantly sharded across processes + # as soon as they are wrapped with ``wrap`` or ``auto_wrap`` + + # Wraps the layer in a Fully Sharded Wrapper automatically + linear_layer = wrap(nn.Linear(32, 32)) + + # Wraps the module recursively + # based on a minimum number of parameters (default 100M parameters) + block = auto_wrap( + nn.Sequential( + nn.Linear(32, 32), + nn.ReLU() + ) + ) + + # For best memory efficiency, + # add fairscale activation checkpointing + final_block = auto_wrap( + checkpoint_wrapper( + nn.Sequential( + nn.Linear(32, 32), + nn.ReLU() + ) + ) + ) + self.model = nn.Sequential( + linear_layer, + nn.ReLU(), + block, + final_block + ) + + def configure_optimizers(self): + return torch.optim.AdamW(self.model.parameters()) + + model = MyModel() + trainer = Trainer(gpus=4, plugins='fsdp', precision=16) + trainer.fit(model) + + trainer.test() + trainer.predict() + + +---------- + .. _fairscale-activation-checkpointing: FairScale Activation Checkpointing diff --git a/docs/source/advanced/amp.rst b/docs/source/advanced/amp.rst deleted file mode 100644 index 2c25f9e7f918f..0000000000000 --- a/docs/source/advanced/amp.rst +++ /dev/null @@ -1,94 +0,0 @@ -.. testsetup:: * - - from pytorch_lightning.trainer.trainer import Trainer - -.. _amp: - -16-bit training -================= -Lightning offers 16-bit training for CPUs, GPUs, and TPUs. - -.. raw:: html - - - -| - - ----------- - -GPU 16-bit ----------- -16-bit precision can cut your memory footprint by half. -If using volta architecture GPUs it can give a dramatic training speed-up as well. - -.. note:: PyTorch 1.6+ is recommended for 16-bit - -Native torch -^^^^^^^^^^^^ -When using PyTorch 1.6+ Lightning uses the native amp implementation to support 16-bit. - -.. testcode:: - :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available() - - # turn on 16-bit - trainer = Trainer(precision=16, gpus=1) - -Apex 16-bit -^^^^^^^^^^^ -If you are using an earlier version of PyTorch Lightning uses Apex to support 16-bit. - -Follow these instructions to install Apex. -To use 16-bit precision, do two things: - -1. Install Apex -2. Set the "precision" trainer flag. - -.. code-block:: bash - - # ------------------------ - # OPTIONAL: on your cluster you might need to load CUDA 10 or 9 - # depending on how you installed PyTorch - - # see available modules - module avail - - # load correct CUDA before install - module load cuda-10.0 - # ------------------------ - - # make sure you've loaded a cuda version > 4.0 and < 7.0 - module load gcc-6.1.0 - - $ pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex - -.. warning:: NVIDIA Apex and DDP have instability problems. We recommend native 16-bit in PyTorch 1.6+ - -Enable 16-bit -^^^^^^^^^^^^^ - -.. testcode:: - :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available() - - # turn on 16-bit - trainer = Trainer(amp_level='O2', precision=16) - -If you need to configure the apex init for your particular use case or want to use a different way of doing -16-bit training, override :meth:`pytorch_lightning.core.LightningModule.configure_apex`. - ----------- - -TPU 16-bit ----------- -16-bit on TPUs is much simpler. To use 16-bit with TPUs set precision to 16 when using the TPU flag - -.. 
testcode::
-    :skipif: not _TPU_AVAILABLE
-
-    # DEFAULT
-    trainer = Trainer(tpu_cores=8, precision=32)
-
-    # turn on 16-bit
-    trainer = Trainer(tpu_cores=8, precision=16)
diff --git a/docs/source/advanced/ipu.rst b/docs/source/advanced/ipu.rst
new file mode 100644
index 0000000000000..2b62df379e0cc
--- /dev/null
+++ b/docs/source/advanced/ipu.rst
@@ -0,0 +1,234 @@
+.. _ipu:
+
+IPU support
+===========
+
+.. note::
+    IPU Support is experimental and a work in progress (see :ref:`known-limitations`). If you run into any problems, please open an issue.
+
+Lightning supports `Graphcore Information Processing Units (IPUs) `_, processors built for Artificial Intelligence and Machine Learning.
+
+IPU Terminology
+---------------
+
+IPUs consist of many individual cores, allowing parallelization across computation. Due to the high bandwidth speed between cores,
+IPUs facilitate machine learning loads where parallelization is essential. Because computation is heavily parallelized,
+IPUs operate in a different way to conventional accelerators such as CPU/GPUs.
+IPUs do not require large batch sizes for maximum parallelization, can provide optimizations across the compiled graph and rely on model parallelism to fully utilize cores for larger models.
+
+IPUs are also found within IPU pods, a collection of IPU-enabled machines for larger workloads. See the `IPU Architecture `__ for more information.
+
+How to access IPUs
+------------------
+
+To use IPUs, you must have access to a server with IPU devices attached. To get access, see `getting started `_.
+
+You must ensure that the server with IPUs attached has the popart and poplar SDK packages enabled. Instructions should be given by Graphcore.
+
+Training with IPUs
+------------------
+
+Specify the number of IPUs to train with. Note that when training with IPUs, you must select 1 or a power of 2 number of IPUs (e.g. 2/4/8).
+
+.. code-block:: python
+
+    trainer = pl.Trainer(ipus=8)  # Train using data parallel on 8 IPUs
+
+IPUs only support specifying a single number to allocate devices, which is handled via the underlying libraries.
+
+Mixed Precision & 16-bit Precision
+----------------------------------
+
+Lightning also supports training in mixed precision with IPUs.
+By default, IPU training will use 32-bit precision. To enable mixed precision,
+set the precision flag.
+
+.. note::
+    Currently there is no dynamic scaling of the loss with mixed precision training.
+
+.. code-block:: python
+
+    import pytorch_lightning as pl
+
+    model = MyLightningModule()
+    trainer = pl.Trainer(ipus=8, precision=16)
+    trainer.fit(model)
+
+You can also use pure 16-bit training, where the weights are also in 16-bit precision.
+
+.. code-block:: python
+
+    import pytorch_lightning as pl
+    from pytorch_lightning.plugins import IPUPlugin
+
+    model = MyLightningModule()
+    model = model.half()
+    trainer = pl.Trainer(ipus=8, precision=16)
+    trainer.fit(model)
+
+Advanced IPU Options
+--------------------
+
+IPUs provide further optimizations to speed up training. By using the ``IPUPlugin`` we can set the ``device_iterations``, which controls the number of iterations run directly on the IPU devices before returning to the host. Increasing the number of on-device iterations will improve throughput as there is less device-to-host communication required.
+
+.. note::
+
+    When using model parallel, it is a hard requirement to increase the number of device iterations to ensure we fully saturate the devices via micro-batching. See :ref:`ipu-model-parallelism` for more information.
+
+.. code-block:: python
+
+    import pytorch_lightning as pl
+    from pytorch_lightning.plugins import IPUPlugin
+
+    model = MyLightningModule()
+    trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(device_iterations=32))
+    trainer.fit(model)
+
+Note that by default we return the last device iteration loss. You can override this by passing in your own ``poptorch.Options`` and setting the AnchorMode as described in the `poptorch documentation `__.
+
+.. code-block:: python
+
+    import poptorch
+    import pytorch_lightning as pl
+    from pytorch_lightning.plugins import IPUPlugin
+
+    model = MyLightningModule()
+    inference_opts = poptorch.Options()
+    inference_opts.deviceIterations(32)
+
+    training_opts = poptorch.Options()
+    training_opts.anchorMode(poptorch.AnchorMode.All)
+    training_opts.deviceIterations(32)
+
+    trainer = pl.Trainer(
+        ipus=8,
+        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
+    )
+    trainer.fit(model)
+
+You can also override all options by passing the ``poptorch.Options`` to the plugin. See `poptorch options documentation `_ for more information.
+
+PopVision Graph Analyser
+------------------------
+
+.. figure:: ../_static/images/accelerator/ipus/profiler.png
+    :alt: PopVision Graph Analyser
+    :width: 500
+
+Lightning supports integration with the `PopVision Graph Analyser Tool `__. This helps to look at utilization of IPU devices and provides helpful metrics during the lifecycle of your trainer. Once you have gained access, the PopVision Graph Analyser Tool can be downloaded via the `GraphCore download website `__.
+
+Lightning supports dumping all reports to a directory to open using the tool.
+
+.. code-block:: python
+
+    import pytorch_lightning as pl
+    from pytorch_lightning.plugins import IPUPlugin
+
+    model = MyLightningModule()
+    trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(autoreport_dir='report_dir/'))
+    trainer.fit(model)
+
+This will dump all reports to ``report_dir/``, which can then be opened using the Graph Analyser Tool; see `Opening Reports `__.
+
+.. _ipu-model-parallelism:
+
+Model Parallelism
+-----------------
+
+Due to the IPU architecture, larger models should be parallelized across IPUs by design. Currently, poptorch provides the capabilities via annotations as described in `Parallel Execution `__.
+
+Below is an example using the block annotation in a LightningModule.
+
+.. note::
+
+    Currently, when using model parallelism, we do not infer the number of IPUs required for you. This is done via the annotations themselves. If you specify 4 different IDs when defining Blocks, this means your model will be split onto 4 different IPUs.
+
+    This is also mutually exclusive with the Trainer flag, i.e. if your model is split onto 2 IPUs and you set ``Trainer(ipus=4)``, this will require 8 IPUs in total: replicating the model 4 times in data parallel.
+
+    When pipelining the model you must also increase the `device_iterations` to ensure full data saturation of the devices, i.e. whilst one device in the model pipeline processes a batch of data, the other device can start on the next batch. For example, if the model is split onto 4 IPUs, we require `device_iterations` to be at least 4.
+
+
+.. code-block:: python
+
+    import pytorch_lightning as pl
+    import poptorch
+
+    class MyLightningModule(pl.LightningModule):
+
+        def __init__(self):
+            super().__init__()
+            # This will place layer1, layer2+layer3, layer4, softmax on different IPUs at runtime.
+ # BeginBlock will start a new id for all layers within this block + self.layer1 = poptorch.BeginBlock(torch.nn.Linear(5, 10), ipu_id=0) + + # This layer starts a new block, + # adding subsequent layers to this current block at runtime + # till the next block has been declared + self.layer2 = poptorch.BeginBlock(torch.nn.Linear(10, 5), ipu_id=1) + self.layer3 = torch.nn.Linear(5, 5) + + # Create new blocks + self.layer4 = poptorch.BeginBlock(torch.nn.Linear(5, 5), ipu_id=2) + self.softmax = poptorch.BeginBlock(torch.nn.Softmax(dim=1), ipu_id=3) + + ... + + model = MyLightningModule() + trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(device_iterations=20)) + trainer.fit(model) + + +You can also use the block context manager within the forward function, or any of the step functions. + +.. code-block:: python + + import pytorch_lightning as pl + import poptorch + + class MyLightningModule(pl.LightningModule): + + def __init__(self): + super().__init__() + self.layer1 = torch.nn.Linear(5, 10) + self.layer2 = torch.nn.Linear(10, 5) + self.layer3 = torch.nn.Linear(5, 5) + self.layer4 = torch.nn.Linear(5, 5) + + self.act = torch.nn.ReLU() + self.softmax = torch.nn.Softmax(dim=1) + + def forward(self, x): + + with poptorch.Block(ipu_id=0): + x = self.act(self.layer1(x)) + + with poptorch.Block(ipu_id=1): + x = self.act(self.layer2(x)) + + with poptorch.Block(ipu_id=2): + x = self.act(self.layer3(x)) + x = self.act(self.layer4(x)) + + with poptorch.Block(ipu_id=3): + x = self.softmax(x) + return x + ... + + model = MyLightningModule() + trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(device_iterations=20)) + trainer.fit(model) + + +.. _known-limitations: + +Known Limitations +----------------- + +Currently there are some known limitations that are being addressed in the near future to make the experience seamless when moving from different devices. + +Please see the `MNIST example `__ which displays most of the limitations and how to overcome them till they are resolved. + +* ``self.log`` is not supported in the ``training_step``, ``validation_step``, ``test_step`` or ``predict_step``. This is due to the step function being traced and sent to the IPU devices. We're actively working on fixing this +* Multiple optimizers are not supported. ``training_step`` only supports returning one loss from the ``training_step`` function as a result +* Since the step functions are traced, branching logic or any form of primitive values are traced into constants. Be mindful as this could lead to errors in your custom code +* Clipping gradients is not supported diff --git a/docs/source/advanced/multi_gpu.rst b/docs/source/advanced/multi_gpu.rst index 1c465ae314e4f..e645fa7d18404 100644 --- a/docs/source/advanced/multi_gpu.rst +++ b/docs/source/advanced/multi_gpu.rst @@ -106,6 +106,23 @@ Note if you use any built in metrics or custom metrics that use the :doc:`Metric # Add sync_dist=True to sync logging across all GPU workers self.log('test_loss', loss, on_step=True, on_epoch=True, sync_dist=True) +It is possible to perform some computation manually and log the reduced result on rank 0 as follows: + +.. testcode:: + + def test_step(self, batch, batch_idx): + x, y = batch + tensors = self(x) + return tensors + + def test_epoch_end(self, outputs): + mean = torch.mean(self.all_gather(outputs)) + + # When logging only on rank 0, don't forget to add + # ``rank_zero_only=True`` to avoid deadlocks on synchronization. 
+ if self.trainer.is_global_zero: + self.log("my_reduced_metric", mean, rank_zero_only=True) + Make models pickleable ^^^^^^^^^^^^^^^^^^^^^^ @@ -622,38 +639,39 @@ The reason is that the full batch is visible to all GPUs on the node when using ---------- -TorchElastic --------------- -Lightning supports the use of TorchElastic to enable fault-tolerant and elastic distributed job scheduling. To use it, specify the 'ddp' or 'ddp2' backend and the number of gpus you want to use in the trainer. +Torch Distributed Elastic +------------------------- +Lightning supports the use of Torch Distributed Elastic to enable fault-tolerant and elastic distributed job scheduling. To use it, specify the 'ddp' or 'ddp2' backend and the number of gpus you want to use in the trainer. .. code-block:: python Trainer(gpus=8, accelerator='ddp') - -Following the `TorchElastic Quickstart documentation `_, you then need to start a single-node etcd server on one of the hosts: +To launch a fault-tolerant job, run the following on all nodes. .. code-block:: bash - etcd --enable-v2 - --listen-client-urls http://0.0.0.0:2379,http://127.0.0.1:4001 - --advertise-client-urls PUBLIC_HOSTNAME:2379 - + python -m torch.distributed.run + --nnodes=NUM_NODES + --nproc_per_node=TRAINERS_PER_NODE + --rdzv_id=JOB_ID + --rdzv_backend=c10d + --rdzv_endpoint=HOST_NODE_ADDR + YOUR_LIGHTNING_TRAINING_SCRIPT.py (--arg1 ... train script args...) -And then launch the elastic job with: +To launch an elastic job, run the following on at least ``MIN_SIZE`` nodes and at most ``MAX_SIZE`` nodes. .. code-block:: bash - python -m torchelastic.distributed.launch + python -m torch.distributed.run --nnodes=MIN_SIZE:MAX_SIZE --nproc_per_node=TRAINERS_PER_NODE --rdzv_id=JOB_ID - --rdzv_backend=etcd - --rdzv_endpoint=ETCD_HOST:ETCD_PORT + --rdzv_backend=c10d + --rdzv_endpoint=HOST_NODE_ADDR YOUR_LIGHTNING_TRAINING_SCRIPT.py (--arg1 ... train script args...) - -See the official `TorchElastic documentation `_ for details +See the official `Torch Distributed Elastic documentation `_ for details on installation and more use cases. ---------- diff --git a/docs/source/advanced/multiple_loaders.rst b/docs/source/advanced/multiple_loaders.rst index 1a82641953c3c..02d5db143c95c 100644 --- a/docs/source/advanced/multiple_loaders.rst +++ b/docs/source/advanced/multiple_loaders.rst @@ -91,23 +91,6 @@ For more details please have a look at :paramref:`~pytorch_lightning.trainer.tra Furthermore, Lightning also supports that nested lists and dicts (or a combination) can be returned. -.. testcode:: - - class LitModel(LightningModule): - - def train_dataloader(self): - - loader_a = torch.utils.data.DataLoader(range(8), batch_size=4) - loader_b = torch.utils.data.DataLoader(range(16), batch_size=2) - - return {'a': loader_a, 'b': loader_b} - - def training_step(self, batch, batch_idx): - # access a dictionnary with a batch from each dataloader - batch_a = batch["a"] - batch_b = batch["b"] - - .. testcode:: class LitModel(LightningModule): diff --git a/docs/source/api_references.rst b/docs/source/api_references.rst index f73a8954f8764..3f9e2c2575cc2 100644 --- a/docs/source/api_references.rst +++ b/docs/source/api_references.rst @@ -89,8 +89,6 @@ Training Type Plugins DDPSpawnPlugin DeepSpeedPlugin HorovodPlugin - RPCPlugin - RPCSequentialPlugin SingleTPUPlugin TPUSpawnPlugin @@ -137,8 +135,15 @@ Profiler API .. 
autosummary:: :toctree: api :nosignatures: + :template: classtemplate.rst + + AbstractProfiler + AdvancedProfiler + BaseProfiler + PassThroughProfiler + PyTorchProfiler + SimpleProfiler - profilers Trainer API ----------- diff --git a/docs/source/benchmarking/performance.rst b/docs/source/benchmarking/performance.rst deleted file mode 100644 index 6e2b546fb275f..0000000000000 --- a/docs/source/benchmarking/performance.rst +++ /dev/null @@ -1,183 +0,0 @@ -.. _performance: - -Fast performance tips -===================== -Lightning builds in all the micro-optimizations we can find to increase your performance. -But we can only automate so much. - -Here are some additional things you can do to increase your performance. - ----------- - -Dataloaders ------------ -When building your DataLoader set ``num_workers > 0`` and ``pin_memory=True`` (only for GPUs). - -.. code-block:: python - - Dataloader(dataset, num_workers=8, pin_memory=True) - -num_workers -^^^^^^^^^^^ -The question of how many ``num_workers`` is tricky. Here's a summary of -some references, [`1 `_], and our suggestions. - -1. ``num_workers=0`` means ONLY the main process will load batches (that can be a bottleneck). -2. ``num_workers=1`` means ONLY one worker (just not the main process) will load data but it will still be slow. -3. The ``num_workers`` depends on the batch size and your machine. -4. A general place to start is to set ``num_workers`` equal to the number of CPUs on that machine. - -.. warning:: Increasing ``num_workers`` will ALSO increase your CPU memory consumption. - -The best thing to do is to increase the ``num_workers`` slowly and stop once you see no more improvement in your training speed. - -Spawn -^^^^^ -When using ``accelerator=ddp_spawn`` (the ddp default) or TPU training, the way multiple GPUs/TPU cores are used is by calling ``.spawn()`` under the hood. -The problem is that PyTorch has issues with ``num_workers > 0`` when using ``.spawn()``. For this reason we recommend you -use ``accelerator=ddp`` so you can increase the ``num_workers``, however your script has to be callable like so: - -.. code-block:: bash - - python my_program.py --gpus X - ----------- - -.item(), .numpy(), .cpu() -------------------------- -Don't call ``.item()`` anywhere in your code. Use ``.detach()`` instead to remove the connected graph calls. Lightning -takes a great deal of care to be optimized for this. - ----------- - -empty_cache() -------------- -Don't call this unnecessarily! Every time you call this ALL your GPUs have to wait to sync. - ----------- - -Construct tensors directly on the device ----------------------------------------- -LightningModules know what device they are on! Construct tensors on the device directly to avoid CPU->Device transfer. - -.. code-block:: python - - # bad - t = torch.rand(2, 2).cuda() - - # good (self is LightningModule) - t = torch.rand(2, 2, device=self.device) - - -For tensors that need to be model attributes, it is best practice to register them as buffers in the modules's -``__init__`` method: - -.. code-block:: python - - # bad - self.t = torch.rand(2, 2, device=self.device) - - # good - self.register_buffer("t", torch.rand(2, 2)) - ----------- - -Use DDP not DP --------------- -DP performs three GPU transfers for EVERY batch: - -1. Copy model to device. -2. Copy data to device. -3. Copy outputs of each device back to master. - -| - -Whereas DDP only performs 1 transfer to sync gradients. Because of this, DDP is MUCH faster than DP. 
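
As a minimal illustration of the switch (a sketch assuming a single machine with 4 GPUs and any LightningModule):

.. code-block:: python

    from pytorch_lightning import Trainer

    # DP: a single process drives all GPUs, copying the model and data every batch
    trainer = Trainer(gpus=4, accelerator='dp')

    # DDP: one process per GPU; only gradients are synchronized between processes
    trainer = Trainer(gpus=4, accelerator='ddp')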
- -When using DDP set find_unused_parameters=False ------------------------------------------------ - -By default we have enabled find unused parameters to True. This is for compatibility issues that have arisen in the past (see the `discussion `_ for more information). -This by default comes with a performance hit, and can be disabled in most cases. - -.. code-block:: python - - from pytorch_lightning.plugins import DDPPlugin - - trainer = pl.Trainer( - gpus=2, - plugins=DDPPlugin(find_unused_parameters=False), - ) - ----------- - -16-bit precision ----------------- -Use 16-bit to decrease the memory consumption (and thus increase your batch size). On certain GPUs (V100s, 2080tis), 16-bit calculations are also faster. -However, know that 16-bit and multi-processing (any DDP) can have issues. Here are some common problems. - -1. `CUDA error: an illegal memory access was encountered `_. - The solution is likely setting a specific CUDA, CUDNN, PyTorch version combination. -2. ``CUDA error: device-side assert triggered``. This is a general catch-all error. To see the actual error run your script like so: - -.. code-block:: bash - - # won't see what the error is - python main.py - - # will see what the error is - CUDA_LAUNCH_BLOCKING=1 python main.py - -.. tip:: We also recommend using 16-bit native found in PyTorch 1.6. Just install this version and Lightning will automatically use it. - ----------- - -Advanced GPU Optimizations --------------------------- - -When training on single or multiple GPU machines, Lightning offers a host of advanced optimizations to improve throughput, memory efficiency, and model scaling. -Refer to :doc:`Advanced GPU Optimized Training for more details <../advanced/advanced_gpu>`. - ----------- - -Preload Data Into RAM ---------------------- - -When your training or preprocessing requires many operations to be performed on entire dataset(s) it can -sometimes be beneficial to store all data in RAM given there is enough space. -However, loading all data at the beginning of the training script has the disadvantage that it can take a long -time and hence it slows down the development process. Another downside is that in multiprocessing (e.g. DDP) -the data would get copied in each process. -One can overcome these problems by copying the data into RAM in advance. -Most UNIX-based operating systems provide direct access to tmpfs through a mount point typically named ``/dev/shm``. - -0. Increase shared memory if necessary. Refer to the documentation of your OS how to do this. - -1. Copy training data to shared memory: - - .. code-block:: bash - - cp -r /path/to/data/on/disk /dev/shm/ - -2. Refer to the new data root in your script or command line arguments: - - .. code-block:: python - - datamodule = MyDataModule(data_root="/dev/shm/my_data") - ----------- - -Zero Grad ``set_to_none=True`` ------------------------------- - -In order to modestly improve performance, you can override :meth:`~pytorch_lightning.core.lightning.LightningModule.optimizer_zero_grad`. - -For a more detailed explanation of pros / cons of this technique, -read `this `_ documentation by the PyTorch team. - -.. 
testcode::
-
-    class Model(LightningModule):
-
-        def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx):
-            optimizer.zero_grad(set_to_none=True)
diff --git a/docs/source/clouds/cloud_training.rst b/docs/source/clouds/cloud_training.rst
index a2171f102fa94..c608ad2c1063c 100644
--- a/docs/source/clouds/cloud_training.rst
+++ b/docs/source/clouds/cloud_training.rst
@@ -1,31 +1,42 @@
 .. _grid:

-################
-AWS/GCP training
-################
+##############
+Cloud Training
+##############
+
 Lightning has a native solution for training on AWS/GCP at scale.
 Go to `grid.ai `_ to create an account.

-We've designed Grid to work for Lightning users without needing to make ANY changes to their code.
+We've designed Grid to work seamlessly with Lightning, without needing to make ANY code changes.

-To use grid, take your regular command:
+To use Grid, replace ``python`` in your regular command:

 .. code-block:: bash

     python my_model.py --learning_rate 1e-6 --layers 2 --gpus 4

-And change it to use the grid train command:
+with the ``grid run`` command:

 .. code-block:: bash

-    grid train --grid_gpus 4 my_model.py --learning_rate 'uniform(1e-6, 1e-1, 20)' --layers '[2, 4, 8, 16]'
+    grid run --gpus 4 my_model.py --learning_rate 'uniform(1e-6, 1e-1, 20)' --layers '[2, 4, 8, 16]'

-The above command will launch (20 * 4) experiments each running on 4 GPUs (320 GPUs!) - by making ZERO changes to
+The above command will launch (20 * 4) experiments, each running on 4 GPUs (320 GPUs!) - by making ZERO changes to
 your code.

-The `uniform` command is part of our new expressive syntax which lets you construct hyperparameter combinations
+The ``uniform`` command is part of our new expressive syntax which lets you construct hyperparameter combinations
 using over 20+ distributions, lists, etc. Of course, you can also configure all of this using yamls which
 can be dynamically assembled at runtime.

-.. hint:: Grid supports the search strategy of your choice! (and much more than just sweeps)
+***************
+Grid Highlights
+***************
+
+* Run any public or private repository with Grid, or use an interactive session.
+* Grid allocates all the machines and GPUs you need on demand, so you only pay for what you need when you need it.
+* Grid handles all the other parts of developing and training at scale: artifacts, logs, metrics, etc.
+* Grid works with the experiment manager of your choice, no code changes needed.
+* Use Grid Datastores: high-performance, low-latency, versioned datasets.
+* Attach Datastores to a Run so you don't have to keep downloading datasets.
+* Use Grid Sessions for fast prototyping on a cloud machine of your choice.
+* For more information, check the `grid documentation `_.
diff --git a/docs/source/common/fast_training.rst b/docs/source/common/fast_training.rst
deleted file mode 100644
index 2216d234836f2..0000000000000
--- a/docs/source/common/fast_training.rst
+++ /dev/null
@@ -1,82 +0,0 @@
-.. testsetup:: *
-
-    from pytorch_lightning.trainer.trainer import Trainer
-
-.. _fast_training:
-
-Fast Training
-=============
-There are multiple options to speed up different parts of the training by choosing to train
-on a subset of data. This could be done for speed or debugging purposes.
-
-----------------
-
-Check validation every n epochs
--------------------------------
-If you have a small dataset you might want to check validation every n epochs
-
-.. 
testcode:: - - # DEFAULT - trainer = Trainer(check_val_every_n_epoch=1) - ----------------- - -Force training for min or max epochs ------------------------------------- -It can be useful to force training for a minimum number of epochs or limit to a max number. - -.. seealso:: - :class:`~pytorch_lightning.trainer.trainer.Trainer` - -.. testcode:: - - # DEFAULT - trainer = Trainer(min_epochs=1, max_epochs=1000) - ----------------- - -Set validation check frequency within 1 training epoch ------------------------------------------------------- -For large datasets it's often desirable to check validation multiple times within a training loop. -Pass in a float to check that often within 1 training epoch. Pass in an int `k` to check every `k` training batches. -Must use an `int` if using an `IterableDataset`. - -.. testcode:: - - # DEFAULT - trainer = Trainer(val_check_interval=0.95) - - # check every .25 of an epoch - trainer = Trainer(val_check_interval=0.25) - - # check every 100 train batches (ie: for `IterableDatasets` or fixed frequency) - trainer = Trainer(val_check_interval=100) - ----------------- - -Use data subset for training, validation, and test --------------------------------------------------- -If you don't want to check 100% of the training/validation/test set (for debugging or if it's huge), set these flags. - -.. testcode:: - - # DEFAULT - trainer = Trainer( - limit_train_batches=1.0, - limit_val_batches=1.0, - limit_test_batches=1.0 - ) - - # check 10%, 20%, 30% only, respectively for training, validation and test set - trainer = Trainer( - limit_train_batches=0.1, - limit_val_batches=0.2, - limit_test_batches=0.3 - ) - -If you also pass ``shuffle=True`` to the dataloader, a different random subset of your dataset will be used for each epoch; otherwise the same subset will be used for all epochs. - -.. note:: ``limit_train_batches``, ``limit_val_batches`` and ``limit_test_batches`` will be overwritten by ``overfit_batches`` if ``overfit_batches`` > 0. ``limit_val_batches`` will be ignored if ``fast_dev_run=True``. - -.. note:: If you set ``limit_val_batches=0``, validation will be disabled. diff --git a/docs/source/common/lightning_cli.rst b/docs/source/common/lightning_cli.rst index c16e2e3b733fe..2e4b3f356f7c4 100644 --- a/docs/source/common/lightning_cli.rst +++ b/docs/source/common/lightning_cli.rst @@ -1,6 +1,7 @@ .. testsetup:: * :skipif: not _JSONARGPARSE_AVAILABLE + import torch from unittest import mock from typing import List from pytorch_lightning.core.lightning import LightningModule @@ -19,9 +20,13 @@ ): pass + class MyClassModel(LightningModule): + def __init__(self, num_classes: int): + pass + class MyDataModule(LightningDataModule): def __init__(self, batch_size: int = 8): - pass + self.num_classes = 5 def send_email(address, message): pass @@ -88,6 +93,8 @@ practice to create a configuration file and provide this to the tool. A way to d nano config.yaml # Run training using created configuration python trainer.py --config config.yaml + # The config JSON can also be passed directly + python trainer.py --config '{trainer: {fast_dev_run: True}}' The instantiation of the :class:`~pytorch_lightning.utilities.cli.LightningCLI` class takes care of parsing command line and config file options, instantiating the classes, setting up a callback to save the config in the log directory and @@ -372,6 +379,47 @@ Note that the config object :code:`self.config` is a dictionary whose keys are g has the same structure as the yaml format described previously. 
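
As a rough sketch of how this can be used (assuming a ``before_fit`` hook is available to override and that ``max_epochs`` was set in the config):

.. code-block:: python

    from pytorch_lightning.utilities.cli import LightningCLI

    class MyLightningCLI(LightningCLI):

        def before_fit(self):
            # ``self.config`` mirrors the yaml structure,
            # so trainer settings live under the 'trainer' key
            print('max_epochs:', self.config['trainer']['max_epochs'])

    cli = MyLightningCLI(MyModel)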
This means for instance that the parameters used for instantiating the trainer class can be found in :code:`self.config['trainer']`. +.. tip:: + + Have a look at the :class:`~pytorch_lightning.utilities.cli.LightningCLI` class API reference to learn about other + methods that can be extended to customize a CLI. + + +Configurable callbacks +^^^^^^^^^^^^^^^^^^^^^^ + +As explained previously, any callback can be added by including it in the config via :code:`class_path` and +:code:`init_args` entries. However, there are other cases in which a callback should always be present and be +configurable. This can be implemented as follows: + +.. testcode:: + + from pytorch_lightning.callbacks import EarlyStopping + from pytorch_lightning.utilities.cli import LightningCLI + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_lightning_class_args(EarlyStopping, 'my_early_stopping') + parser.set_defaults({'my_early_stopping.patience': 5}) + + cli = MyLightningCLI(MyModel) + +To change the configuration of the :code:`EarlyStopping` in the config it would be: + +.. code-block:: yaml + + model: + ... + trainer: + ... + my_early_stopping: + patience: 5 + + +Argument linking +^^^^^^^^^^^^^^^^ + Another case in which it might be desired to extend :class:`~pytorch_lightning.utilities.cli.LightningCLI` is that the model and data module depend on a common parameter. For example in some cases both classes require to know the :code:`batch_size`. It is a burden and error prone giving the same value twice in a config file. To avoid this the @@ -402,13 +450,138 @@ The linking of arguments is observed in the help of the tool, which for this exa model.batch_size <-- data.batch_size Number of samples in a batch (type: int) +Sometimes a parameter value is only available after class instantiation. An example could be that your model requires the number of classes to instantiate its fully connected layer (for a classification task) but the value is not available until the data module has been instantiated. +The code below illustrates how to address this. + +.. testcode:: + + from pytorch_lightning.utilities.cli import LightningCLI + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.link_arguments('data.num_classes', 'model.num_classes', apply_on='instantiate') + + cli = MyLightningCLI(MyClassModel, MyDataModule) + +Instantiation links are used to automatically determine the order of instantiation, in this case data first. + .. tip:: The linking of arguments can be used for more complex cases. For example to derive a value via a function that takes multiple settings as input. For more details have a look at the API of `link_arguments `_. -.. tip:: - Have a look at the :class:`~pytorch_lightning.utilities.cli.LightningCLI` class API reference to learn about other - methods that can be extended to customize a CLI. +Optimizers and learning rate schedulers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Optimizers and learning rate schedulers can also be made configurable. The most common case is when a model only has a +single optimizer and optionally a single learning rate scheduler. In this case the model's +:class:`~pytorch_lightning.core.lightning.LightningModule` could be left without implementing the +:code:`configure_optimizers` method since it is normally always the same and just adds boilerplate. The following code +snippet shows how to implement it: + +.. 
testcode:: + + import torch + from pytorch_lightning.utilities.cli import LightningCLI + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_optimizer_args(torch.optim.Adam) + parser.add_lr_scheduler_args(torch.optim.lr_scheduler.ExponentialLR) + + cli = MyLightningCLI(MyModel) + +With this the :code:`configure_optimizers` method is automatically implemented and in the config the :code:`optimizer` +and :code:`lr_scheduler` groups would accept all of the options for the given classes, in this example :code:`Adam` and +:code:`ExponentialLR`. Therefore, the config file would be structured like: + +.. code-block:: yaml + + optimizer: + lr: 0.01 + lr_scheduler: + gamma: 0.2 + model: + ... + trainer: + ... + +And any of these arguments could be passed directly through command line. For example: + +.. code-block:: bash + + $ python train.py --optimizer.lr=0.01 --lr_scheduler.gamma=0.2 + +There is also the possibility of selecting among multiple classes by giving them as a tuple. For example: + +.. testcode:: + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_optimizer_args((torch.optim.SGD, torch.optim.Adam)) + +In this case in the config the :code:`optimizer` group instead of having directly init settings, it should specify +:code:`class_path` and optionally :code:`init_args`. Sub-classes of the classes in the tuple would also be accepted. +A corresponding example of the config file would be: + +.. code-block:: yaml + + optimizer: + class_path: torch.optim.Adam + init_args: + lr: 0.01 + model: + ... + trainer: + ... + +And the same through command line: + +.. code-block:: bash + + $ python train.py --optimizer='{class_path: torch.optim.Adam, init_args: {lr: 0.01}}' + +The automatic implementation of :code:`configure_optimizers` can be disabled by linking the configuration group. An +example can be :code:`ReduceLROnPlateau` which requires to specify a monitor. This would be: + +.. testcode:: + + from pytorch_lightning.utilities.cli import instantiate_class, LightningCLI + + class MyModel(LightningModule): + + def __init__(self, optimizer_init: dict, lr_scheduler_init: dict): + super().__init__() + self.optimizer_init = optimizer_init + self.lr_scheduler_init = lr_scheduler_init + + def configure_optimizers(self): + optimizer = instantiate_class(self.parameters(), self.optimizer_init) + scheduler = instantiate_class(optimizer, self.lr_scheduler_init) + return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "metric_to_track"} + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_optimizer_args( + torch.optim.Adam, + link_to='model.optimizer_init', + ) + parser.add_lr_scheduler_args( + torch.optim.lr_scheduler.ReduceLROnPlateau, + link_to='model.lr_scheduler_init', + ) + + cli = MyLightningCLI(MyModel) + +For both possibilities of using :meth:`pytorch_lightning.utilities.cli.LightningArgumentParser.add_optimizer_args` with +a single class or a tuple of classes, the value given to :code:`optimizer_init` will always be a dictionary including +:code:`class_path` and :code:`init_args` entries. The function +:func:`~pytorch_lightning.utilities.cli.instantiate_class` takes care of importing the class defined in +:code:`class_path` and instantiating it using some positional arguments, in this case :code:`self.parameters()`, and the +:code:`init_args`. Any number of optimizers and learning rate schedulers can be added when using :code:`link_to`. 
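
For intuition, the behavior of :func:`~pytorch_lightning.utilities.cli.instantiate_class` can be approximated by the following sketch (a simplified illustration, not the actual implementation):

.. code-block:: python

    import importlib

    def instantiate_class_sketch(args, init):
        # Resolve e.g. 'torch.optim.Adam' to the Adam class
        module_path, _, class_name = init['class_path'].rpartition('.')
        cls = getattr(importlib.import_module(module_path), class_name)
        # Positional args first (e.g. ``self.parameters()``), then the init args
        if not isinstance(args, tuple):
            args = (args,)
        return cls(*args, **init.get('init_args', {}))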
diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 295d231ca5ac3..6043eab649ebf 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -279,11 +279,16 @@ In this case, implement the `training_step_end` method return {'loss': loss, 'pred': pred} def training_step_end(self, batch_parts): - gpu_0_prediction = batch_parts[0]['pred'] - gpu_1_prediction = batch_parts[1]['pred'] + # predictions from each GPU + predictions = batch_parts['pred'] + # losses from each GPU + losses = batch_parts['loss'] + + gpu_0_prediction = predictions[0] + gpu_1_prediction = predictions[1] # do something with both outputs - return (batch_parts[0]['loss'] + batch_parts[1]['loss']) / 2 + return (losses[0] + losses[1]) / 2 def training_epoch_end(self, training_step_outputs): for out in training_step_outputs: @@ -383,11 +388,16 @@ In this case, implement the `validation_step_end` method return {'loss': loss, 'pred': pred} def validation_step_end(self, batch_parts): - gpu_0_prediction = batch_parts.pred[0]['pred'] - gpu_1_prediction = batch_parts.pred[1]['pred'] + # predictions from each GPU + predictions = batch_parts['pred'] + # losses from each GPU + losses = batch_parts['loss'] + + gpu_0_prediction = predictions[0] + gpu_1_prediction = predictions[1] # do something with both outputs - return (batch_parts[0]['loss'] + batch_parts[1]['loss']) / 2 + return (losses[0] + losses[1]) / 2 def validation_epoch_end(self, validation_step_outputs): for out in validation_step_outputs: @@ -441,12 +451,12 @@ There are two ways to call `test()`: trainer.fit(model) # automatically auto-loads the best weights - trainer.test(test_dataloaders=test_dataloader) + trainer.test(dataloaders=test_dataloader) # or call with pretrained model model = MyLightningModule.load_from_checkpoint(PATH) trainer = Trainer() - trainer.test(model, test_dataloaders=test_dataloader) + trainer.test(model, dataloaders=test_dataloader) ---------- @@ -489,6 +499,14 @@ For research, LightningModules are best structured as systems. reconstruction_loss = nn.functional.mse_loss(recons, x) self.log('val_reconstruction', reconstruction_loss) + def predict_step(self, batch, batch_idx, dataloader_idx): + x, _ = batch + + # encode + # for predictions, we could return the embedding or the reconstruction or both based on our need. + x = x.view(x.size(0), -1) + return self.encoder(x) + def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.0002) @@ -510,6 +528,7 @@ The methods above are part of the lightning interface: - training_step - validation_step - test_step +- predict_step - configure_optimizers Note that in this case, the train loop and val loop are exactly the same. We can of course reuse this code. @@ -554,12 +573,20 @@ Inference in research ^^^^^^^^^^^^^^^^^^^^^ In the case where we want to perform inference with the system we can add a `forward` method to the LightningModule. +.. note:: When using forward, you are responsible to call :func:`~torch.nn.Module.eval` and use the :func:`~torch.no_grad` context manager. + .. code-block:: python class Autoencoder(pl.LightningModule): + def forward(self, x): return self.decoder(x) + model = Autoencoder() + model.eval() + with torch.no_grad(): + reconstruction = model(embedding) + The advantage of adding a forward is that in complex systems, you can do a much more involved inference procedure, such as text generation: @@ -575,6 +602,25 @@ such as text generation: ... 
return decoded
 
+In the case where you want to scale your inference, you should use
+:meth:`~pytorch_lightning.core.lightning.LightningModule.predict_step`.
+
+.. code-block:: python
+
+    class Autoencoder(pl.LightningModule):
+
+        def forward(self, x):
+            return self.decoder(x)
+
+        def predict_step(self, batch, batch_idx, dataloader_idx=None):
+            # this calls forward
+            return self(batch)
+
+    data_module = ...
+    model = Autoencoder()
+    trainer = Trainer(gpus=2)
+    trainer.predict(model, data_module)
+
 Inference in production
 ^^^^^^^^^^^^^^^^^^^^^^^
 For cases like production, you might want to iterate different models inside a LightningModule.
@@ -586,33 +632,41 @@ For cases like production, you might want to iterate different models inside a L
 
     class ClassificationTask(pl.LightningModule):
 
-         def __init__(self, model):
-             super().__init__()
-             self.model = model
-
-         def training_step(self, batch, batch_idx):
-             x, y = batch
-             y_hat = self.model(x)
-             loss = F.cross_entropy(y_hat, y)
-             return loss
+        def __init__(self, model):
+            super().__init__()
+            self.model = model
 
-         def validation_step(self, batch, batch_idx):
+        def training_step(self, batch, batch_idx):
             x, y = batch
             y_hat = self.model(x)
             loss = F.cross_entropy(y_hat, y)
-             acc = FM.accuracy(y_hat, y)
+            return loss
 
+        def validation_step(self, batch, batch_idx):
+            loss, acc = self._shared_eval_step(batch, batch_idx)
             metrics = {'val_acc': acc, 'val_loss': loss}
             self.log_dict(metrics)
             return metrics
 
-         def test_step(self, batch, batch_idx):
-             metrics = self.validation_step(batch, batch_idx)
-             metrics = {'test_acc': metrics['val_acc'], 'test_loss': metrics['val_loss']}
+        def test_step(self, batch, batch_idx):
+            loss, acc = self._shared_eval_step(batch, batch_idx)
+            metrics = {'test_acc': acc, 'test_loss': loss}
             self.log_dict(metrics)
+            return metrics
 
-         def configure_optimizers(self):
-             return torch.optim.Adam(self.model.parameters(), lr=0.02)
+        def _shared_eval_step(self, batch, batch_idx):
+            x, y = batch
+            y_hat = self.model(x)
+            loss = F.cross_entropy(y_hat, y)
+            acc = FM.accuracy(y_hat, y)
+            return loss, acc
+
+        def predict_step(self, batch, batch_idx, dataloader_idx):
+            x, y = batch
+            y_hat = self.model(x)
+            return y_hat
+
+        def configure_optimizers(self):
+            return torch.optim.Adam(self.model.parameters(), lr=0.02)
 
 Then pass in any arbitrary model to be fit with this task
 
@@ -1009,7 +1063,11 @@ truncated_bptt_steps
 ^^^^^^^^^^^^^^^^^^^^
 Truncated backpropagation through time performs backprop every k steps of
-a much longer sequence.
+a much longer sequence. This is made possible by passing training batches
+split along the time dimension into splits of size k to the
+``training_step``. In order to keep the same forward propagation behavior, all
+hidden states should be kept in between each time-dimension split.
+
 If this is enabled, your batches will automatically get truncated
 and the trainer will apply Truncated Backprop to it.
 
@@ -1026,23 +1084,40 @@ recurrent network trajectories."
 
     class MyModel(LightningModule):
 
-        def __init__(self):
+        def __init__(self, input_size, hidden_size, num_layers):
             super().__init__()
+            # batch_first has to be set to True
+            self.lstm = nn.LSTM(
+                input_size=input_size,
+                hidden_size=hidden_size,
+                num_layers=num_layers,
+                batch_first=True,
+            )
+
+            ...
+ # Important: This property activates truncated backpropagation through time # Setting this value to 2 splits the batch into sequences of size 2 self.truncated_bptt_steps = 2 # Truncated back-propagation through time def training_step(self, batch, batch_idx, hiddens): + x, y = batch + # the training step must be updated to accept a ``hiddens`` argument # hiddens are the hiddens from the previous truncated backprop step - out, hiddens = self.lstm(data, hiddens) + out, hiddens = self.lstm(x, hiddens) + + ... + return { "loss": ..., "hiddens": hiddens } -Lightning takes care to split your batch along the time-dimension. +Lightning takes care of splitting your batch along the time-dimension. It is +assumed to be the second dimension of your batches. Therefore, in the +example above we have set ``batch_first=True``. .. code-block:: python @@ -1064,7 +1139,9 @@ override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`: Hooks ^^^^^ -This is the pseudocode to describe how all the hooks are called during a call to ``.fit()``. +This is the pseudocode to describe the structure of :meth:`~pytorch_lightning.trainer.Trainer.fit`. +The inputs and outputs of each function are not represented for simplicity. Please check each function's API reference +for more information. .. code-block:: python @@ -1075,36 +1152,41 @@ This is the pseudocode to describe how all the hooks are called during a call to configure_callbacks() - on_fit_start() - - for gpu/tpu in gpu/tpus: - train_on_device(model.copy()) - - on_fit_end() + with parallel(devices): + # devices can be GPUs, TPUs, ... + train_on_device(model) def train_on_device(model): - # setup is called PER DEVICE - setup() + # called PER DEVICE + on_fit_start() + setup('fit') configure_optimizers() + on_pretrain_routine_start() + on_pretrain_routine_end() + + # the sanity check runs here + on_train_start() for epoch in epochs: train_loop() + on_train_end() - teardown() + on_fit_end() + teardown('fit') def train_loop(): on_epoch_start() on_train_epoch_start() - train_outs = [] - for train_batch in train_dataloader(): + + for batch in train_dataloader(): on_train_batch_start() - # ----- train_step methods ------- - out = training_step(batch) - train_outs.append(out) + on_before_batch_transfer() + transfer_batch_to_device() + on_after_batch_transfer() - loss = out.loss + training_step() on_before_zero_grad() optimizer_zero_grad() @@ -1114,38 +1196,42 @@ This is the pseudocode to describe how all the hooks are called during a call to optimizer_step() - on_train_batch_end(out) + on_train_batch_end() if should_check_val: val_loop() - # end training epoch - training_epoch_end(outs) - on_train_epoch_end(outs) + training_epoch_end() + + on_train_epoch_end() on_epoch_end() def val_loop(): - model.eval() + on_validation_model_eval() # calls `model.eval()` torch.set_grad_enabled(False) + on_validation_start() on_epoch_start() on_validation_epoch_start() - val_outs = [] - for val_batch in val_dataloader(): + + for batch in val_dataloader(): on_validation_batch_start() - # -------- val step methods ------- - out = validation_step(val_batch) - val_outs.append(out) + on_before_batch_transfer() + transfer_batch_to_device() + on_after_batch_transfer() + + validation_step() - on_validation_batch_end(out) + on_validation_batch_end() + validation_epoch_end() - validation_epoch_end(val_outs) on_validation_epoch_end() on_epoch_end() + on_validation_end() # set up for train - model.train() + on_validation_model_train() # calls `model.train()` torch.set_grad_enabled(True) 
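+
+Any of these hooks can be overridden in your LightningModule to run custom logic at that point. A minimal sketch
+(``LitModel`` is a hypothetical module):
+
+.. code-block:: python
+
+    class LitModel(LightningModule):
+
+        def on_train_epoch_start(self):
+            # runs at the point shown in the pseudocode above, once per epoch
+            print(f"starting epoch {self.current_epoch}")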
backward @@ -1256,6 +1342,12 @@ on_test_epoch_end .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_epoch_end :noindex: +on_test_start +~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_test_start + :noindex: + on_test_end ~~~~~~~~~~~ @@ -1423,3 +1515,15 @@ on_after_batch_transfer .. automethod:: pytorch_lightning.core.hooks.DataHooks.on_after_batch_transfer :noindex: + +add_to_queue +~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.add_to_queue + :noindex: + +get_from_queue +~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.get_from_queue + :noindex: diff --git a/docs/source/common/loggers.rst b/docs/source/common/loggers.rst index c6c5f0d8653c7..5b1f13dbf4b8c 100644 --- a/docs/source/common/loggers.rst +++ b/docs/source/common/loggers.rst @@ -202,7 +202,7 @@ The :class:`~pytorch_lightning.loggers.TestTubeLogger` is available anywhere exc Weights and Biases ================== -`Weights and Biases `_ is a third-party logger. +`Weights and Biases `_ is a third-party logger. To use :class:`~pytorch_lightning.loggers.WandbLogger` as your logger do the following. First, install the package: @@ -215,9 +215,14 @@ Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer. .. code-block:: python from pytorch_lightning.loggers import WandbLogger - wandb_logger = WandbLogger(offline=True) + + # instrument experiment with W&B + wandb_logger = WandbLogger(project='MNIST', log_model='all') trainer = Trainer(logger=wandb_logger) + # log gradients and model topology + wandb_logger.watch(model) + The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. @@ -226,8 +231,8 @@ The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except class MyModule(LightningModule): def any_lightning_module_function_or_hook(self): some_img = fake_image() - self.logger.experiment.log({ - "generated_images": [wandb.Image(some_img, caption="...")] + self.log({ + "generated_images": [wandb.Image(some_img, caption="...")] }) .. seealso:: diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 12e9c6925e7fd..cde203fdd193e 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -232,88 +232,6 @@ If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the fol ----- -Improve training speed with model toggling ------------------------------------------- -Toggling models can improve your training speed when performing gradient accumulation with multiple optimizers in a -distributed setting. - -Here is an explanation of what it does: - -* Considering the current optimizer as A and all other optimizers as B. -* Toggling means that all parameters from B exclusive to A will have their ``requires_grad`` attribute set to ``False``. -* Their original state will be restored when exiting the context manager. - -When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase. -Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. - -:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a -:meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a -:func:`contextlib.contextmanager` for advanced users. - -Here is an example for advanced use-case. - -.. 
testcode:: python - - # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. - class SimpleGAN(LightningModule): - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def training_step(self, batch, batch_idx): - # Implementation follows the PyTorch tutorial: - # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() - - X, _ = batch - X.requires_grad = True - batch_size = X.shape[0] - - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) - - # Sync and clear gradients - # at the end of accumulation or - # at the end of an epoch. - is_last_batch_to_accumulate = \ - (batch_idx + 1) % 2 == 0 or self.trainer.is_last_batch - - g_X = self.sample_G(batch_size) - - ########################## - # Optimize Discriminator # - ########################## - with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) - - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) - - errD = (errD_real + errD_fake) - - self.manual_backward(errD) - if is_last_batch_to_accumulate: - d_opt.step() - d_opt.zero_grad() - - ###################### - # Optimize Generator # - ###################### - with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) - - self.manual_backward(errG) - if is_last_batch_to_accumulate: - g_opt.step() - g_opt.zero_grad() - - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - ------ - Use closure for LBFGS-like optimizers ------------------------------------- It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and diff --git a/docs/source/common/test_set.rst b/docs/source/common/test_set.rst index 4c9e9a6061977..5703d71d956de 100644 --- a/docs/source/common/test_set.rst +++ b/docs/source/common/test_set.rst @@ -80,10 +80,10 @@ is not available at the time your model was declared. .. code-block:: python # setup your data loader - test = DataLoader(...) + test_dataloader = DataLoader(...) # test (pass in the loader) - trainer.test(test_dataloaders=test) + trainer.test(dataloaders=test_dataloader) You can either pass in a single dataloader or a list of them. This optional named parameter can be used in conjunction with any of the above use cases. Additionally, diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 86fd218e2d6b8..0983f0acb9eec 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -159,7 +159,7 @@ or after it has already been trained. .. code-block:: python - trainer.validate(val_dataloaders=val_dataloaders) + trainer.validate(dataloaders=val_dataloaders) ------------ @@ -196,6 +196,8 @@ unique seeds across all dataloader workers and processes for :mod:`torch`, :mod: ------- +.. _trainer_flags: + Trainer flags ------------- @@ -658,6 +660,8 @@ Writes logs to disk this often. See Also: - :doc:`logging <../extensions/logging>` +.. _gpus: + gpus ^^^^ @@ -1155,28 +1159,69 @@ precision | -Double precision (64), full precision (32) or half precision (16). -Can all be used on GPU or TPUs. Only double (64) and full precision (32) available on CPU. +Lightning supports either double precision (64), full precision (32), or half precision (16) training. 
-If used on TPU will use torch.bfloat16 but tensor printing
-will still show torch.float32.
+Half precision, or mixed precision, is the combined use of 32 and 16 bit floating points to reduce the memory footprint during model training. This can result in improved performance, achieving
+3X speedups on modern GPUs.
 
 .. testcode::
     :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available()
 
     # default used by the Trainer
-    trainer = Trainer(precision=32)
+    trainer = Trainer(precision=32, gpus=1)
 
     # 16-bit precision
     trainer = Trainer(precision=16, gpus=1)
 
     # 64-bit precision
-    trainer = Trainer(precision=64)
+    trainer = Trainer(precision=64, gpus=1)
+
+
+.. note:: When running on TPUs, torch.bfloat16 will be used but tensor printing will still show torch.float32.
+
+.. note:: 16-bit precision is not supported on CPUs.
+
+
+.. admonition:: When using PyTorch 1.6+, Lightning uses the native AMP implementation to support 16-bit precision. 16-bit precision with PyTorch < 1.6 is supported by the NVIDIA Apex library.
+    :class: dropdown, warning
+
+    NVIDIA Apex and DDP have instability problems. We recommend upgrading to PyTorch 1.6+ in order to use the native AMP 16-bit precision with multiple GPUs.
+
+    If you are using an earlier version of PyTorch (before 1.6), Lightning uses `Apex <https://github.com/NVIDIA/apex>`_ to support 16-bit training.
+
+    To use Apex 16-bit training:
+
+    1. Install Apex
+
+       .. code-block:: bash
+
+           # ------------------------
+           # OPTIONAL: on your cluster you might need to load CUDA 10 or 9
+           # depending on how you installed PyTorch
+
+           # see available modules
+           module avail
+
+           # load the correct CUDA before install
+           module load cuda-10.0
+           # ------------------------
+
+           # make sure you've loaded a GCC version > 4.0 and < 7.0
+           module load gcc-6.1.0
+
+           pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex
+
+    2. Set the ``precision`` trainer flag to 16. You can customize the `Apex optimization level `_ by setting the ``amp_level`` flag.
+
+       .. testcode::
+           :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available()
+
+           # turn on 16-bit
+           trainer = Trainer(amp_backend="apex", amp_level='O2', precision=16)
+
+    If you need to configure the apex init for your particular use case, or want to customize the
+    16-bit training behaviour, override :meth:`pytorch_lightning.core.LightningModule.configure_apex`.
 
-Example::
-
-    # one day
-    trainer = Trainer(precision=8|4|2)
 
 process_position
 ^^^^^^^^^^^^^^^^
 
@@ -1378,6 +1423,8 @@ track_grad_norm
 
     # track the 2-norm
     trainer = Trainer(track_grad_norm=2)
 
+.. _tpu_cores:
+
 tpu_cores
 ^^^^^^^^^
 
@@ -1527,6 +1574,24 @@ Can specify as float or int.
 
     trainer = Trainer(val_check_interval=1000)
 
+.. code-block:: python
+
+    # Here is the computation to estimate the total number of batches seen within an epoch.
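+    # (assumption: the dataset is sharded evenly across `world_size` processes
+    # and partial batches are dropped by the integer division)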
+ + # Find the total number of train batches + total_train_batches = total_train_samples // (train_batch_size * world_size) + + # Compute how many times we will call validation during the training loop + val_check_batch = max(1, int(total_train_batches * val_check_interval)) + val_checks_per_epoch = total_train_batches / val_check_batch + + # Find the total number of validation batches + total_val_batches = total_val_samples // (val_batch_size * world_size) + + # Total number of batches run + total_fit_batches = total_train_batches + total_val_batches + + weights_save_path ^^^^^^^^^^^^^^^^^ diff --git a/docs/source/conf.py b/docs/source/conf.py index 0d830d75ecf28..111cd6887fbdb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,7 +23,12 @@ PATH_HERE = os.path.abspath(os.path.dirname(__file__)) PATH_ROOT = os.path.join(PATH_HERE, '..', '..') +PATH_RAW_NB = os.path.join(PATH_ROOT, '_notebooks') +PATH_IPYNB = os.path.join(PATH_HERE, 'notebooks') sys.path.insert(0, os.path.abspath(PATH_ROOT)) +sys.path.append(os.path.join(PATH_RAW_NB, '.actions')) + +from helpers import HelperCLI # noqa: E401 E402 FOLDER_GENERATED = 'generated' SPHINX_MOCK_REQUIREMENTS = int(os.environ.get('SPHINX_MOCK_REQUIREMENTS', True)) @@ -37,21 +42,7 @@ # -- Project documents ------------------------------------------------------- -# # export the documentation -# with open('intro.rst', 'w') as fp: -# intro = pytorch_lightning.__doc__.replace(os.linesep + ' ', '') -# fp.write(m2r.convert(intro)) -# # fp.write(pytorch_lightning.__doc__) - -# # export the READme -# with open(os.path.join(PATH_ROOT, 'README.md'), 'r') as fp: -# readme = fp.read() -# # replace all paths to relative -# for ndir in (os.path.basename(p) for p in glob.glob(os.path.join(PATH_ROOT, '*')) -# if os.path.isdir(p)): -# readme = readme.replace('](%s/' % ndir, '](%s/%s/' % (PATH_ROOT, ndir)) -# with open('readme.md', 'w') as fp: -# fp.write(readme) +HelperCLI.copy_notebooks(PATH_RAW_NB, PATH_IPYNB) def _transform_changelog(path_in: str, path_out: str) -> None: @@ -111,10 +102,9 @@ def _transform_changelog(path_in: str, path_out: str) -> None: 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', 'sphinx.ext.imgmath', - 'recommonmark', 'sphinx.ext.autosectionlabel', - # 'm2r', - # 'nbsphinx', # it seems some sphinx issue + 'myst_parser', + 'nbsphinx', 'sphinx_autodoc_typehints', 'sphinx_copybutton', 'sphinx_paramlinks', @@ -132,12 +122,14 @@ def _transform_changelog(path_in: str, path_out: str) -> None: nbsphinx_allow_errors = True nbsphinx_requirejs_path = '' +# myst-parser, forcing to parse all html pages with mathjax +# https://github.com/executablebooks/MyST-Parser/issues/394 +myst_update_mathjax = False + # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -# source_suffix = ['.rst', '.md'] -# source_suffix = ['.rst', '.md', '.ipynb'] -source_suffix = { +source_parsers = { '.rst': 'restructuredtext', '.txt': 'markdown', '.md': 'markdown', @@ -159,6 +151,8 @@ def _transform_changelog(path_in: str, path_out: str) -> None: # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [ f'{FOLDER_GENERATED}/PULL_REQUEST_TEMPLATE.md', + 'notebooks/course_UvA-DL/*', + 'notebooks/template*', ] # The name of the Pygments (syntax highlighting) style to use. 
diff --git a/docs/source/ecosystem/asr_nlp_tts.rst b/docs/source/ecosystem/asr_nlp_tts.rst
index e1a94eda9e805..6b5840d32c84e 100644
--- a/docs/source/ecosystem/asr_nlp_tts.rst
+++ b/docs/source/ecosystem/asr_nlp_tts.rst
@@ -160,7 +160,7 @@ for the entire speech to text .yaml file.
     max_epochs: 5
     max_steps: null # computed at runtime if not set
     num_nodes: 1
-    distributed_backend: ddp
+    accelerator: ddp
     ...
     # configure the ASR model
     model:
@@ -598,7 +598,7 @@ Specify TTS Model Configurations with YAML File
     gpus: -1 # number of gpus
     max_epochs: 350
     num_nodes: 1
-    distributed_backend: ddp
+    accelerator: ddp
     ...
     # configure the TTS model
diff --git a/docs/source/extensions/datamodules.rst b/docs/source/extensions/datamodules.rst
index 27fdf176f5554..b710a43b2c580 100644
--- a/docs/source/extensions/datamodules.rst
+++ b/docs/source/extensions/datamodules.rst
@@ -53,7 +53,7 @@ Datamodules are for you if you ever asked the questions:
 
 What is a DataModule
 --------------------
-A DataModule is simply a collection of a train_dataloader, val_dataloader(s), test_dataloader(s) along with the
+A DataModule is simply a collection of train_dataloader(s), val_dataloader(s) and test_dataloader(s), along with the
 matching transforms and data processing/downloads steps required.
 
 Here's a simple PyTorch example:
diff --git a/docs/source/extensions/logging.rst b/docs/source/extensions/logging.rst
index 107eca2dd9d74..12760f0ee6898 100644
--- a/docs/source/extensions/logging.rst
+++ b/docs/source/extensions/logging.rst
@@ -68,6 +68,10 @@ except functions with `batch_start` in their names.
     def training_step(self, batch, batch_idx):
         self.log('my_metric', x)
 
+    # or a dict
+    def training_step(self, batch, batch_idx):
+        self.log('performance', {'acc': acc, 'recall': recall})
+
 Depending on where log is called from, Lightning auto-determines
 the correct logging mode for you. But of course you can override the
 default behavior by manually setting the
 :func:`~pytorch_lightning.core.lightning.LightningModule.log` parameters.
diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst
index 35e563715e037..436d40f660e7a 100644
--- a/docs/source/extensions/plugins.rst
+++ b/docs/source/extensions/plugins.rst
@@ -115,8 +115,6 @@ Training Type Plugins
     DDPSpawnPlugin
     DeepSpeedPlugin
     HorovodPlugin
-    RPCPlugin
-    RPCSequentialPlugin
     SingleTPUPlugin
     TPUSpawnPlugin
 
diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst
new file mode 100644
index 0000000000000..ece806558c76c
--- /dev/null
+++ b/docs/source/guides/speed.rst
@@ -0,0 +1,482 @@
+.. testsetup:: *
+
+    from pytorch_lightning.trainer.trainer import Trainer
+    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
+    from pytorch_lightning.core.lightning import LightningModule
+
+.. _speed:
+
+#######################
+Speed up model training
+#######################
+
+There are multiple ways you can speed up your model's time to convergence:
+
+* `GPU/TPU training <#gpu-tpu-training>`_
+
+* `Mixed precision (16-bit) training <#mixed-precision-16-bit-training>`_
+
+* `Control Training Epochs <#control-training-epochs>`_
+
+* `Control Validation Frequency <#control-validation-frequency>`_
+
+* `Limit Dataset Size <#limit-dataset-size>`_
+
+* `Preload Data Into RAM <#preload-data-into-ram>`_
+
+* `Model Toggling <#model-toggling>`_
+
+* `Set Grads to None <#set-grads-to-none>`_
+
+* `Things to avoid <#things-to-avoid>`_
+
+****************
+GPU/TPU training
+****************
+
+**Use when:** Whenever possible!
+
+With Lightning, running on GPUs, TPUs or multiple nodes is a simple switch of a flag.
+
+GPU training
+============
+
+Lightning supports a variety of plugins to further speed up distributed GPU training. 
Most notably:
+
+* :class:`~pytorch_lightning.plugins.training_type.DDPPlugin`
+* :class:`~pytorch_lightning.plugins.training_type.DDPShardedPlugin`
+* :class:`~pytorch_lightning.plugins.training_type.DeepSpeedPlugin`
+
+.. code-block:: python
+
+    # run on 1 gpu
+    trainer = Trainer(gpus=1)
+
+    # train on 8 gpus, using the DDP plugin
+    trainer = Trainer(gpus=8, accelerator="ddp")
+
+    # train on multiple GPUs across nodes (uses 8 gpus in total)
+    trainer = Trainer(gpus=2, num_nodes=4)
+
+
+GPU Training Speedup Tips
+-------------------------
+
+When training on single or multiple GPU machines, Lightning offers a host of advanced optimizations to improve throughput, memory efficiency, and model scaling.
+Refer to :doc:`Advanced GPU Optimized Training for more details <../advanced/advanced_gpu>`.
+
+Prefer DDP over DP
+^^^^^^^^^^^^^^^^^^
+:class:`~pytorch_lightning.plugins.training_type.DataParallelPlugin` performs three GPU transfers for EVERY batch:
+
+1. Copy the model to the device.
+2. Copy the data to the device.
+3. Copy the outputs of each device back to the master device.
+
+Whereas :class:`~pytorch_lightning.plugins.training_type.DDPPlugin` only performs one transfer to sync gradients, making DDP MUCH faster than DP.
+
+
+When using DDP, set find_unused_parameters=False
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+By default, we have set ``find_unused_parameters`` to ``True`` due to compatibility issues that have arisen in the past (see the `discussion `_ for more information).
+This default comes with a performance hit and can be disabled in most cases.
+
+.. code-block:: python
+
+    from pytorch_lightning.plugins import DDPPlugin
+
+    trainer = pl.Trainer(
+        gpus=2,
+        plugins=DDPPlugin(find_unused_parameters=False),
+    )
+
+Dataloaders
+^^^^^^^^^^^
+When building your DataLoader, set ``num_workers > 0`` and ``pin_memory=True`` (only for GPUs).
+
+.. code-block:: python
+
+    DataLoader(dataset, num_workers=8, pin_memory=True)
+
+num_workers
+"""""""""""
+
+The question of how many workers to specify in ``num_workers`` is tricky. Here's a summary of
+some references, [`1 `_], and our suggestions:
+
+1. ``num_workers=0`` means ONLY the main process will load batches (that can be a bottleneck).
+2. ``num_workers=1`` means ONLY one worker (just not the main process) will load data, but it will still be slow.
+3. The optimal ``num_workers`` depends on the batch size and your machine.
+4. A general place to start is to set ``num_workers`` equal to the number of CPU cores on that machine. You can get the number of CPU cores in Python using ``os.cpu_count()``, but note that depending on your batch size, you may overflow RAM memory.
+
+.. warning:: Increasing ``num_workers`` will ALSO increase your CPU memory consumption.
+
+The best thing to do is to increase ``num_workers`` slowly and stop once you see no more improvement in your training speed.
+
+Spawn
+"""""
+When using ``accelerator=ddp_spawn`` or training on TPUs, multiple GPUs/TPU cores are used by calling ``.spawn()`` under the hood.
+The problem is that PyTorch has issues with ``num_workers > 0`` when using ``.spawn()``. For this reason, we recommend you
+use ``accelerator=ddp`` so you can increase ``num_workers``; however, your script has to be callable like so:
+
+.. code-block:: bash
+
+    python my_program.py
+
+
+TPU training
+============
+
+You can set the ``tpu_cores`` trainer flag to 1 or 8 cores.
+
+.. code-block:: python
+
+    # train on 1 TPU core
+    trainer = Trainer(tpu_cores=1)
+
+    # train on 8 TPU cores
+    trainer = Trainer(tpu_cores=8)
+
+To train on more than 8 cores (i.e. a POD),
+submit this script using the xla_dist script.
+
+Example::
+
+    python -m torch_xla.distributed.xla_dist
+    --tpu=$TPU_POD_NAME
+    --conda-env=torch-xla-nightly
+    --env=XLA_USE_BF16=1
+    -- python your_trainer_file.py
+
+
+Read more in our :ref:`accelerators` and :ref:`plugins` guides.
+
+
+-----------
+
+.. _amp:
+
+*********************************
+Mixed precision (16-bit) training
+*********************************
+
+**Use when:**
+
+* You want to optimize for memory usage on a GPU.
+* You have a GPU that supports 16 bit precision (NVIDIA Pascal architecture or newer).
+* Your optimization algorithm (training_step) is numerically stable.
+* You want to be the cool person in the lab :p
+
+Mixed precision combines the use of both 32 and 16 bit floating points to reduce the memory footprint during model training, resulting in improved performance, achieving
+3X speedups on modern GPUs.
+
+Lightning offers mixed precision, or 16-bit, training for GPUs and TPUs.
+
+.. testcode::
+    :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available()
+
+    # 16-bit precision
+    trainer = Trainer(precision=16, gpus=4)
+
+
+----------------
+
+
+***********************
+Control Training Epochs
+***********************
+
+**Use when:** You run a hyperparameter search to find good initial parameters and want to save time, cost (money), or power (environment).
+It allows you to be more cost efficient and run more experiments at the same time.
+
+You can use Trainer flags to force training for a minimum number of epochs or limit it to a maximum number of epochs, via the ``min_epochs`` and ``max_epochs`` flags.
+
+.. testcode::
+
+    # DEFAULT
+    trainer = Trainer(min_epochs=1, max_epochs=1000)
+
+
+If running iteration-based training with an infinite or iterable dataloader, you can instead control the number of steps with the ``min_steps`` and ``max_steps`` flags:
+
+.. testcode::
+
+    trainer = Trainer(max_steps=1000)
+
+    trainer = Trainer(min_steps=100)
+
+You can also interrupt training based on training time:
+
+.. testcode::
+
+    # Stop after 12 hours of training or when reaching 10 epochs (string)
+    trainer = Trainer(max_time="00:12:00:00", max_epochs=10)
+
+    # Stop after 1 day and 5 hours (dict)
+    trainer = Trainer(max_time={"days": 1, "hours": 5})
+
+Learn more in our :ref:`trainer_flags` guide.
+
+
+----------------
+
+****************************
+Control Validation Frequency
+****************************
+
+Check validation every n epochs
+===============================
+
+**Use when:** You have a small dataset and want to run fewer validation checks.
+
+You can limit validation to run only every n epochs using the ``check_val_every_n_epoch`` Trainer flag.
+
+.. testcode::
+
+    # DEFAULT
+    trainer = Trainer(check_val_every_n_epoch=1)
+
+
+Set validation check frequency within 1 training epoch
+======================================================
+
+**Use when:** You have a large training dataset and want to run mid-epoch validation checks.
+
+For large datasets, it's often desirable to check validation multiple times within a training epoch.
+Pass in a float to check that often within 1 training epoch. Pass in an int ``k`` to check every ``k`` training batches.
+Must use an ``int`` if using an ``IterableDataset``.
+
+.. testcode::
+
+    # DEFAULT
+    trainer = Trainer(val_check_interval=1.0)
+
+    # check every .25 of an epoch
+    trainer = Trainer(val_check_interval=0.25)
+
+    # check every 100 train batches (i.e. for ``IterableDatasets`` or a fixed frequency)
+    trainer = Trainer(val_check_interval=100)
+
+Learn more in our :ref:`trainer_flags` guide.
+
+----------------
+
+******************
+Limit Dataset Size
+******************
+
+Use data subset for training, validation, and test
+==================================================
+
+**Use when:** Debugging or running huge datasets.
+
+If you don't want to check 100% of the training/validation/test set, set these flags:
+
+.. testcode::
+
+    # DEFAULT
+    trainer = Trainer(
+        limit_train_batches=1.0,
+        limit_val_batches=1.0,
+        limit_test_batches=1.0
+    )
+
+    # check 10%, 20%, 30% only, respectively for training, validation and test set
+    trainer = Trainer(
+        limit_train_batches=0.1,
+        limit_val_batches=0.2,
+        limit_test_batches=0.3
+    )
+
+If you also pass ``shuffle=True`` to the dataloader, a different random subset of your dataset will be used for each epoch; otherwise the same subset will be used for all epochs.
+
+.. note:: ``limit_train_batches``, ``limit_val_batches`` and ``limit_test_batches`` will be overwritten by ``overfit_batches`` if ``overfit_batches`` > 0. ``limit_val_batches`` will be ignored if ``fast_dev_run=True``.
+
+.. note:: If you set ``limit_val_batches=0``, validation will be disabled.
+
+Learn more in our :ref:`trainer_flags` guide.
+
+-----
+
+*********************
+Preload Data Into RAM
+*********************
+
+**Use when:** You need access to all samples in a dataset at once.
+
+When your training or preprocessing requires many operations to be performed on entire dataset(s), it can
+sometimes be beneficial to store all data in RAM, given there is enough space.
+However, loading all data at the beginning of the training script has the disadvantage that it can take a long
+time, and hence slows down the development process. Another downside is that in multiprocessing (e.g. DDP)
+the data would get copied into each process.
+One can overcome these problems by copying the data into RAM in advance.
+Most UNIX-based operating systems provide direct access to tmpfs through a mount point typically named ``/dev/shm``.
+
+0. Increase shared memory if necessary. Refer to the documentation of your OS on how to do this.
+
+1. Copy training data to shared memory:
+
+   .. code-block:: bash
+
+       cp -r /path/to/data/on/disk /dev/shm/
+
+2. Refer to the new data root in your script or command line arguments:
+
+   .. code-block:: python
+
+       datamodule = MyDataModule(data_root="/dev/shm/my_data")
+
+---------
+
+**************
+Model Toggling
+**************
+
+**Use when:** Performing gradient accumulation with multiple optimizers in a
+distributed setting.
+
+Here is an explanation of what it does:
+
+* Consider the current optimizer as A and all other optimizers as B.
+* Toggling means that all parameters from B exclusive to A will have their ``requires_grad`` attribute set to ``False``.
+* Their original state will be restored when exiting the context manager.
+
+When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase.
+Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed.
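+
+In practice this is done through the :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model`
+context manager described below. A minimal sketch of the pattern (assuming manual optimization with two optimizers;
+``compute_loss_a`` is a hypothetical helper standing in for your own loss computation):
+
+.. code-block:: python
+
+    # inside training_step, with self.automatic_optimization = False
+    opt_a, opt_b = self.optimizers()
+
+    with opt_a.toggle_model(sync_grad=False):
+        # parameters that belong only to opt_b have requires_grad=False in here,
+        # and gradient synchronization across processes is skipped
+        loss_a = self.compute_loss_a(batch)
+        self.manual_backward(loss_a)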
+
+:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a
+:meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a
+:func:`contextlib.contextmanager` for advanced users.
+
+Here is an example of an advanced use case:
+
+.. testcode::
+
+    # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus.
+    class SimpleGAN(LightningModule):
+
+        def __init__(self):
+            super().__init__()
+            self.automatic_optimization = False
+
+        def training_step(self, batch, batch_idx):
+            # Implementation follows the PyTorch tutorial:
+            # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html
+            g_opt, d_opt = self.optimizers()
+
+            X, _ = batch
+            X.requires_grad = True
+            batch_size = X.shape[0]
+
+            real_label = torch.ones((batch_size, 1), device=self.device)
+            fake_label = torch.zeros((batch_size, 1), device=self.device)
+
+            # Sync and clear gradients
+            # at the end of accumulation or
+            # at the end of an epoch.
+            is_last_batch_to_accumulate = \
+                (batch_idx + 1) % 2 == 0 or self.trainer.is_last_batch
+
+            g_X = self.sample_G(batch_size)
+
+            ##########################
+            # Optimize Discriminator #
+            ##########################
+            with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate):
+                d_x = self.D(X)
+                errD_real = self.criterion(d_x, real_label)
+
+                d_z = self.D(g_X.detach())
+                errD_fake = self.criterion(d_z, fake_label)
+
+                errD = (errD_real + errD_fake)
+
+                self.manual_backward(errD)
+                if is_last_batch_to_accumulate:
+                    d_opt.step()
+                    d_opt.zero_grad()
+
+            ######################
+            # Optimize Generator #
+            ######################
+            with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate):
+                d_z = self.D(g_X)
+                errG = self.criterion(d_z, real_label)
+
+                self.manual_backward(errG)
+                if is_last_batch_to_accumulate:
+                    g_opt.step()
+                    g_opt.zero_grad()
+
+            self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True)
+
+-----
+
+*****************
+Set Grads to None
+*****************
+
+In order to modestly improve performance, you can override :meth:`~pytorch_lightning.core.lightning.LightningModule.optimizer_zero_grad`.
+
+For a more detailed explanation of the pros / cons of this technique,
+read `this `_ documentation by the PyTorch team.
+
+.. testcode::
+
+    class Model(LightningModule):
+
+        def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx):
+            optimizer.zero_grad(set_to_none=True)
+
+
+-----
+
+***************
+Things to avoid
+***************
+
+.item(), .numpy(), .cpu()
+=========================
+Don't call ``.item()`` anywhere in your code. Use ``.detach()`` instead to remove the connected graph calls. Lightning
+takes a great deal of care to be optimized for this.
+
+----------
+
+empty_cache()
+=============
+Don't call this unnecessarily! Every time you call this, ALL your GPUs have to wait to sync.
+
+----------
+
+Transferring tensors to device
+==============================
+LightningModules know what device they are on! Construct tensors on the device directly to avoid CPU->Device transfer.
+
+.. code-block:: python
+
+    # bad
+    t = torch.rand(2, 2).cuda()
+
+    # good (self is LightningModule)
+    t = torch.rand(2, 2, device=self.device)
+
+
+For tensors that need to be model attributes, it is best practice to register them as buffers in the module's
+``__init__`` method:
+
+.. 
code-block:: python + + # bad + self.t = torch.rand(2, 2, device=self.device) + + # good + self.register_buffer("t", torch.rand(2, 2)) diff --git a/docs/source/index.rst b/docs/source/index.rst index 71ad835e02d31..c2c02b19634d6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -21,8 +21,8 @@ PyTorch Lightning Documentation :name: guides :caption: Best practices + guides/speed starter/style_guide - benchmarking/performance Lightning project template benchmarking/benchmarks @@ -57,6 +57,13 @@ PyTorch Lightning Documentation PyTorch Lightning 101 class From PyTorch to PyTorch Lightning [Blog] From PyTorch to PyTorch Lightning [Video] + notebooks/lightning_examples/mnist-hello-world.ipynb + notebooks/lightning_examples/datamodules.ipynb + notebooks/lightning_examples/cifar10-baseline.ipynb + notebooks/lightning_examples/basic-gan.ipynb + notebooks/lightning_examples/text-transformers.ipynb + notebooks/lightning_examples/reinforce-learning-DQN.ipynb + notebooks/lightning_examples/augmentation_kornia.ipynb .. toctree:: :maxdepth: 2 @@ -94,12 +101,10 @@ PyTorch Lightning Documentation clouds/cloud_training clouds/cluster - advanced/amp common/child_modules common/debugging common/loggers common/early_stopping - common/fast_training common/hyperparameters common/lightning_cli advanced/lr_finder @@ -114,6 +119,7 @@ PyTorch Lightning Documentation advanced/training_tricks advanced/pruning_quantization advanced/transfer_learning + advanced/ipu advanced/tpu common/test_set common/production_inference @@ -127,7 +133,7 @@ PyTorch Lightning Documentation .. toctree:: :maxdepth: 1 - :name: community + :name: Community :caption: Community diff --git a/docs/source/starter/new-project.rst b/docs/source/starter/new-project.rst index 74ad30102b4f8..07bf3624560a0 100644 --- a/docs/source/starter/new-project.rst +++ b/docs/source/starter/new-project.rst @@ -219,7 +219,7 @@ The :class:`~pytorch_lightning.trainer.Trainer` automates: * Tensorboard (see :doc:`loggers <../common/loggers>` options) * :doc:`Multi-GPU <../advanced/multi_gpu>` support * :doc:`TPU <../advanced/tpu>` -* :doc:`AMP <../advanced/amp>` support +* :ref:`16-bit precision AMP ` support .. tip:: If you prefer to manually manage optimizers you can use the :ref:`manual_opt` mode (ie: RL, GANs, etc...). diff --git a/notebooks/01-mnist-hello-world.ipynb b/notebooks/01-mnist-hello-world.ipynb deleted file mode 100644 index 88557fa8a80aa..0000000000000 --- a/notebooks/01-mnist-hello-world.ipynb +++ /dev/null @@ -1,448 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "i7XbLCXGkll9" - }, - "source": [ - "# Introduction to Pytorch Lightning ⚡\n", - "\n", - "In this notebook, we'll go over the basics of lightning by preparing models to train on the [MNIST Handwritten Digits dataset](https://en.wikipedia.org/wiki/MNIST_database).\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "2LODD6w9ixlT" - }, - "source": [ - "### Setup \n", - "Lightning is easy to install. 
Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zK7-Gg69kMnG" - }, - "outputs": [], - "source": [ - "! pip install pytorch-lightning --quiet" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "w4_TYnt_keJi" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import torch\n", - "from torch import nn\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader, random_split\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "EHpyMPKFkVbZ" - }, - "source": [ - "## Simplest example\n", - "\n", - "Here's the simplest most minimal example with just a training loop (no validation, no testing).\n", - "\n", - "**Keep in Mind** - A `LightningModule` *is* a PyTorch `nn.Module` - it just has a few more helpful features." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "V7ELesz1kVQo" - }, - "outputs": [], - "source": [ - "class MNISTModel(pl.LightningModule):\n", - "\n", - " def __init__(self):\n", - " super(MNISTModel, self).__init__()\n", - " self.l1 = torch.nn.Linear(28 * 28, 10)\n", - "\n", - " def forward(self, x):\n", - " return torch.relu(self.l1(x.view(x.size(0), -1)))\n", - "\n", - " def training_step(self, batch, batch_nb):\n", - " x, y = batch\n", - " loss = F.cross_entropy(self(x), y)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " return torch.optim.Adam(self.parameters(), lr=0.02)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hIrtHg-Dv8TJ" - }, - "source": [ - "By using the `Trainer` you automatically get:\n", - "1. Tensorboard logging\n", - "2. Model checkpointing\n", - "3. Training and validation loop\n", - "4. early-stopping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4Dk6Ykv8lI7X" - }, - "outputs": [], - "source": [ - "# Init our model\n", - "mnist_model = MNISTModel()\n", - "\n", - "# Init DataLoader from MNIST Dataset\n", - "train_ds = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", - "train_loader = DataLoader(train_ds, batch_size=32)\n", - "\n", - "# Initialize a trainer\n", - "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", - "\n", - "# Train the model ⚡\n", - "trainer.fit(mnist_model, train_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "KNpOoBeIjscS" - }, - "source": [ - "## A more complete MNIST Lightning Module Example\n", - "\n", - "That wasn't so hard was it?\n", - "\n", - "Now that we've got our feet wet, let's dive in a bit deeper and write a more complete `LightningModule` for MNIST...\n", - "\n", - "This time, we'll bake in all the dataset specific pieces directly in the `LightningModule`. This way, we can avoid writing extra code at the beginning of our script every time we want to run it.\n", - "\n", - "---\n", - "\n", - "### Note what the following built-in functions are doing:\n", - "\n", - "1. 
[prepare_data()](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.lightning.html#pytorch_lightning.core.lightning.LightningModule.prepare_data) 💾\n", - " - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", - " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", - "\n", - "2. [setup(stage)](https://pytorch-lightning.readthedocs.io/en/latest/common/lightning-module.html#setup) ⚙️\n", - " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). \n", - " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", - " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage` (or ignore it altogether and exclude any conditionals).\n", - " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", - "\n", - "3. [x_dataloader()](https://pytorch-lightning.readthedocs.io/en/latest/common/lightning-module.html#data-hooks) ♻️\n", - " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4DNItffri95Q" - }, - "outputs": [], - "source": [ - "class LitMNIST(pl.LightningModule):\n", - " \n", - " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " # Set our init args as class attributes\n", - " self.data_dir = data_dir\n", - " self.hidden_size = hidden_size\n", - " self.learning_rate = learning_rate\n", - "\n", - " # Hardcode some dataset specific attributes\n", - " self.num_classes = 10\n", - " self.dims = (1, 28, 28)\n", - " channels, width, height = self.dims\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # Define PyTorch model\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, self.num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - "\n", - " # Calling self.log will surface up scalars for you in TensorBoard\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def test_step(self, batch, batch_idx):\n", - " # Here we just reuse the validation_step for testing\n", - " return self.validation_step(batch, batch_idx)\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = 
torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", - " return optimizer\n", - "\n", - " ####################\n", - " # DATA RELATED HOOKS\n", - " ####################\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=32)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Mb0U5Rk2kLBy" - }, - "outputs": [], - "source": [ - "model = LitMNIST()\n", - "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "nht8AvMptY6I" - }, - "source": [ - "### Testing\n", - "\n", - "To test a model, call `trainer.test(model)`.\n", - "\n", - "Or, if you've just trained a model, you can just call `trainer.test()` and Lightning will automatically test using the best saved checkpoint (conditioned on val_loss)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "PA151FkLtprO" - }, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "T3-3lbbNtr5T" - }, - "source": [ - "### Bonus Tip\n", - "\n", - "You can keep calling `trainer.fit(model)` as many times as you'd like to continue training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "IFBwCbLet2r6" - }, - "outputs": [], - "source": [ - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "8TRyS5CCt3n9" - }, - "source": [ - "In Colab, you can use the TensorBoard magic function to view the logs that Lightning has created for you!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "wizS-QiLuAYo" - }, - "outputs": [], - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "

Congratulations - Time to Join the Community!

\n", - "\n", - "\n", - "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", - "\n", - "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", - "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", - "\n", - "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", - "\n", - "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)!\n", - "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", - "\n", - "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", - "\n", - "* Please, star [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "\n", - "### Contributions !\n", - "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", - "\n", - "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* [Bolt good first issue](https://github.com/PyTorchLightning/lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* You can also contribute your own notebooks with useful examples !\n", - "\n", - "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", - "\n", - "" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "authorship_tag": "ABX9TyOtAKVa5POQ6Xg3UcTQqXDJ", - "collapsed_sections": [], - "include_colab_link": true, - "name": "01-mnist-hello-world.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/02-datamodules.ipynb b/notebooks/02-datamodules.ipynb deleted file mode 100644 index f2bb7992ffabb..0000000000000 --- a/notebooks/02-datamodules.ipynb +++ /dev/null @@ -1,588 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "2O5r7QvP8-rt" - }, - "source": [ - "# PyTorch Lightning DataModules ⚡\n", - "\n", - "With the release of `pytorch-lightning` version 0.9.0, we have included a new class called `LightningDataModule` to help you decouple data related hooks from your `LightningModule`.\n", - "\n", - "This notebook will walk you through how to start using Datamodules.\n", - "\n", - "The most up to date documentation on datamodules can be 
found [here](https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html).\n",
-    "\n",
-    "---\n",
-    "\n",
-    "  - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n",
-    "  - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n",
-    "  - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "6RYMhmfA9ATN"
-   },
-   "source": [
-    "### Setup\n",
-    "Lightning is easy to install. Simply ```pip install pytorch-lightning```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "lj2zD-wsbvGr"
-   },
-   "outputs": [],
-   "source": [
-    "! pip install pytorch-lightning --quiet"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "8g2mbvy-9xDI"
-   },
-   "source": [
-    "# Introduction\n",
-    "\n",
-    "First, we'll go over a regular `LightningModule` implementation without the use of a `LightningDataModule`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "eg-xDlmDdAwy"
-   },
-   "outputs": [],
-   "source": [
-    "import pytorch_lightning as pl\n",
-    "from pytorch_lightning.metrics.functional import accuracy\n",
-    "import torch\n",
-    "from torch import nn\n",
-    "import torch.nn.functional as F\n",
-    "from torch.utils.data import random_split, DataLoader\n",
-    "\n",
-    "# Note - you must have torchvision installed for this example\n",
-    "from torchvision.datasets import MNIST, CIFAR10\n",
-    "from torchvision import transforms"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "DzgY7wi88UuG"
-   },
-   "source": [
-    "## Defining the LitMNIST Model\n",
-    "\n",
-    "Below, we reuse a `LightningModule` from our hello world tutorial that classifies MNIST handwritten digits.\n",
-    "\n",
-    "Unfortunately, we have hardcoded dataset-specific items within the model, forever limiting it to working with MNIST data. 😢\n",
-    "\n",
-    "This is fine if you don't plan on training/evaluating your model on different datasets. However, in many cases, this can become bothersome when you want to try out your architecture with different datasets.\n",
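-    "\n",
-    "As a sketch of what that coupling costs (hypothetical code, not used in the rest of this notebook), reusing `LitMNIST` on another dataset would mean subclassing it and overriding `__init__` plus every data hook:\n",
-    "\n",
-    "```python\n",
-    "# Hypothetical: every dataset-specific attribute and hook must be redefined,\n",
-    "# and the nn.Sequential built in __init__ would also need rebuilding for the new input size.\n",
-    "class LitCIFAR10(LitMNIST):\n",
-    "\n",
-    "    def __init__(self, **kwargs):\n",
-    "        super().__init__(**kwargs)\n",
-    "        self.dims = (3, 32, 32)\n",
-    "        self.transform = transforms.ToTensor()\n",
-    "\n",
-    "    def prepare_data(self):\n",
-    "        CIFAR10(self.data_dir, train=True, download=True)\n",
-    "        CIFAR10(self.data_dir, train=False, download=True)\n",
-    "\n",
-    "    # ...and setup(), train_dataloader(), val_dataloader(), test_dataloader() as well\n",
-    "```"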
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "IQkW8_FF5nU2"
-   },
-   "outputs": [],
-   "source": [
-    "class LitMNIST(pl.LightningModule):\n",
-    "\n",
-    "    def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n",
-    "\n",
-    "        super().__init__()\n",
-    "\n",
-    "        # We hardcode dataset specific stuff here.\n",
-    "        self.data_dir = data_dir\n",
-    "        self.num_classes = 10\n",
-    "        self.dims = (1, 28, 28)\n",
-    "        channels, width, height = self.dims\n",
-    "        self.transform = transforms.Compose([\n",
-    "            transforms.ToTensor(),\n",
-    "            transforms.Normalize((0.1307,), (0.3081,))\n",
-    "        ])\n",
-    "\n",
-    "        self.hidden_size = hidden_size\n",
-    "        self.learning_rate = learning_rate\n",
-    "\n",
-    "        # Build model\n",
-    "        self.model = nn.Sequential(\n",
-    "            nn.Flatten(),\n",
-    "            nn.Linear(channels * width * height, hidden_size),\n",
-    "            nn.ReLU(),\n",
-    "            nn.Dropout(0.1),\n",
-    "            nn.Linear(hidden_size, hidden_size),\n",
-    "            nn.ReLU(),\n",
-    "            nn.Dropout(0.1),\n",
-    "            nn.Linear(hidden_size, self.num_classes)\n",
-    "        )\n",
-    "\n",
-    "    def forward(self, x):\n",
-    "        x = self.model(x)\n",
-    "        return F.log_softmax(x, dim=1)\n",
-    "\n",
-    "    def training_step(self, batch, batch_idx):\n",
-    "        x, y = batch\n",
-    "        logits = self(x)\n",
-    "        loss = F.nll_loss(logits, y)\n",
-    "        return loss\n",
-    "\n",
-    "    def validation_step(self, batch, batch_idx):\n",
-    "        x, y = batch\n",
-    "        logits = self(x)\n",
-    "        loss = F.nll_loss(logits, y)\n",
-    "        preds = torch.argmax(logits, dim=1)\n",
-    "        acc = accuracy(preds, y)\n",
-    "        self.log('val_loss', loss, prog_bar=True)\n",
-    "        self.log('val_acc', acc, prog_bar=True)\n",
-    "        return loss\n",
-    "\n",
-    "    def configure_optimizers(self):\n",
-    "        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n",
-    "        return optimizer\n",
-    "\n",
-    "    ####################\n",
-    "    # DATA RELATED HOOKS\n",
-    "    ####################\n",
-    "\n",
-    "    def prepare_data(self):\n",
-    "        # download\n",
-    "        MNIST(self.data_dir, train=True, download=True)\n",
-    "        MNIST(self.data_dir, train=False, download=True)\n",
-    "\n",
-    "    def setup(self, stage=None):\n",
-    "\n",
-    "        # Assign train/val datasets for use in dataloaders\n",
-    "        if stage == 'fit' or stage is None:\n",
-    "            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n",
-    "            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n",
-    "\n",
-    "        # Assign test dataset for use in dataloader(s)\n",
-    "        if stage == 'test' or stage is None:\n",
-    "            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n",
-    "\n",
-    "    def train_dataloader(self):\n",
-    "        return DataLoader(self.mnist_train, batch_size=32)\n",
-    "\n",
-    "    def val_dataloader(self):\n",
-    "        return DataLoader(self.mnist_val, batch_size=32)\n",
-    "\n",
-    "    def test_dataloader(self):\n",
-    "        return DataLoader(self.mnist_test, batch_size=32)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "K7sg9KQd-QIO"
-   },
-   "source": [
-    "## Training the LitMNIST Model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "QxDNDaus6byD"
-   },
-   "outputs": [],
-   "source": [
-    "model = LitMNIST()\n",
-    "trainer = pl.Trainer(max_epochs=2, gpus=1, progress_bar_refresh_rate=20)\n",
-    "trainer.fit(model)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "dY8d6GxmB0YU"
-   },
-   "source": [
-    "# Using 
DataModules\n", - "\n", - "DataModules are a way of decoupling data-related hooks from the `LightningModule` so you can develop dataset agnostic models." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "eJeT5bW081wn" - }, - "source": [ - "## Defining The MNISTDataModule\n", - "\n", - "Let's go over each function in the class below and talk about what they're doing:\n", - "\n", - "1. ```__init__```\n", - " - Takes in a `data_dir` arg that points to where you have downloaded/wish to download the MNIST dataset.\n", - " - Defines a transform that will be applied across train, val, and test dataset splits.\n", - " - Defines default `self.dims`, which is a tuple returned from `datamodule.size()` that can help you initialize models.\n", - "\n", - "\n", - "2. ```prepare_data```\n", - " - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", - " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", - "\n", - "3. ```setup```\n", - " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). \n", - " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", - " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage`.\n", - " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", - "\n", - "\n", - "4. ```x_dataloader```\n", - " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "DfGKyGwG_X9v" - }, - "outputs": [], - "source": [ - "class MNISTDataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './'):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # self.dims is returned when you call dm.size()\n", - " # Setting default dims here because we know them.\n", - " # Could optionally be assigned dynamically in dm.setup()\n", - " self.dims = (1, 28, 28)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return 
DataLoader(self.mnist_test, batch_size=32)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "H2Yoj-9M9dS7" - }, - "source": [ - "## Defining the dataset agnostic `LitModel`\n", - "\n", - "Below, we define the same model as the `LitMNIST` model we made earlier. \n", - "\n", - "However, this time our model has the freedom to use any input data that we'd like 🔥." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "PM2IISuOBDIu" - }, - "outputs": [], - "source": [ - "class LitModel(pl.LightningModule):\n", - " \n", - " def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " # We take in input dimensions as parameters and use those to dynamically build model.\n", - " self.channels = channels\n", - " self.width = width\n", - " self.height = height\n", - " self.num_classes = num_classes\n", - " self.hidden_size = hidden_size\n", - " self.learning_rate = learning_rate\n", - "\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - "\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", - " return optimizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "G4Z5olPe-xEo" - }, - "source": [ - "## Training the `LitModel` using the `MNISTDataModule`\n", - "\n", - "Now, we initialize and train the `LitModel` using the `MNISTDataModule`'s configuration settings and dataloaders." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "kV48vP_9mEli" - }, - "outputs": [], - "source": [ - "# Init DataModule\n", - "dm = MNISTDataModule()\n", - "# Init model from datamodule's attributes\n", - "model = LitModel(*dm.size(), dm.num_classes)\n", - "# Init trainer\n", - "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, gpus=1)\n", - "# Pass the datamodule as arg to trainer.fit to override model hooks :)\n", - "trainer.fit(model, dm)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "WNxrugIGRRv5" - }, - "source": [ - "## Defining the CIFAR10 DataModule\n", - "\n", - "Lets prove the `LitModel` we made earlier is dataset agnostic by defining a new datamodule for the CIFAR10 dataset." 
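-    "\n",
-    "Once the datamodule below is defined, swapping datasets reduces to swapping datamodules. Here is a sketch of the idea (using the two datamodules from this notebook):\n",
-    "\n",
-    "```python\n",
-    "# The same LitModel trains on either dataset; only the datamodule changes.\n",
-    "for dm_cls in (MNISTDataModule, CIFAR10DataModule):\n",
-    "    dm = dm_cls()\n",
-    "    model = LitModel(*dm.size(), dm.num_classes)\n",
-    "    pl.Trainer(max_epochs=1).fit(model, dm)\n",
-    "```"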
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "1tkaYLU7RT5P" - }, - "outputs": [], - "source": [ - "class CIFAR10DataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './'):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n", - " ])\n", - "\n", - " self.dims = (3, 32, 32)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " CIFAR10(self.data_dir, train=True, download=True)\n", - " CIFAR10(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)\n", - " self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.cifar_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.cifar_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.cifar_test, batch_size=32)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "BrXxf3oX_gsZ" - }, - "source": [ - "## Training the `LitModel` using the `CIFAR10DataModule`\n", - "\n", - "Our model isn't very good, so it will perform pretty badly on the CIFAR10 dataset.\n", - "\n", - "The point here is that we can see that our `LitModel` has no problem using a different datamodule as its input data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "sd-SbWi_krdj" - }, - "outputs": [], - "source": [ - "dm = CIFAR10DataModule()\n", - "model = LitModel(*dm.size(), dm.num_classes, hidden_size=256)\n", - "trainer = pl.Trainer(max_epochs=5, progress_bar_refresh_rate=20, gpus=1)\n", - "trainer.fit(model, dm)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "

Congratulations - Time to Join the Community!

\n", - "
\n", - "\n", - "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", - "\n", - "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", - "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", - "\n", - "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", - "\n", - "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)!\n", - "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", - "\n", - "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", - "\n", - "* Please, star [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "\n", - "### Contributions !\n", - "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", - "\n", - "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* [Bolt good first issue](https://github.com/PyTorchLightning/lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* You can also contribute your own notebooks with useful examples !\n", - "\n", - "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", - "\n", - "" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "include_colab_link": true, - "name": "02-datamodules.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/03-basic-gan.ipynb b/notebooks/03-basic-gan.ipynb deleted file mode 100644 index 523702a8fcb62..0000000000000 --- a/notebooks/03-basic-gan.ipynb +++ /dev/null @@ -1,472 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "J37PBnE_x7IW" - }, - "source": [ - "# PyTorch Lightning Basic GAN Tutorial ⚡\n", - "\n", - "How to train a GAN!\n", - "\n", - "Main takeaways:\n", - "1. Generator and discriminator are arbitrary PyTorch modules.\n", - "2. 
training_step does both the generator and discriminator training.\n", - "\n", - "---\n", - "\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "kg2MKpRmybht" - }, - "source": [ - "### Setup\n", - "Lightning is easy to install. Simply `pip install pytorch-lightning`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "LfrJLKPFyhsK" - }, - "outputs": [], - "source": [ - "! pip install pytorch-lightning --quiet" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "BjEPuiVLyanw" - }, - "outputs": [], - "source": [ - "import os\n", - "from argparse import ArgumentParser\n", - "from collections import OrderedDict\n", - "\n", - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torchvision\n", - "import torchvision.transforms as transforms\n", - "from torch.utils.data import DataLoader, random_split\n", - "from torchvision.datasets import MNIST\n", - "\n", - "import pytorch_lightning as pl" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "OuXJzr4G2uHV" - }, - "source": [ - "### MNIST DataModule\n", - "\n", - "Below, we define a DataModule for the MNIST Dataset. To learn more about DataModules, check out our tutorial on them or see the [latest docs](https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html)." 
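-    "\n",
-    "One optional tweak, shown only as a sketch and not applied in the cell below: pass a seeded generator to `random_split` so the train/val partition is reproducible across runs.\n",
-    "\n",
-    "```python\n",
-    "# Hypothetical variant of the split in setup(), with a fixed seed.\n",
-    "mnist_train, mnist_val = random_split(\n",
-    "    mnist_full, [55000, 5000], generator=torch.Generator().manual_seed(42)\n",
-    ")\n",
-    "```"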
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "DOY_nHu328g7" - }, - "outputs": [], - "source": [ - "class MNISTDataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './', batch_size: int = 64, num_workers: int = 8):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.batch_size = batch_size\n", - " self.num_workers = num_workers\n", - "\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # self.dims is returned when you call dm.size()\n", - " # Setting default dims here because we know them.\n", - " # Could optionally be assigned dynamically in dm.setup()\n", - " self.dims = (1, 28, 28)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=self.num_workers)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=self.num_workers)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "tW3c0QrQyF9P" - }, - "source": [ - "### A. Generator" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0E2QDjl5yWtz" - }, - "outputs": [], - "source": [ - "class Generator(nn.Module):\n", - " def __init__(self, latent_dim, img_shape):\n", - " super().__init__()\n", - " self.img_shape = img_shape\n", - "\n", - " def block(in_feat, out_feat, normalize=True):\n", - " layers = [nn.Linear(in_feat, out_feat)]\n", - " if normalize:\n", - " layers.append(nn.BatchNorm1d(out_feat, 0.8))\n", - " layers.append(nn.LeakyReLU(0.2, inplace=True))\n", - " return layers\n", - "\n", - " self.model = nn.Sequential(\n", - " *block(latent_dim, 128, normalize=False),\n", - " *block(128, 256),\n", - " *block(256, 512),\n", - " *block(512, 1024),\n", - " nn.Linear(1024, int(np.prod(img_shape))),\n", - " nn.Tanh()\n", - " )\n", - "\n", - " def forward(self, z):\n", - " img = self.model(z)\n", - " img = img.view(img.size(0), *self.img_shape)\n", - " return img" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "uyrltsGvyaI3" - }, - "source": [ - "### B. 
Discriminator" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Ed3MR3vnyxyW" - }, - "outputs": [], - "source": [ - "class Discriminator(nn.Module):\n", - " def __init__(self, img_shape):\n", - " super().__init__()\n", - "\n", - " self.model = nn.Sequential(\n", - " nn.Linear(int(np.prod(img_shape)), 512),\n", - " nn.LeakyReLU(0.2, inplace=True),\n", - " nn.Linear(512, 256),\n", - " nn.LeakyReLU(0.2, inplace=True),\n", - " nn.Linear(256, 1),\n", - " nn.Sigmoid(),\n", - " )\n", - "\n", - " def forward(self, img):\n", - " img_flat = img.view(img.size(0), -1)\n", - " validity = self.model(img_flat)\n", - "\n", - " return validity" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "BwUMom3ryySK" - }, - "source": [ - "### C. GAN\n", - "\n", - "#### A couple of cool features to check out in this example...\n", - "\n", - " - We use `some_tensor.type_as(another_tensor)` to make sure we initialize new tensors on the right device (i.e. GPU, CPU).\n", - " - Lightning will put your dataloader data on the right device automatically\n", - " - In this example, we pull from latent dim on the fly, so we need to dynamically add tensors to the right device.\n", - " - `type_as` is the way we recommend to do this.\n", - " - This example shows how to use multiple dataloaders in your `LightningModule`." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "3vKszYf6y1Vv" - }, - "outputs": [], - "source": [ - " class GAN(pl.LightningModule):\n", - "\n", - " def __init__(\n", - " self,\n", - " channels,\n", - " width,\n", - " height,\n", - " latent_dim: int = 100,\n", - " lr: float = 0.0002,\n", - " b1: float = 0.5,\n", - " b2: float = 0.999,\n", - " batch_size: int = 64,\n", - " **kwargs\n", - " ):\n", - " super().__init__()\n", - " self.save_hyperparameters()\n", - "\n", - " # networks\n", - " data_shape = (channels, width, height)\n", - " self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=data_shape)\n", - " self.discriminator = Discriminator(img_shape=data_shape)\n", - "\n", - " self.validation_z = torch.randn(8, self.hparams.latent_dim)\n", - "\n", - " self.example_input_array = torch.zeros(2, self.hparams.latent_dim)\n", - "\n", - " def forward(self, z):\n", - " return self.generator(z)\n", - "\n", - " def adversarial_loss(self, y_hat, y):\n", - " return F.binary_cross_entropy(y_hat, y)\n", - "\n", - " def training_step(self, batch, batch_idx, optimizer_idx):\n", - " imgs, _ = batch\n", - "\n", - " # sample noise\n", - " z = torch.randn(imgs.shape[0], self.hparams.latent_dim)\n", - " z = z.type_as(imgs)\n", - "\n", - " # train generator\n", - " if optimizer_idx == 0:\n", - "\n", - " # generate images\n", - " self.generated_imgs = self(z)\n", - "\n", - " # log sampled images\n", - " sample_imgs = self.generated_imgs[:6]\n", - " grid = torchvision.utils.make_grid(sample_imgs)\n", - " self.logger.experiment.add_image('generated_images', grid, 0)\n", - "\n", - " # ground truth result (ie: all fake)\n", - " # put on GPU because we created this tensor inside training_loop\n", - " valid = torch.ones(imgs.size(0), 1)\n", - " valid = valid.type_as(imgs)\n", - "\n", - " # adversarial loss is binary cross-entropy\n", - " g_loss = self.adversarial_loss(self.discriminator(self(z)), valid)\n", - " tqdm_dict = {'g_loss': g_loss}\n", - " output = OrderedDict({\n", - " 'loss': g_loss,\n", - " 'progress_bar': tqdm_dict,\n", - 
" 'log': tqdm_dict\n", - " })\n", - " return output\n", - "\n", - " # train discriminator\n", - " if optimizer_idx == 1:\n", - " # Measure discriminator's ability to classify real from generated samples\n", - "\n", - " # how well can it label as real?\n", - " valid = torch.ones(imgs.size(0), 1)\n", - " valid = valid.type_as(imgs)\n", - "\n", - " real_loss = self.adversarial_loss(self.discriminator(imgs), valid)\n", - "\n", - " # how well can it label as fake?\n", - " fake = torch.zeros(imgs.size(0), 1)\n", - " fake = fake.type_as(imgs)\n", - "\n", - " fake_loss = self.adversarial_loss(\n", - " self.discriminator(self(z).detach()), fake)\n", - "\n", - " # discriminator loss is the average of these\n", - " d_loss = (real_loss + fake_loss) / 2\n", - " tqdm_dict = {'d_loss': d_loss}\n", - " output = OrderedDict({\n", - " 'loss': d_loss,\n", - " 'progress_bar': tqdm_dict,\n", - " 'log': tqdm_dict\n", - " })\n", - " return output\n", - "\n", - " def configure_optimizers(self):\n", - " lr = self.hparams.lr\n", - " b1 = self.hparams.b1\n", - " b2 = self.hparams.b2\n", - "\n", - " opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))\n", - " opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))\n", - " return [opt_g, opt_d], []\n", - "\n", - " def on_epoch_end(self):\n", - " z = self.validation_z.type_as(self.generator.model[0].weight)\n", - "\n", - " # log sampled images\n", - " sample_imgs = self(z)\n", - " grid = torchvision.utils.make_grid(sample_imgs)\n", - " self.logger.experiment.add_image('generated_images', grid, self.current_epoch)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Ey5FmJPnzm_E" - }, - "outputs": [], - "source": [ - "dm = MNISTDataModule()\n", - "model = GAN(*dm.size())\n", - "trainer = pl.Trainer(gpus=1, max_epochs=5, progress_bar_refresh_rate=20)\n", - "trainer.fit(model, dm)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "MlECc7cHzolp" - }, - "outputs": [], - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "

Congratulations - Time to Join the Community!

\n", - "
\n", - "\n", - "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", - "\n", - "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", - "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", - "\n", - "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", - "\n", - "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)!\n", - "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", - "\n", - "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", - "\n", - "* Please, star [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "\n", - "### Contributions !\n", - "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", - "\n", - "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* [Bolt good first issue](https://github.com/PyTorchLightning/lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* You can also contribute your own notebooks with useful examples !\n", - "\n", - "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", - "\n", - "" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "include_colab_link": true, - "name": "03-basic-gan.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb deleted file mode 100644 index fc80e9904a772..0000000000000 --- a/notebooks/04-transformers-text-classification.ipynb +++ /dev/null @@ -1,599 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "8ag5ANQPJ_j9" - }, - "source": [ - "# Finetune 🤗 Transformers Models with PyTorch Lightning ⚡\n", - "\n", - "This notebook will use HuggingFace's `datasets` library to get data, which will be wrapped in a `LightningDataModule`. Then, we write a class to perform text classification on any dataset from the[ GLUE Benchmark](https://gluebenchmark.com/). 
(We just show CoLA and MRPC due to constraint on compute/disk)\n", - "\n", - "[HuggingFace's NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola) can help you get a feel for the two datasets we will use and what tasks they are solving for.\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Ask a question on [GitHub Discussions](https://github.com/PyTorchLightning/pytorch-lightning/discussions/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)\n", - "\n", - " - [HuggingFace datasets](https://github.com/huggingface/datasets)\n", - " - [HuggingFace transformers](https://github.com/huggingface/transformers)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "fqlsVTj7McZ3" - }, - "source": [ - "### Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "OIhHrRL-MnKK" - }, - "outputs": [], - "source": [ - "!pip install pytorch-lightning datasets transformers" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "6yuQT_ZQMpCg" - }, - "outputs": [], - "source": [ - "from argparse import ArgumentParser\n", - "from datetime import datetime\n", - "from typing import Optional\n", - "\n", - "import datasets\n", - "import numpy as np\n", - "import pytorch_lightning as pl\n", - "import torch\n", - "from torch.utils.data import DataLoader\n", - "from transformers import (\n", - " AdamW,\n", - " AutoModelForSequenceClassification,\n", - " AutoConfig,\n", - " AutoTokenizer,\n", - " get_linear_schedule_with_warmup,\n", - " glue_compute_metrics\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "9ORJfiuiNZ_N" - }, - "source": [ - "## GLUE DataModule" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "jW9xQhZxMz1G" - }, - "outputs": [], - "source": [ - "class GLUEDataModule(pl.LightningDataModule):\n", - "\n", - " task_text_field_map = {\n", - " 'cola': ['sentence'],\n", - " 'sst2': ['sentence'],\n", - " 'mrpc': ['sentence1', 'sentence2'],\n", - " 'qqp': ['question1', 'question2'],\n", - " 'stsb': ['sentence1', 'sentence2'],\n", - " 'mnli': ['premise', 'hypothesis'],\n", - " 'qnli': ['question', 'sentence'],\n", - " 'rte': ['sentence1', 'sentence2'],\n", - " 'wnli': ['sentence1', 'sentence2'],\n", - " 'ax': ['premise', 'hypothesis']\n", - " }\n", - "\n", - " glue_task_num_labels = {\n", - " 'cola': 2,\n", - " 'sst2': 2,\n", - " 'mrpc': 2,\n", - " 'qqp': 2,\n", - " 'stsb': 1,\n", - " 'mnli': 3,\n", - " 'qnli': 2,\n", - " 'rte': 2,\n", - " 'wnli': 2,\n", - " 'ax': 3\n", - " }\n", - "\n", - " loader_columns = [\n", - " 'datasets_idx',\n", - " 'input_ids',\n", - " 'token_type_ids',\n", - " 'attention_mask',\n", - " 'start_positions',\n", - " 'end_positions',\n", - " 'labels'\n", - " ]\n", - "\n", - " def __init__(\n", - " self,\n", - " model_name_or_path: str,\n", - " task_name: str ='mrpc',\n", - " max_seq_length: int = 128,\n", - " train_batch_size: int = 32,\n", - " eval_batch_size: int = 32,\n", - " **kwargs\n", - " ):\n", - " super().__init__()\n", - " self.model_name_or_path = model_name_or_path\n", - " self.task_name = task_name\n", - " 
self.max_seq_length = max_seq_length\n", - " self.train_batch_size = train_batch_size\n", - " self.eval_batch_size = eval_batch_size\n", - "\n", - " self.text_fields = self.task_text_field_map[task_name]\n", - " self.num_labels = self.glue_task_num_labels[task_name]\n", - " self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", - "\n", - " def setup(self, stage):\n", - " self.dataset = datasets.load_dataset('glue', self.task_name)\n", - "\n", - " for split in self.dataset.keys():\n", - " self.dataset[split] = self.dataset[split].map(\n", - " self.convert_to_features,\n", - " batched=True,\n", - " remove_columns=['label'],\n", - " )\n", - " self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]\n", - " self.dataset[split].set_format(type=\"torch\", columns=self.columns)\n", - "\n", - " self.eval_splits = [x for x in self.dataset.keys() if 'validation' in x]\n", - "\n", - " def prepare_data(self):\n", - " datasets.load_dataset('glue', self.task_name)\n", - " AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", - " \n", - " def train_dataloader(self):\n", - " return DataLoader(self.dataset['train'], batch_size=self.train_batch_size)\n", - " \n", - " def val_dataloader(self):\n", - " if len(self.eval_splits) == 1:\n", - " return DataLoader(self.dataset['validation'], batch_size=self.eval_batch_size)\n", - " elif len(self.eval_splits) > 1:\n", - " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", - "\n", - " def test_dataloader(self):\n", - " if len(self.eval_splits) == 1:\n", - " return DataLoader(self.dataset['test'], batch_size=self.eval_batch_size)\n", - " elif len(self.eval_splits) > 1:\n", - " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", - "\n", - " def convert_to_features(self, example_batch, indices=None):\n", - "\n", - " # Either encode single sentence or sentence pairs\n", - " if len(self.text_fields) > 1:\n", - " texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))\n", - " else:\n", - " texts_or_text_pairs = example_batch[self.text_fields[0]]\n", - "\n", - " # Tokenize the text/text pairs\n", - " features = self.tokenizer.batch_encode_plus(\n", - " texts_or_text_pairs,\n", - " max_length=self.max_seq_length,\n", - " pad_to_max_length=True,\n", - " truncation=True\n", - " )\n", - "\n", - " # Rename label to labels to make it easier to pass to model forward\n", - " features['labels'] = example_batch['label']\n", - "\n", - " return features" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "jQC3a6KuOpX3" - }, - "source": [ - "#### You could use this datamodule with standalone PyTorch if you wanted..." 
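-    "\n",
-    "For instance, here is a minimal sketch of a plain PyTorch training loop over its dataloader (illustrative only; it assumes the default 'mrpc' task, and the model/optimizer here are stand-ins):\n",
-    "\n",
-    "```python\n",
-    "dm = GLUEDataModule('distilbert-base-uncased')\n",
-    "dm.prepare_data()\n",
-    "dm.setup('fit')\n",
-    "\n",
-    "model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=dm.num_labels)\n",
-    "optimizer = AdamW(model.parameters(), lr=2e-5)\n",
-    "\n",
-    "model.train()\n",
-    "for batch in dm.train_dataloader():\n",
-    "    loss = model(**batch).loss  # batches already hold input_ids/attention_mask/labels tensors\n",
-    "    loss.backward()\n",
-    "    optimizer.step()\n",
-    "    optimizer.zero_grad()\n",
-    "```"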
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "JCMH3IAsNffF"
-   },
-   "outputs": [],
-   "source": [
-    "dm = GLUEDataModule('distilbert-base-uncased')\n",
-    "dm.prepare_data()\n",
-    "dm.setup('fit')\n",
-    "next(iter(dm.train_dataloader()))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "l9fQ_67BO2Lj"
-   },
-   "source": [
-    "## GLUE Model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "gtn5YGKYO65B"
-   },
-   "outputs": [],
-   "source": [
-    "class GLUETransformer(pl.LightningModule):\n",
-    "    def __init__(\n",
-    "        self,\n",
-    "        model_name_or_path: str,\n",
-    "        num_labels: int,\n",
-    "        learning_rate: float = 2e-5,\n",
-    "        adam_epsilon: float = 1e-8,\n",
-    "        warmup_steps: int = 0,\n",
-    "        weight_decay: float = 0.0,\n",
-    "        train_batch_size: int = 32,\n",
-    "        eval_batch_size: int = 32,\n",
-    "        eval_splits: Optional[list] = None,\n",
-    "        **kwargs\n",
-    "    ):\n",
-    "        super().__init__()\n",
-    "\n",
-    "        self.save_hyperparameters()\n",
-    "\n",
-    "        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)\n",
-    "        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)\n",
-    "        self.metric = datasets.load_metric(\n",
-    "            'glue',\n",
-    "            self.hparams.task_name,\n",
-    "            experiment_id=datetime.now().strftime(\"%d-%m-%Y_%H-%M-%S\")\n",
-    "        )\n",
-    "\n",
-    "    def forward(self, **inputs):\n",
-    "        return self.model(**inputs)\n",
-    "\n",
-    "    def training_step(self, batch, batch_idx):\n",
-    "        outputs = self(**batch)\n",
-    "        loss = outputs[0]\n",
-    "        return loss\n",
-    "\n",
-    "    def validation_step(self, batch, batch_idx, dataloader_idx=0):\n",
-    "        outputs = self(**batch)\n",
-    "        val_loss, logits = outputs[:2]\n",
-    "\n",
-    "        # stsb is a regression task (num_labels == 1); every other task is classification\n",
-    "        if self.hparams.num_labels > 1:\n",
-    "            preds = torch.argmax(logits, axis=1)\n",
-    "        elif self.hparams.num_labels == 1:\n",
-    "            preds = logits.squeeze()\n",
-    "\n",
-    "        labels = batch[\"labels\"]\n",
-    "\n",
-    "        return {'loss': val_loss, \"preds\": preds, \"labels\": labels}\n",
-    "\n",
-    "    def validation_epoch_end(self, outputs):\n",
-    "        if self.hparams.task_name == 'mnli':\n",
-    "            for i, output in enumerate(outputs):\n",
-    "                # matched or mismatched\n",
-    "                split = self.hparams.eval_splits[i].split('_')[-1]\n",
-    "                preds = torch.cat([x['preds'] for x in output]).detach().cpu().numpy()\n",
-    "                labels = torch.cat([x['labels'] for x in output]).detach().cpu().numpy()\n",
-    "                loss = torch.stack([x['loss'] for x in output]).mean()\n",
-    "                self.log(f'val_loss_{split}', loss, prog_bar=True)\n",
-    "                split_metrics = {f\"{k}_{split}\": v for k, v in self.metric.compute(predictions=preds, references=labels).items()}\n",
-    "                self.log_dict(split_metrics, prog_bar=True)\n",
-    "            return loss\n",
-    "\n",
-    "        preds = torch.cat([x['preds'] for x in outputs]).detach().cpu().numpy()\n",
-    "        labels = torch.cat([x['labels'] for x in outputs]).detach().cpu().numpy()\n",
-    "        loss = torch.stack([x['loss'] for x in outputs]).mean()\n",
-    "        self.log('val_loss', loss, prog_bar=True)\n",
-    "        self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)\n",
-    "        return loss\n",
-    "\n",
-    "    def setup(self, stage):\n",
-    "        if stage == 'fit':\n",
-    "            # Get dataloader by calling it - train_dataloader() is called after setup() by default\n",
-    "            train_loader = self.train_dataloader()\n",
-    "\n",
-    "            # Calculate total steps\n",
-    "            self.total_steps = (\n",
-    "                (len(train_loader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.gpus)))\n",
-    "                // self.hparams.accumulate_grad_batches\n",
-    "                * float(self.hparams.max_epochs)\n",
-    "            )\n",
-    "\n",
-    "    def configure_optimizers(self):\n",
-    "        \"Prepare optimizer and schedule (linear warmup and decay)\"\n",
-    "        model = self.model\n",
-    "        no_decay = [\"bias\", \"LayerNorm.weight\"]\n",
-    "        optimizer_grouped_parameters = [\n",
-    "            {\n",
-    "                \"params\": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n",
-    "                \"weight_decay\": self.hparams.weight_decay,\n",
-    "            },\n",
-    "            {\n",
-    "                \"params\": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],\n",
-    "                \"weight_decay\": 0.0,\n",
-    "            },\n",
-    "        ]\n",
-    "        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)\n",
-    "\n",
-    "        scheduler = get_linear_schedule_with_warmup(\n",
-    "            optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps\n",
-    "        )\n",
-    "        scheduler = {\n",
-    "            'scheduler': scheduler,\n",
-    "            'interval': 'step',\n",
-    "            'frequency': 1\n",
-    "        }\n",
-    "        return [optimizer], [scheduler]\n",
-    "\n",
-    "    @staticmethod\n",
-    "    def add_model_specific_args(parent_parser):\n",
-    "        parser = parent_parser.add_argument_group(\"GLUETransformer\")\n",
-    "        parser.add_argument(\"--learning_rate\", default=2e-5, type=float)\n",
-    "        parser.add_argument(\"--adam_epsilon\", default=1e-8, type=float)\n",
-    "        parser.add_argument(\"--warmup_steps\", default=0, type=int)\n",
-    "        parser.add_argument(\"--weight_decay\", default=0.0, type=float)\n",
-    "        return parent_parser"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "ha-NdIP_xbd3"
-   },
-   "source": [
-    "### ⚡ Quick Tip\n",
-    "  - Combine arguments from your DataModule, Model, and Trainer into one for easy and robust configuration"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "3dEHnl3RPlAR"
-   },
-   "outputs": [],
-   "source": [
-    "def parse_args(args=None):\n",
-    "    parser = ArgumentParser()\n",
-    "    parser = pl.Trainer.add_argparse_args(parser)\n",
-    "    parser = GLUEDataModule.add_argparse_args(parser)\n",
-    "    parser = GLUETransformer.add_model_specific_args(parser)\n",
-    "    parser.add_argument('--seed', type=int, default=42)\n",
-    "    return parser.parse_args(args)\n",
-    "\n",
-    "\n",
-    "def main(args):\n",
-    "    pl.seed_everything(args.seed)\n",
-    "    dm = GLUEDataModule.from_argparse_args(args)\n",
-    "    dm.prepare_data()\n",
-    "    dm.setup('fit')\n",
-    "    model = GLUETransformer(num_labels=dm.num_labels, eval_splits=dm.eval_splits, **vars(args))\n",
-    "    trainer = pl.Trainer.from_argparse_args(args)\n",
-    "    return dm, model, trainer"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "PkuLaeec3sJ-"
-   },
-   "source": [
-    "# Training"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "QSpueK5UPsN7"
-   },
-   "source": [
-    "## CoLA\n",
-    "\n",
-    "See an interactive view of the CoLA dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "NJnFmtpnPu0Y"
-   },
-   "outputs": [],
-   "source": [
-    "mocked_args = \"\"\"\n",
-    "    --model_name_or_path 
albert-base-v2\n", - " --task_name cola\n", - " --max_epochs 3\n", - " --gpus 1\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "_MrNsTnqdz4z" - }, - "source": [ - "## MRPC\n", - "\n", - "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mrpc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "LBwRxg9Cb3d-" - }, - "outputs": [], - "source": [ - "mocked_args = \"\"\"\n", - " --model_name_or_path distilbert-base-cased\n", - " --task_name mrpc\n", - " --max_epochs 3\n", - " --gpus 1\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "iZhbn0HzfdCu" - }, - "source": [ - "## MNLI\n", - "\n", - " - The MNLI dataset is huge, so we aren't going to bother trying to train it here.\n", - "\n", - " - Let's just make sure our multi-dataloader logic is right by skipping over training and going straight to validation.\n", - "\n", - "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mnli)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "AvsZMOggfcWW" - }, - "outputs": [], - "source": [ - "mocked_args = \"\"\"\n", - " --model_name_or_path distilbert-base-uncased\n", - " --task_name mnli\n", - " --max_epochs 1\n", - " --gpus 1\n", - " --limit_train_batches 10\n", - " --progress_bar_refresh_rate 20\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "

Congratulations - Time to Join the Community!

\n", - "
\n", - "\n", - "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", - "\n", - "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", - "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", - "\n", - "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", - "\n", - "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)!\n", - "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", - "\n", - "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", - "\n", - "* Please, star [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "\n", - "### Contributions !\n", - "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", - "\n", - "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* [Bolt good first issue](https://github.com/PyTorchLightning/lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* You can also contribute your own notebooks with useful examples !\n", - "\n", - "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", - "\n", - "" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "04-transformers-text-classification.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb deleted file mode 100644 index d6996a925c228..0000000000000 --- a/notebooks/05-trainer-flags-overview.ipynb +++ /dev/null @@ -1,2926 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "goRmGIRI5cfC" - }, - "source": [ - "# Introduction to Lightning Flags ⚡🚩\n", - "\n", - "In this notebook, we'll go over the flags available in the `Trainer` object. Note that not everything will work in the Colab environment (multi-gpu, etc). 
This notebook accompanies the Trainer videos we'll be putting out.\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jKj5lgdr5j48" - }, - "source": [ - "--- \n", - "### Setup \n", - "First thing first, we need to install Lightning. Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UGjilEHk4vb7" - }, - "outputs": [], - "source": [ - "! pip install pytorch-lightning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zaVUShmQ5n8Y" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from argparse import ArgumentParser\n", - "import torch\n", - "from torch import nn\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader\n", - "from torch.utils.data import random_split\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy\n", - "\n", - "from torchvision.datasets.mnist import MNIST\n", - "from torchvision import transforms" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6tgkS8IYZwY_" - }, - "outputs": [], - "source": [ - "# ------------\n", - "# data\n", - "# ------------\n", - "pl.seed_everything(1234)\n", - "batch_size = 32\n", - "\n", - "# Init DataLoader from MNIST Dataset\n", - "\n", - "dataset = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", - "mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())\n", - "mnist_train, mnist_val = random_split(dataset, [55000, 5000])\n", - "\n", - "train_loader = DataLoader(mnist_train, batch_size=batch_size)\n", - "val_loader = DataLoader(mnist_val, batch_size=batch_size)\n", - "test_loader = DataLoader(mnist_test, batch_size=batch_size)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gEulmrbxwaYL" - }, - "source": [ - "### Simple AutoEncoder Model\n", - "\n", - "Were gonna define a simple Lightning model so we can play with all the settings of the Lightning Trainer.\n", - "\n", - "LightningModule is simply pure Pytorch reorganized into hooks, that represents all the steps in the training process.\n", - "\n", - "You can use LightningModule hooks to control every part of your model, but for the purpose of this video we will use a very simple MNIST classifier, a model that takes 28*28 grayscale images of hand written images, and can predict the digit between 0-9.\n", - "\n", - "The LightningModule can encompass a single model, like an image classifier, or a deep learning system composed of multiple models, like this auto encoder that contains an encoder and a decoder.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x-34xKCI40yW" - }, - "outputs": [], - "source": [ - "class LitAutoEncoder(pl.LightningModule):\n", - "\n", - " def __init__(self, batch_size=32, lr=1e-3):\n", - " super().__init__()\n", - " self.encoder = nn.Sequential(\n", - " nn.Linear(28 * 28, 64),\n", - " nn.ReLU(),\n", - " nn.Linear(64, 3)\n", - " )\n", - " 
self.decoder = nn.Sequential(\n", - " nn.Linear(3, 64),\n", - " nn.ReLU(),\n", - " nn.Linear(64, 28 * 28)\n", - " )\n", - " self.batch_size=batch_size\n", - " self.learning_rate=lr\n", - "\n", - " def forward(self, x):\n", - " # in lightning, forward defines the prediction/inference actions\n", - " embedding = self.encoder(x)\n", - " return embedding\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('train_loss', loss)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('val_loss', loss)\n", - " \n", - " def test_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('test_loss', loss)\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)\n", - " return optimizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VbxcRCrxiYly" - }, - "source": [ - "You'll notice the LightningModule doesn't have epoch and batch loops, we're not calling model.train() and model.eval(), and no mentions of CUDA or hardware. That's because it is all automated by the Lightning Trainer. All the engineering boilerplate is automated by the trainer: \n", - "\n", - "* Training loops\n", - "* Evaluation and test loops\n", - "* Calling model.train(), model.eval(), no_grad at the right time\n", - "* CUDA or to_device calls\n", - "\n", - "It also allows you to train your models on different hardware like GPUs and TPUs without changing your code!\n", - "\n", - "\n", - "### To use the lightning trainer simply:\n", - "\n", - "1. init your LightningModule and datasets\n", - "\n", - "2. init lightning trainer\n", - "\n", - "3. call trainer.fit\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HOk9c4_35FKg" - }, - "outputs": [], - "source": [ - "#####################\n", - "# 1. Init Model\n", - "#####################\n", - "\n", - "model = LitAutoEncoder()\n", - "\n", - "#####################\n", - "# 2. Init Trainer\n", - "#####################\n", - "\n", - "# these 2 flags are explained in the later sections...but for short explanation:\n", - "# - progress_bar_refresh_rate: limits refresh rate of tqdm progress bar so Colab doesn't freak out\n", - "# - max_epochs: only run 2 epochs instead of default of 1000\n", - "trainer = pl.Trainer(progress_bar_refresh_rate=20, max_epochs=2)\n", - "\n", - "#####################\n", - "# 3. Train\n", - "#####################\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3meDako-Qa_6" - }, - "source": [ - "Our model is training just like that, using the Lightning defaults. 
The beauty of Lightning is that everything is easily configurable.\n", - "In our next videos were going to show you all the ways you can control your Trainer to do things like controlling your training, validation and test loops, running on GPUs and TPUs, checkpointing, early stopping, and a lot more.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z_Wry2MckQkI" - }, - "source": [ - "# Training loop and eval loop Flags" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0MkI1xB2vsLj" - }, - "source": [ - "\n", - "To really scale up your networks, you can use accelerators like GPUs. GPUs or Graphical Processing Units, parallelize matrix multiplications which enable speed ups of at least 100x over training on CPUs.\n", - "\n", - "Let's say you have a machine with 8 GPUs on it. You can set this flag to 1, 4, or 8 GPUs and lightning will automatically distribute your training for you.\n", - "\n", - "```\n", - "trainer = pl.Trainer(gpus=1)\n", - "```\n", - "\n", - "---------\n", - "\n", - "Lightning makes your code hardware agnostic... This means, you can switch between CPUs, GPUs without code changes.\n", - "\n", - "However, it requires forming good PyTorch habits:\n", - "\n", - "1. First, remove the .cuda() or .to() calls in your code.\n", - "2. Second, when you initialize a new tensor, set the device=self.device in the call since every lightningModule knows what gpu index or TPU core it is on.\n", - "\n", - "You can also use type_as and or you can register the tensor as a buffer in your module’s __init__ method with register_buffer().\n", - "\n", - "```\n", - "# before lightning\n", - "def forward(self, x):\n", - " z = torch.Tensor(2, 3)\n", - " z = z.cuda(0)\n", - "\n", - "# with lightning\n", - "def forward(self, x):\n", - " z = torch.Tensor(2, 3)\n", - " z = z.type_as(x, device=self.device)\n", - "```\n", - "\n", - "\n", - "```\n", - "class LitModel(LightningModule):\n", - "\n", - " def __init__(self):\n", - " ...\n", - " self.register_buffer(\"sigma\", torch.eye(3))\n", - " # you can now access self.sigma anywhere in your module\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hw6jJhhjvlSL" - }, - "source": [ - "Lightning Trainer automates all the engineering boilerplate like iterating over epochs and batches, training eval and test loops, CUDA and to(device) calls, calling model.train and model.eval.\n", - "\n", - "You still have full control over the loops, by using the following trainer flags:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pT5-ETH9eUg6" - }, - "source": [ - "## Calling validation steps\n", - "Sometimes, training an epoch may be pretty fast, like minutes per epoch. In this case, you might not need to validate on every epoch. 
Instead, you can actually validate after a few epochs.\n", - "\n", - "Use `check_val_every_n_epoch` flag to control the frequency of validation step:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z-EMVvKheu3D" - }, - "outputs": [], - "source": [ - "# run val loop every 10 training epochs\n", - "trainer = pl.Trainer(check_val_every_n_epoch=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UOzZr9S2UcSO" - }, - "source": [ - "## val_check_interval\n", - "\n", - "In some cases where your epoch is very long, you might want to check validation within an epoch.\n", - "\n", - "You can also run validation step within your training epochs, by setting `val_check_interval` flag.\n", - "\n", - "Set `val_check_interval` to a float between [0.0 to 1.0] to check your validation set within a training epoch. For example, setting it to 0.25 will check your validation set 4 times during a training epoch.\n", - "\n", - "Default is set to 1.0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9kbUbvrUVLrT" - }, - "outputs": [], - "source": [ - "# check validation set 4 times during a training epoch\n", - "trainer = pl.Trainer(val_check_interval=0.25)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Onm1gBsKVaw4" - }, - "source": [ - "When you have iterable data sets, or when streaming data for production use cases, it is useful to check the validation set every number of steps. \n", - "Set val_check_interval to an int:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "psn6DVb5Vi85" - }, - "outputs": [], - "source": [ - "# check validation set every 1000 training batches\n", - "# use this when using iterableDataset and your dataset has no length\n", - "# (ie: production cases with streaming data)\n", - "trainer = pl.Trainer(val_check_interval=1000)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QkoYonrWkb7-" - }, - "source": [ - "## num_sanity_val_steps \n", - "\n", - "You may have run into an issue, where you have a bug in your validation loop, but won't catch it until your training loop ends.\n", - "\n", - "and if your training loop takes hours or days, you will waste valuable compute.\n", - "\n", - "Instead, lightning automatically runs through 2 steps of validation in the beginning to catch these kinds of bugs up front.\n", - "\n", - "\n", - "The `num_sanity_val_steps` flag can help you run n batches of validation before starting the training routine.\n", - "\n", - "You can set it to 0 to turn it off" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOcT-ugSkiKW" - }, - "outputs": [], - "source": [ - "# turn it off\n", - "trainer = pl.Trainer(num_sanity_val_steps=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zS0ob1ZmTw56" - }, - "source": [ - "Set it to -1 to check all validation data before training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rzqvjA4UT263" - }, - "outputs": [], - "source": [ - "# check all validation data\n", - "trainer = pl.Trainer(num_sanity_val_steps=-1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - 
"id": "uMB41wq4T3Z2" - }, - "source": [ - "Or use any arbitrary number of validation steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGP78aQzT7VS" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(num_sanity_val_steps=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H-xaYRtd1rb-" - }, - "source": [ - "## Limit train, validation, and test batches\n", - "\n", - "You can set limits on how much of training, validation and test dataset you want your model to check. This is useful if you have really large validation or tests sets, for debugging or testing something that happens at the end of an epoch.\n", - "\n", - "Set the flag to int to specify the number of batches to run\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XiK5cFKL1rcA" - }, - "outputs": [], - "source": [ - "# run for only 10 batches\n", - "trainer = pl.Trainer(limit_test_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y4LK0g65RrBm" - }, - "source": [ - "For example, some metrics need to be computed on the entire validation results, such as AUC ROC. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8MmeRs2DR3dD" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(limit_val_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xmigcNa1A2Vy" - }, - "source": [ - "You can use a float to limit the batches be percentage of the set on every epoch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "W7uGJt8nA4tv" - }, - "outputs": [], - "source": [ - "# run through only 25% of the test set each epoch\n", - "trainer = pl.Trainer(limit_test_batches=0.25)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YRI8THtUN7_e" - }, - "source": [ - "# Training on GPUs\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R8FFkX_FwlfE" - }, - "source": [ - "To run on 1 GPU set the flag to 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nnzkf3KaOE27" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cxBg47s5PB1P" - }, - "source": [ - "to run on 2 or 4 GPUs, set the flag to 2 or 4." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cSEM4ihLrohT" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZE6ZgwtNudro" - }, - "source": [ - "You can also select which GPU devices to run on, using a list of indices like [1, 4] \n", - "\n", - "or a string containing a comma separated list of GPU ids like '1,2'\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gQkJtq0urrjq" - }, - "outputs": [], - "source": [ - "# list: train on GPUs 1, 4 (by bus ordering)\n", - "# trainer = Trainer(gpus='1, 4') # equivalent\n", - "trainer = pl.Trainer(gpus=[1, 4])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XghDPad4us74" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=list(range(4)))\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6FVkKHpSPMTW" - }, - "source": [ - "You can use all the GPUs you have available by setting `gpus=-1`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "r6cKQijYrtPe" - }, - "outputs": [], - "source": [ - "# trainer = Trainer(gpus='-1') - equivalent\n", - "trainer = pl.Trainer(gpus=-1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2C-fNLm3UGCV" - }, - "source": [ - "Lightning uses the PCI bus_id as the index for ordering GPUs." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_V75s7EhOFhE" - }, - "source": [ - "### `auto_select_gpus`\n", - "\n", - "You can save on GPUs by running in “exclusive mode”, meaning only one process at a time can access them. If your not sure which GPUs you should use when running exclusive mode, Lightning can automatically find unoccupied GPUs for you. \n", - "\n", - "Simply specify the number of gpus as an integer `gpus=k`, and set the trainer flag `auto_select_gpus=True`. Lightning will automatically help you find k gpus that are not occupied by other processes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_Sd3XFsAOIwd" - }, - "outputs": [], - "source": [ - "# enable auto selection (will find two available gpus on system)\n", - "trainer = pl.Trainer(gpus=2, auto_select_gpus=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a5JGSBMQhJNp" - }, - "source": [ - "## analyzing GPU usage\n", - "\n", - "### log_gpu_memory\n", - "\n", - "This is useful to analyze the memory usage of your GPUs.\n", - "\n", - "To get the GPU memory usage for every GPU on the master node, set the flag to log_gpu_memory=all.\n", - "\n", - "Under the hood, lightning uses the nvidia-smi command which may slow your training down.\n", - "\n", - "Your logs can become overwhelmed if you log the usage from many GPUs at once. In this case, you can also set the flag to min_max which will log only the min and max usage across all the GPUs of the master node.\n", - "\n", - "Note that lightning is not logging the usage across all nodes for performance reasons." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "idus3ZGahOki" - }, - "outputs": [], - "source": [ - "# log all the GPUs (on master node only)\n", - "trainer = Trainer(log_gpu_memory='all')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-mevgiy_hkip" - }, - "source": [ - "To avoid the performance decrease you can also set `log_gpu_memory=min_max` to only log the min and max memory on the master node.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SlvLJnWyhs7J" - }, - "outputs": [], - "source": [ - "# log only the min and max memory on the master node\n", - "trainer = Trainer(log_gpu_memory='min_max')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K82FLLIJVQG3" - }, - "source": [ - "\n", - "But what if you want to train on multiple machines and not just one?" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YViQ6PXesAue" - }, - "source": [ - "# Training on multiple GPUs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WacbBQUivxQq" - }, - "source": [ - "Lightning makes your models hardware agnostic, and you can run on GPUs with a flip of a flag. Lightning also supports training on multiple GPUs across many machines.\n", - "\n", - "You can do this by setting the num_nodes flag.\n", - "\n", - "The world size, or the total number of GPUs you are using, will be gpus*num_nodes.\n", - "\n", - "If i set gpus=8 and num_nodes=32 then I will be training on 256 GPUs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5iKckmDvr8zZ" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=8, num_nodes=32)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GgcSbDjjlSTh" - }, - "source": [ - "## Accelerators\n", - "\n", - "Under the hood, Lightning uses distributed data parallel (or DDP) by default to distribute training across GPUs.\n", - "\n", - "This Lightning implementation of DDP calls your script under the hood multiple times with the correct environment variables.\n", - "\n", - "Under the hood it's as if you had called your script like this:\n", - "\n", - "1. Each GPU across each node gets its own process.\n", - "2. Each GPU gets visibility into a subset of the overall dataset. It will only ever see that subset.\n", - "3. Each process inits the model. (Make sure to set the random seed so that each model initializes with the same weights.)\n", - "4. Each process performs a full forward and backward pass in parallel.\n", - "5. The gradients are synced and averaged across all processes.\n", - "6. 
Each process updates its optimizer.\n", - "If you request multiple GPUs or nodes without setting a mode, DDP will be automatically used.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "n_Brr7F5wdtj" - }, - "outputs": [], - "source": [ - "# ddp = DistributedDataParallel\n", - "# trainer = pl.Trainer(gpus=2, num_nodes=2) equivalent\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, accelerator='ddp')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "edxHyttC5J3e" - }, - "source": [ - "DDP is the fastest and recommended way to distribute your training, but you can pass in other backends to `accelerator` trainer flag, when DDP is not supported.\n", - "\n", - "DDP isn't available in\n", - "* Jupyter Notebook, Google COLAB, Kaggle, etc.\n", - "* If You have a nested script without a root package\n", - "* or if Your script needs to invoke .fit or .test multiple times" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZDh96mavxHxf" - }, - "source": [ - "### DDP_SPAWN\n", - "\n", - "In these cases, you can use `ddp_spawn` instead. `ddp_spawn` is exactly like DDP except that it uses `.spawn()` to start the training processes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JM5TKtgLxo37" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=2, num_nodes=2, accelerator='ddp_spawn')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sebhVE3qrhKK" - }, - "source": [ - "We STRONGLY discourage this use because it has limitations (due to Python and PyTorch):\n", - "\n", - "* Since .spawn() trains the model in subprocesses, the model on the main process does not get updated.\n", - "\n", - "* Dataloader(num_workers=N), where N is large, bottlenecks training with DDP… ie: it will be VERY slow or won’t work at all. This is a PyTorch limitation.\n", - "\n", - "* Forces everything to be picklable.\n", - "\n", - "DDP is MUCH faster than DDP_spawn. To be able to use DDP we recommend you: \n", - "\n", - "1. Install a top-level module for your project using setup.py\n", - "\n", - "```\n", - "# setup.py\n", - "#!/usr/bin/env python\n", - "\n", - "from setuptools import setup, find_packages\n", - "\n", - "setup(name='src',\n", - " version='0.0.1',\n", - " description='Describe Your Cool Project',\n", - " author='',\n", - " author_email='',\n", - " url='https://github.com/YourSeed', # REPLACE WITH YOUR OWN GITHUB PROJECT LINK\n", - " install_requires=[\n", - " 'pytorch-lightning'\n", - " ],\n", - " packages=find_packages()\n", - " )\n", - "\n", - "```\n", - "\n", - "2. Setup your project like so:\n", - "\n", - "```\n", - "/project\n", - " /src\n", - " some_file.py\n", - " /or_a_folder\n", - " setup.py\n", - "```\n", - "3. Install as a root-level package\n", - "```\n", - "cd /project\n", - "pip install -e .\n", - "```\n", - "4. You can then call your scripts anywhere\n", - "```\n", - "cd /project/src\n", - "\n", - "python some_file.py --accelerator 'ddp' --gpus 8\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cmB3I_oyw7a8" - }, - "source": [ - "### DP\n", - "\n", - "If you're using windows, DDP is not supported. You can use `dp` for DataParallel instead: DataParallel uses multithreading, instead of multiprocessing. It splits a batch across k GPUs. 
That is, if you have a batch of 32 and use DP with 2 gpus, each GPU will process 16 samples, after which the root node will aggregate the results.\n", - "\n", - "DP use is discouraged by PyTorch and Lightning. Use DDP which is more stable and at least 3x faster.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "OO-J0ISvlVCg" - }, - "outputs": [], - "source": [ - "# dp = DataParallel\n", - "trainer = pl.Trainer(gpus=2, accelerator='dp')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y7E2eHZKwUn9" - }, - "source": [ - "### DDP2\n", - "\n", - "In certain cases, it’s advantageous to use ***all*** batches on the same machine, instead of a subset. For instance, in self-supervised learning, a common performance boost comes from increasing the number of negative samples.\n", - "\n", - "In this case, we can use DDP2 which behaves like DP in a machine and DDP across nodes. DDP2 does the following:\n", - "\n", - "* Copies a subset of the data to each node.\n", - "* Inits a model on each node.\n", - "* Runs a forward and backward pass using DP.\n", - "* Syncs gradients across nodes.\n", - "* Applies the optimizer updates.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Y4xweqL3xHER" - }, - "outputs": [], - "source": [ - "# ddp2 = DistributedDataParallel + dp\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, accelerator='ddp2')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lhKNCnveeeq5" - }, - "source": [ - "- The second mode is ddp_spawn. This works like ddp, but instead of calling your script multiple times, lightning will use multiprocessing spawn to start a subprocess per GPU. \n", - "\n", - "However, you should be careful of mixing this mode with num_workers > 0 in your dataloaders because it will bottleneck your training. This is a current known limitation of PyTorch which is why we recommend using our ddp implementation instead.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HUf9ANyQkFFO" - }, - "source": [ - "\n", - "### mocking ddp\n", - "\n", - "Testing or debugging DDP can be hard, so we have a distributed backend that simulates ddp on cpus to make it easier. Set `num_processes` to a number greater than 1 when using accelerator=\"ddp_cpu\" to mimic distributed training on a machine without GPUs. Note that while this is useful for debugging, it will not provide any speedup, since single-process Torch already makes efficient use of multiple CPUs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZSal5Da9kHOf" - }, - "outputs": [], - "source": [ - "# Simulate DDP for debugging on your GPU-less laptop\n", - "trainer = Trainer(accelerator=\"ddp_cpu\", num_processes=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Br_btCy5lgES" - }, - "source": [ - "# Training on TPUS\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DXkBNITdv44d" - }, - "source": [ - "Another option for accelerating your training is using TPUs.\n", - "A TPU is a Tensor processing unit, designed specifically for deep learning. Each TPU has 8 cores where each core is optimized for 128x128 matrix multiplies. Google estimates that 8 TPU cores are about as fast as 4 V100 GPUs!\n", - "\n", - "A TPU pod hosts many TPUs on it. 
Currently, TPU pod v2 has 2048 cores! You can request a full pod from Google cloud or a “slice” which gives you some subset of those 2048 cores.\n", - "\n", - "At this moment, TPUs are available on Google Cloud (GCP), Google Colab and Kaggle Environments.\n", - "\n", - "Lightning supports training on TPUs without any code adjustments to your model. Just like when using GPUs, Lightning automatically inserts the correct samplers - no need to do this yourself!\n", - "\n", - "Under the hood, lightning uses the XLA framework developed jointly by the facebook and google XLA teams. And we want to recognize their efforts in advancing TPU adoption of PyTorch.\n", - "\n", - "## tpu_cores\n", - "To train on TPUs, set the tpu_cores flag.\n", - "\n", - "When using colab or kaggle, the allowed values are 1 or 8 cores. When using google cloud, any value above 8 is allowed.\n", - "\n", - "Your effective batch size is the batch size passed into a dataloader times the total number of tpu cores." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "itP9y70gmD9M" - }, - "outputs": [], - "source": [ - "# int: train on a single core\n", - "trainer = pl.Trainer(tpu_cores=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NJKnzPb3mKEg" - }, - "outputs": [], - "source": [ - "# int: train on all cores few cores\n", - "trainer = pl.Trainer(tpu_cores=8)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8a4exfWUmOHq" - }, - "source": [ - "You can also choose which TPU core to train on, by passing a list [1-8]. This is not an officially supported use case but we are working with the XLA team to improve this user experience.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "S6OrjE_bmT-_" - }, - "outputs": [], - "source": [ - "# list: train on a single selected core\n", - "trainer = pl.Trainer(tpu_cores=[2])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Afqx3sFUmfWD" - }, - "source": [ - "To train on more than 8 cores (ie: a POD), submit this script using the xla_dist script.\n", - "\n", - "\n", - "\n", - "```\n", - "python -m torch_xla.distributed.xla_dist\n", - "--tpu=$TPU_POD_NAME\n", - "--conda-env=torch-xla-nightly\n", - "--env=XLA_USE_BF16=1\n", - "-- python your_trainer_file.py\n", - "```\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ncPvbUVQqKOh" - }, - "source": [ - "# Advanced distributed training\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4MP7bEgnv7qK" - }, - "source": [ - "\n", - "Lightning supports distributed training across multiple GPUs and TPUs out of the box by setting trainer flags, but it also allows you to control the way sampling is done if you need to." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wdHiTfAMepKH" - }, - "source": [ - "## replace_sampler_ddp\n", - "In PyTorch, you must use torch.nn.DistributedSampler for multi-node or GPU training. 
The sampler makes sure each GPU sees the appropriate part of your data.\n", - "\n", - "```\n", - "# without lightning\n", - "def train_dataloader(self):\n", - " dataset = MNIST(...)\n", - " sampler = None\n", - "\n", - " if self.on_tpu:\n", - " sampler = DistributedSampler(dataset)\n", - "\n", - " return DataLoader(dataset, sampler=sampler)\n", - "```\n", - "Lightning adds the correct samplers when needed, so no need to explicitly add samplers. By default it will add `shuffle=True` for train sampler and `shuffle=False` for val/test sampler.\n", - "\n", - "If you want to customize this behaviour, you can set `replace_sampler_ddp=False` and add your own distributed sampler.\n", - "\n", - "(note: For iterable datasets, we don’t do this automatically.)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZfmcB_e_7HbE" - }, - "outputs": [], - "source": [ - "sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False)\n", - "dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)\n", - "\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, replace_sampler_ddp=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-IOhk1n0lL3_" - }, - "source": [ - "## prepare_data_per_node\n", - "\n", - "When doing multi NODE training, if your nodes share the same file system, then you don't want to download data more than once to avoid possible collisions. \n", - "\n", - "Lightning automatically calls the prepare_data hook on the root GPU of the master node (ie: only a single GPU).\n", - "\n", - "In some cases where your nodes don't share the same file system, you need to download the data on each node. In this case you can set this flag to true and lightning will download the data on the root GPU of each node.\n", - "\n", - "This flag is defaulted to True." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WFBMUR48lM04" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=2, num_nodes=2, prepare_data_per_node=False)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FKBwXqo4q-Vp" - }, - "source": [ - "## sync_batchnorm\n", - "\n", - "Batch norm is computed per GPU/TPU. This flag enables synchronization between batchnorm layers across all GPUs.\n", - "It is recommended if you have small batch sizes.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GhaCLTEZrAQi" - }, - "outputs": [], - "source": [ - "trainer = Trainer(gpus=4, sync_batchnorm=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XuFA7VTFMY9-" - }, - "source": [ - "# Debugging flags\n", - "\n", - "Lightning offers a couple of flags to make debugging your models easier:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AKoS3fdml4Jx" - }, - "source": [ - "## Fast Dev Run\n", - "\n", - "To help you save time debugging, your first run should use the fast_dev_run flag.\n", - "\n", - "This won't generate logs or save checkpoints but will touch every line of your code to make sure that it is working as intended.\n", - "\n", - "Think about this flag like a compiler. 
You make changes to your code, and run Trainer with this flag to verify that your changes are bug free.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "L5vuG7GSmhzK" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(fast_dev_run=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HRP1qQR5nT4p" - }, - "source": [ - "## overfit_batches\n", - "\n", - "Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. If the training dataloaders have shuffle=True, Lightning will automatically disable it.\n", - "\n", - "Useful for quickly debugging or trying to overfit on purpose." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NTM-dqGMnXms" - }, - "outputs": [], - "source": [ - "# use only 1% of the train set (and use the train set for val and test)\n", - "trainer = pl.Trainer(overfit_batches=0.01)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c0LV0gC3nl1X" - }, - "outputs": [], - "source": [ - "# overfit on 10 of the same batches\n", - "trainer = pl.Trainer(overfit_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lt3UHU6WgtS_" - }, - "source": [ - "Or a float to represent percentage of data to run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "K3yUqADhgnkf" - }, - "outputs": [], - "source": [ - "# run through only 25% of the test set each epoch\n", - "trainer = pl.Trainer(limit_test_batches=0.25)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ODN66NeVg_2o" - }, - "source": [ - "In the case of multiple test dataloaders, the limit applies to each dataloader individually.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8aQx5SLeMz1R" - }, - "source": [ - "# accumulate_grad_batches\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g8GczZXFwKC7" - }, - "source": [ - "The batch size controls the accuracy of the estimate of the gradients. Small batch size use less memory, but decrease accuracy. When training large models, such as NLP transformers, it is useful to accumulate gradients before calling backwards(). It allows for bigger batch sizes than what can actually fit on a GPU/TPU in a single step.\n", - "\n", - "Use accumulate_grad_batches to accumulate gradients every k batches or as set up in the dict. Trainer also calls optimizer.step() for the last indivisible step number.\n", - "\n", - "For example, set accumulate_grad_batches to 4 to accumulate every 4 batches. In this case the effective batch size is batch_size*4, so if your batch size is 32, effectively it will be 128." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2jB6-Z_yPhhf" - }, - "outputs": [], - "source": [ - "# accumulate every 4 batches (effective batch size is batch*4)\n", - "trainer = pl.Trainer(accumulate_grad_batches=4)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_Yi-bdTOgINC" - }, - "source": [ - "You can also pass a dictionary to specify different accumulation per epoch. 
We can set it to `{5: 3, 10: 20}` to have no accumulation for epochs 1 to 4, accumulate 3 batches for epoch 5 to 10, and 20 batches after that." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "X3xsoZ3YPgBv" - }, - "outputs": [], - "source": [ - "# no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that\n", - "trainer = pl.Trainer(accumulate_grad_batches={5: 3, 10: 20})\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "myzH8mV4M1_9" - }, - "source": [ - "# 16 bit precision\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v9EaFAonwOk6" - }, - "source": [ - "Most deep learning frameworks like PyTorch, train with 32-bit floating point arithmetic. \n", - "\n", - "But many models can still achieve full accuracy using half the precision.\n", - "\n", - "In 2017, NVIDIA researchers successfully used a combination of 32 and 16 bit precision (also known as mixed precision) and achieved the same accuracy as 32 bit precision training.\n", - "\n", - "The main two advantages are:\n", - "\n", - "- a reduction in memory requirements which enables larger batch sizes and models.\n", - "- and a speed up in compute. On ampere, turing and volta architectures 16 bit precision models can train at least 3 times faster.\n", - "\n", - "As of PyTorch 1.6, NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, torch.cuda.amp. \n", - "\n", - "This package supersedes the apex package developed by NVIDIA." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TjNypZPHnxvJ" - }, - "source": [ - "## precision\n", - "\n", - "Use precision flag to switch between full precision (32) to half precision (16). Can be used on CPU, GPU or TPUs.\n", - "\n", - "When using PyTorch 1.6+ Lightning uses the native amp implementation to support 16-bit.\n", - "\n", - "If used on TPU will use torch.bfloat16 but tensor printing will still show torch.float32" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kBZKMVx1nw-D" - }, - "outputs": [], - "source": [ - "# 16-bit precision\n", - "trainer = pl.Trainer(gpus=1, precision=16)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VJGj3Jh7oQXU" - }, - "source": [ - "In earlier version of Lightning, we use NVIDIA Apex for 16-bit precision. Apex was the first library to attempt 16-bit and the automatic mixed precision library (amp), has since been merged into core PyTorch as of 1.6.\n", - "\n", - "If you insist in using Apex, you can set the amp_backend flag to 'apex' and install Apex on your own." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BDV1trAUPc9h" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HK5c_aVfNV4e" - }, - "source": [ - "## amp_level\n", - "Apex includes 4 optimization levels:\n", - "O0 (FP32 training)\n", - "O1 (Conservative Mixed Precision): only some whitelist ops are done in FP16.\n", - "O2 (Fast Mixed Precision): this is the standard mixed precision training. It maintains FP32 master weights and optimizer.step acts directly on the FP32 master weights.\n", - "O3 (FP16 training): full FP16. 
Passing keep_batchnorm_fp32=True can speed things up as cudnn batchnorm is faster anyway.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FshMFPowNbWt" - }, - "outputs": [], - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex', amp_level='O2')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y8KEr1YvNgkC" - }, - "source": [ - "# `auto_scale_batch_size`\n", - "\n", - " \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7F1pKFIuwSFl" - }, - "source": [ - "Lightning can help you improve your model by using auto_scale_batch_size flag, which tries to find the largest batch size that fits into memory, before you start your training.\n", - "Larger batch size often yields better estimates of gradients, but may also result in longer training time. \n", - "\n", - "Set it to True to initially run a batch size finder trying to find the largest batch size that fits into memory. The result will be stored in self.batch_size in the LightningModule.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9_jE-iyyheIv" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(auto_scale_batch_size=True)\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yaHsJvwFhNJt" - }, - "source": [ - "You can set the value to `power`. `power` scaling starts from a batch size of 1 and keeps doubling the batch size until an out-of-memory (OOM) error is encountered.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Qx0FbQrphgw1" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(auto_scale_batch_size='power')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8bwgVF9zhZ75" - }, - "source": [ - "You can also set it to `binsearch`, that continues to finetune the batch size by performing a binary search.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QObXNs3yNrg9" - }, - "outputs": [], - "source": [ - "# run batch size scaling, result overrides hparams.batch_size\n", - "trainer = pl.Trainer(auto_scale_batch_size='binsearch')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5OWdhSsZjqW7" - }, - "source": [ - "This feature expects that a batch_size field in the hparams of your model, i.e., model.hparams.batch_size should exist and will be overridden by the results of this algorithm. \n", - "\n", - "Additionally, your train_dataloader() method should depend on this field for this feature to work.\n", - "\n", - "The algorithm in short works by:\n", - "1. Dumping the current state of the model and trainer\n", - "\n", - "2. Iteratively until convergence or maximum number of tries max_trials (default 25) has been reached:\n", - "* Call fit() method of trainer. This evaluates steps_per_trial (default 3) number of training steps. Each training step can trigger an OOM error if the tensors (training batch, weights, gradients etc.) 
allocated during the steps have a too large memory footprint.\n", - " * If an OOM error is encountered, decrease the batch size\n", - " * Else increase it.\n", - "* How much the batch size is increased/decreased is determined by the chosen strategy.\n", - "\n", - "3. The found batch size is saved to model.hparams.batch_size\n", - "\n", - "4. Restore the initial state of model and trainer\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "q4CvxfZmOWBd" - }, - "source": [ - "# `auto_lr_find`\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "j85e8usNwdBV" - }, - "source": [ - "Selecting a good learning rate for your deep learning training is essential for both better performance and faster convergence.\n", - "\n", - "Even optimizers such as Adam that are self-adjusting the learning rate can benefit from more optimal choices.\n", - "\n", - "To reduce the amount of guesswork concerning choosing a good initial learning rate, you can use Lightning auto learning rate finder.\n", - "\n", - "The learning rate finder does a small run where the learning rate is increased after each processed batch and the corresponding loss is logged. The result of this is a lr vs. loss plot that can be used as guidance for choosing an optimal initial lr.\n", - "\n", - "\n", - "warning: For the moment, this feature only works with models having a single optimizer. LR support for DDP is not implemented yet, it is coming soon.\n", - "\n", - "\n", - "***auto_lr_find=***\n", - "\n", - "In the most basic use case, this feature can be enabled during trainer construction with Trainer(auto_lr_find=True).\n", - "When .fit(model) is called, the LR finder will automatically run before any training is done. The lr that is found and used will be written to the console and logged together with all other hyperparameters of the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iuhve9RBOfFh" - }, - "outputs": [], - "source": [ - "# default used by the Trainer (no learning rate finder)\n", - "trainer = pl.Trainer(mnist_model, auto_lr_find=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BL-gjXNCPDXk" - }, - "source": [ - "This flag sets your learning rate which can be accessed via self.lr or self.learning_rate.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wEb-vIMmPJQf" - }, - "outputs": [], - "source": [ - "class LitModel(LightningModule):\n", - "\n", - " def __init__(self, learning_rate):\n", - " self.learning_rate = learning_rate\n", - "\n", - " def configure_optimizers(self):\n", - " return Adam(self.parameters(), lr=(self.lr or self.learning_rate))\n", - "\n", - "# finds learning rate automatically\n", - "# sets hparams.lr or hparams.learning_rate to that learning rate\n", - "trainer = pl.Trainer(mnist_model, auto_lr_find=True)\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RweqvpnVPPSh" - }, - "source": [ - "To use an arbitrary value set it as auto_lr_find\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4LKI39IfPLJv" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(mnist_model, auto_lr_find='my_value')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9VAhPRKbPX-m" - }, - "source": [ - "Under the hood, when you call tune it runs the learning rate finder.\n", - "\n", - "If you want to inspect the results of the learning rate finder before doing any actual training or just play around with the parameters of the algorithm, this can be done by invoking the lr_find method of the trainer. A typical example of this would look like\n", - "\n", - "\n", - "```\n", - "trainer = pl.Trainer(auto_lr_find=True)\n", - "\n", - "# Run learning rate finder\n", - "lr_finder = trainer.lr_find(model)\n", - "\n", - "# Results can be found in\n", - "lr_finder.results\n", - "\n", - "# Plot with\n", - "fig = lr_finder.plot(suggest=True)\n", - "fig.show()\n", - "\n", - "# Pick point based on plot, or get suggestion\n", - "new_lr = lr_finder.suggestion()\n", - "\n", - "# update hparams of the model\n", - "model.hparams.lr = new_lr\n", - "\n", - "# Fit model\n", - "trainer.fit(model)\n", - "```\n", - "\n", - "The figure produced by lr_finder.plot() should look something like the figure below. It is recommended to not pick the learning rate that achieves the lowest loss, but instead something in the middle of the sharpest downward slope (red point). 
This is the point returned py lr_finder.suggestion().\n", - "\n", - "![image.png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAoAAAAHgCAYAAAA10dzkAAAgAElEQVR4Ae3dB3hUZb7H8bheey94r+URdMWOuq66a1mVtazX3nVX17p617bqursGUCMo9gIWUFBRBBGwoEAgtEDoLbQgIQklCS0kkJAQkpDyv8//xRlnkkkyM++UM3O+53nykDlz3jPnfM5/8v44NUUYEEAAAQQQQAABBFwlkOKqtWVlEUAAAQQQQAABBIQASBEggAACCCCAAAIuEyAAumyDs7oIIIAAAggggAABkBpAAAEEEEAAAQRcJkAAdNkGZ3URQAABBBBAAAECIDWAAAIIIIAAAgi4TIAA6LINzuoigAACCCCAAAIEQGoAAQQQQAABBBBwmQAB0GUbnNVFAAEEEEAAAQQIgNQAAggggAACCCDgMgECoMs2OKuLAAIIIIAAAggQAKkBBBBAAAEEEEDAZQIEQJdtcFYXAQQQQAABBBAgAFIDCCCAAAIIIICAywQIgC7b4KwuAggggAACCCBAAKQGEEAAAQQQQAABlwkQAF22wVldBBBAAAEEEECAAEgNIIAAAggggAACLhMgALpsg7O6CCCAAAIIIIAAAZAaQAABBBBAAAEEXCZAAHTZBmd1EUAAAQQQQAABAiA1gAACCCCAAAIIuEyAAOiyDc7qIoAAAggggAACBEBqAAEEEEAAAQQQcJkAAdBlG5zVRQABBBBAAAEECIDUAAIIIIAAAggg4DIBAqDLNjiriwACCCCAAAIIEACpAQQQQAABBBBAwGUCBECXbXBWFwEEEEAAAQQQIABSAwgggAACCCCAgMsECIAu2+CsLgIIIIAAAgggQACkBhBAAAEEEEAAAZcJEABdtsFZXQQQQAABBBBAgABIDSCAAAIIIIAAAi4TIAC6bIOzuggggAACCCCAAAGQGkAAAQQQQAABBFwmQAB02QZndRFAAAEEEEAAAQIgNYAAAggggAACCLhMgADosg3O6iKAAAIIIIAAAgRAagABBBBAAAEEEHCZAAHQZRuc1UUAAQQQQAABBAiA1AACCCCAAAIIIOAyAQKgyzY4q4sAAggggAACCBAAqQEEEEAAAQQQQMBlAgRAl21wVhcBBBBAAAEEECAAUgMIIIAAAggggIDLBAiALtvgrC4CCCCAAAIIIEAApAYQQAABBBBAAAGXCRAAXbbBWV0EEEAAAQQQQIAASA0ggAACCCCAAAIuEyAAumyDs7oIIIAAAggggAABkBpAAAEEEEAAAQRcJkAAdNkGZ3URQAABBBBAAAECIDWAAAIIIIAAAgi4TIAA6LINzuoigAACCCCAAAIEQGoAAQQQQAABBBBwmQAB0GKDNzY2SnFxsVRUVMi2bdv4wYAaoAaoAWqAGkiAGtB+W/tv7cfdOhAALba8Fk9KSgo/GFAD1AA1QA1QAwlYA9qPu3UgAFpsef0fhAZALSD2ALIHlBqgBqgBaoAaSIwa8OzA0X7crQMB0GLL6xddA6D+y4AAAggggAACiSFA/y1CALSoVQrIAo+mCCCAAAIIxEmA/psAaFV6FJAVH40RQAABBBCIiwD9NwHQqvAoICs+GiOAAAIIIBAXAfpvAqBV4VFAVnw0RgABBBBAIC4C9N8EQKvCo4Cs+GiMAAIIIIBAXATovwmAVoVHAVnx0RgBBBBAAIG4CNB/EwCtCo8CsuKjMQIIIIAAAnERoP8mAFoVHgVkxUdjBBBAAAEE4iJA/00AtCo8CsiKj8YIIIAAAgjERYD+mwBoVXgUkBUfjRFAAAEEEIiLAP03AdCq8CggKz4aI4AAAgggEBcB+m8CoFXhUUBWfDRGAAEEEEAgLgL03wRAq8KjgKz4aIwAAggggECrAiPmF8k/hmXLuGUbWp0m3DfovwmA4daOaUcBWfHRGAEEEEAAgVYFUr9dIh2fHSN9J+W1Ok24b9B/EwDDrR3TjgKy4qMxAggggAACrQo8PHi+CYCDZ69tdZpw36D/JgCGWzumHQVkxUdjBBBAAAEEWhW4tf9MEwDHLuUQcKtIFm+kWLR1fVMCoOtLAAAEEEAAgSgJ/PGtTBMAZxWURfwT6L/ZA2hVVBSQFR+NEUAAAQQQaFXgrJ4ZJgCu3FTZ6jThvkH/TQAMt3ZMOwrIio/GCCCAAAIIBBRoaGySTqljTAAsraoNOI3NSPpvAqBN/QgFZMVHYwQQQAABBAIKlFXVmvCnVwHXNzQGnMZmJP03AdCmfgiAVno0RgABBBBAILBA3qZKEwDP7JkReALLsQRAAqBVCVFAVnw0RgABBBBAIKDAnFVlJgB2fTMz4Pu2I+m/CYBWNUQBWfHRGAEEEEAAgYAC6Us3mAB4S7+ZAd+3HUn/TQC0qiEKyIqPxggggAACCAQUGDJnrQmAf/tifsD3bUfSfxMArWqIArLiozECCCCAAAIBBd6blGcC4LPfLAn4vu1I+m8CoFUNUUBWfDRGAAEEEEAgoMCLP+aYAPjauBUB37cdSf9NALSqIQrIio/GCCCAAAIIBBT4x7BsEwAHZq0K+L7tSPpvAqBVDVFAVnw0RgABBBBAIKDA3Z/MMQHwmwXFAd+3HUn/TQC0qiEKyIqPxggggAACCAQUuLpvlgmAU3JLAr5vO5L+mwBoVUMUkBUfjRFAAAEEEAgocP4rk0wAXFxUHvB925H03wRAqxqigKz4aIwAAggggEALgaamJjmxR7oJgEVbqlu8H4kR9N8EQKs6ooCs+GiMAAIIIIBAC4HttfUm/OlzgPX3aAz03y4PgOvWrZO77rpLDj30UNl7773l9NNPl/nzg7/pJAUUja8l80QAAQQQcLOA7vXT8Kd7AXVvYDQG+m8XB8CtW7dKx44d5b777pO5c+fK6tWrJSMjQwoKCoKuNQooaComRAABBBBAICgBPe9PA+DvX5kU1PThTET/7eIA+Oyzz8pFF10UTt1421BAXgp+QQABBBBAICICeuWvBkC9EjhaA/23iwPgKaecIk899ZTceuut0qFDBznrrLNkwIABbdZabW2taNF4foqLiyUlJcW8brMhbyKAAAIIIIBAUAJ67z8NgHovwGgNBEAXB8C99tpL9Kdbt26SnZ0tH3/8sTkP8PPPP2+13tLS0kzg09Dn+6OFxIAAAggggAAC9gL69A8NgPo0kGgNBEAXB8A99thDzj//fL/aeuKJJ+T3v/+93zjfF+wB9NXgdwQQQAABBCIvoM//1QCY9kNO5Gf+8xwJgC4OgMcee6w8+OCDfsXVr18/Oeqoo/zGtfWCAmpLh/cQQAABBBAIXeDZb5aYAPjepLzQGwfZgv7bxQHwz3/+c4uLQPScwOZ7BduqJQqoLR3eQwABBBBAIHSBv30x3wTAIXPWht44yBb03y4OgPPmzZP/+q//kt69e0t+fr4MHTpU9t13XxkyZEiQ5SPm4g8uAgmaiwkRQAABBBBoV+CWfjNNAExfuqHdacO
dgADo4gCoRTN69Ghz82e9GOTkk09u9yrg5oVGATUX4TUCCCCAAAJ2Al3fzDQBcM6qMrsZtdGa/tvlAbCN2gjqLQooKCYmQgABBBBAIGiBM3tmmACYt6ky6DahTkj/TQAMtWb8pqeA/Dh4gQACCCCAgJVAfUOjCX96FXBZVa3VvNpqTP9NAGyrPtp9jwJql4gJEEAAAQQQCFpgc2WtCYCdUsdIQ2N0ngOsC0P/TQAMuigDTUgBBVJhHAIIIIAAAuEJrNxUaQLgWT0zwptBkK3ovwmAQZZK4MkooMAujEUAAQQQQCAcgVkFZSYAdn0rM5zmQbeh/yYABl0sgSakgAKpMA4BBBBAAIHwBMYu3WAC4K39Z4Y3gyBb0X8TAIMslcCTUUCBXRiLAAIIIIBAOAKDZ681AfChL+aH0zzoNvTfBMCgiyXQhBRQIBXGIYAAAgggEJ5A30l5JgCmfrskvBkE2Yr+mwAYZKkEnowCCuzCWAQQQAABBMIRSPshxwTA18etCKd50G3ovwmAQRdLoAkpoEAqjEMAAQQQQCA8gSe+yjYBcGDWqvBmEGQr+m8CYJClEngyCiiwC2MRQAABBBAIR+CugXNMAPx2YXE4zYNuQ/9NAAy6WAJNSAEFUmEcAggggAAC4Qn8b58sEwAzc0vCm0GQrei/CYBBlkrgySigwC6MRQABBBBAIByB3/WeZALgkuLycJoH3Yb+mwAYdLEEmpACCqTCOAQQQAABBEIXaGpqks490k0ALN5aHfoMQmhB/00ADKFcWk5KAbU0YQwCCCCAAALhCFTV1pvw1/HZMVJdVx/OLIJuQ/9NAAy6WAJNSAEFUmEcAggggAACoQsUllWbAHjSc+mhNw6xBf03ATDEkvGfnALy9+AVAggggAAC4QosKio3AfCCVyeHO4ug29F/EwCDLpZAE1JAgVQYhwACCCCAQOgCk1dsMgHwmveyQm8cYgv6bwJgiCXjPzkF5O/BKwQQQAABBMIVGLmg2ATAv346N9xZBN2O/psAGHSxBJqQAgqkwjgEEEAAAQRCF/h4WoEJgE8Oyw69cYgt6L8JgCGWjP/kFJC/B68QQAABBBAIV+DV9BUmAPb8cXm4swi6Hf03ATDoYgk0IQUUSIVxCCCAAAIIhC7w75GLTQB8f3Je6I1DbEH/TQAMsWT8J6eA/D14hQACCCCAQLgCD34+3wTAoXMKw51F0O3ovwmAQRdLoAkpoEAqjEMAAQQQQCB0gZs+nGEC4LhlG0JvHGIL+m8CYIgl4z85BeTvwSsEEEAAAQTCFbj0zUwTAOeu3hLuLIJuR/9NAAy6WAJNSAEFUmEcAggggAACoQt0SRtvAmB+SWXojUNsQf9NAAyxZPwnp4D8PXiFAAIIIIBAOAI7GxpN+NPnAG/ZXhfOLEJqQ/9NAAypYJpPTAE1F+E1AggggAACoQuUVNaYANgpdYw0NDaFPoMQW9B/EwBDLBn/ySkgfw9eIYAAAgggEI7Aio3bTAD8Ta8J4TQPuQ39NwEw5KLxbUAB+WrwOwIIIIAAAuEJzCwoNQHwj29lhjeDEFvRfxMAQywZ/8kpIH8PXiGAAAIIIBCOwOgl600AvK3/rHCah9yG/psAGHLR+DaggHw1+B0BBBBAAIHwBAbPWmMC4MOD54c3gxBb0X8TAEMsGf/JKSB/D14hgAACCCAQjsC7E1eaAJj67dJwmofchv6bABhy0fg2oIB8NfgdAQQQQACB8AReGLXMBMA3xq8IbwYhtqL/JgCGWDL+k1NA/h68QgABBBBAIByBx4YuNAHwk+mrw2kechv6bwJgyEXj24AC8tXgdwQQQAABBMITuP2jWSYAjlq0LrwZhNiK/psAGGLJ+E9OAfl78AoBBBBAAIFwBC54dbIJgAvWRv85wLp89N8EwHDq1NuGAvJS8AsCCCCAAAJhCdQ3NMrx3caaALhpW01Y8wi1Ef03ATDUmvGbngLy4+AFAggggAACIQsUbak24a9z93RpjMFj4HQB6b8JgCEXqm8DCshXg98RQAABBBAIXWBWQZkJgJe+GZungOgS0n8TAEOvVJ8WFJAPBr8igAACCCAQhsCI+UUmAN79yZwwWofXhP6bABhe5fzcigKy4qMxAggggAAC8s6E2N4EWsnpvwmAVl89CsiKj8YIIIAAAgjIP4cvNnsAP5iSHzMN+m8CoFWxUUBWfDRGAAEEEEBAYn0PQCWn/yYAWn31KCArPhojgAACCCAgv9wDcGvMNOi/CYBWxUYBWfHRGAEEEEDA5QK+9wAsidE9AJWc/psAaPXVo4Cs+GiMAAIIIOByAe89AHvE7h6ASk7/TQC0+upRQFZ8NEYAAQQQcLnAzIJScwFI1xjeA1DJ6b8JgFZfPQrIio/GCCCAAAIuFxgeh3sAKjn9NwHQ6qtHAVnx0RgBBBBAwOUCb8fhHoBKTv9NALT66lFAVnw0RgABBBBwucDTwxfF/B6ASk7/TQC0+upRQFZ8NEYAAQQQcLnAbR/NMgFw1KJ1MZWg/yYAWhUcBWTFR2MEEEAAAZcLeO4BuLAwdvcAVHL6bwKg1VePArLiozECCCCAgIsFdjY0ynGpY8wewJLKmphK0H8TAK0KjgKy4qMxAggggICLBTz3ADyxR7o0NTXFVIL+mwBoVXAUkBUfjRFAAAEEXCzgvQfgW5kxV6D/JgBaFR0FZMVHYwQQQAABFwt47gH410/nxlyB/psAaFV0FJAVH40RQAABBFws4LkHYLfvlsZcgf6bAGhVdBSQFR+NEUAAAQRcLOC5B+CHmfkxV6D/dnkATEtLk5SUFL+fk046KehCpICCpmJCBBBAAAEE/AQ89wD8YfF6v/GxeEH/TQCU0047TTZu3Oj9KS0tDbr2KKCgqZgQAQQQQAABP4F43QNQF4L+mwAoZ555pl9BhvKCAgpFi2kRQAABBBDYJRDPewDqEtB/EwBl3333lSOPPFKOO+44+ctf/iKFhYVBfz8poKCpmBABBBBAAAGvQDzvAagLQf/t8gCYnp4uI0aMkCVLlsj48ePl/PPPl2OPPVYqKyu9Rer7S21trSkaLRz9KS4uNucP6u8MCCCAAAIIIBCcwMz8UvMEkD/G4R6AuoQEQJcHwOZlWl5eLgceeKB88sknzd8yrwNdNKIXkRAAA3IxEgEEEEAAgYACw+cVmQB4TxzuAagLRAAkALYozHPOOUdSU1NbjNcR7AEMyMJIBBBAAAEEQhJ4OyPXBMDucbgHoC4oAZAA6FewVVVVcsghh0jfvn39xrf2ggJqTYbxCCCAAAIItC7w9NeLTADsl1nQ+kRRfIf+2+UB8JlnnpGpU6fKmjVrZObMmXL55ZfL4YcfLps3bw6q7CigoJiYCAEEEEAAAT+B2/rPMgHwxzjcA1AXhP7b5QHwjjvuMFcA77nnnnL00UeLvi4oCP5/IxSQ3/eZFwgggAACCAQlcP4rk0wAzC7cGtT0kZ6I/tvlAdC2oCggW0HaI4AAAgi4TaCuvlGOSx1jAuDmytq4rD79NwHQqvAoICs+GiOAAAIIuFCgsKzahL8Te6RLU1NTXA
TovwmAVoVHAVnx0RgBBBBAwIUC8b4HoJLTfxMArb56FJAVH40RQAABBFwoEO97ACo5/TcB0OqrRwFZ8dEYAQQQQMCFAvG+B6CS038TAK2+ehSQFR+NEUAAAQRcKOC5B2D/qcHfdSPSTPTfBECrmqKArPhojAACCCDgQgHPPQBHL1kft7Wn/yYAWhUfBWTFR2MEEEAAAZcJVNfVy0nPpZurgH/asC1ua0//TQC0Kj4KyIqPxggggAACLhMYs2SDCX9/eH1K3G4Bo+T03wRAq68eBWTFR2MEEEAAAZcJPDpkoQmAr6T/FNc1p/8mAFoVIAVkxUdjBBBAAAEXCeyoa5CTnxtnAuCS4vK4rjn9NwHQqgApICs+GiOAAAIIuEhg7NJdh38vfG1yXA//Kjn9NwHQ6qtHAVnx0RgBBBBAwEUCjw79+fDv2Pge/lVy+m8CoNVXjwKy4qMxAggggIBLBPTw7ynP7zr8u7govod/lZz+mwBo9dWjgKz4aIwAAggg4BKBccucc/hXyem/CYBWXz0KyIqPxggggAACLhF4/Ktsc/FHbwcc/lVy+m8CoNVXjwKy4qMxAggggIALBGp2/nL4d5EDDv8qOf03AdDqq0cBWfHRGAEEEEDABQLjlm00e/8ueDX+V/96uOm/CYCeWgjrXwooLDYaIYAAAgi4SOCJnw//vjR6uWPWmv6bAGhVjBSQFR+NEUAAAQSSXEAP/57689W/Cwu3OmZt6b8JgFbFSAFZ8dEYAQQQQCDJBcbn7Dr8e/4rk+J+82dfavpvAqBvPYT8OwUUMhkNEEAAAQRcJPCPYbuu/u3loMO/yk//TQC0+hpSQFZ8NEYAAQQQSGKBTdtqvM/+XbDWOYd/lZz+mwBo9dWjgKz4aIwAAgggkMQCT/689+/GD2c46vCvktN/EwCtvnoUkBUfjRFAAAEEklRg3pot5tYvnVLHyNLiCsetJf03AdCqKCkgKz4aI4AAAggkoUBDY5P8b58sEwBTv13iyDWk/yYAWhUmBWTFR2MEEEAAgSQU+HL2WhP+uqSNl7KqWkeuIf03AdCqMCkgKz4aI4AAAggkmcDW7XVyZs8MEwAHzVjt2LWj/yYAWhUnBWTFR2MEEEAAgSQTeO77ZSb8XfnONKlvaHTs2tF/EwCtipMCsuKjMQIIIIBAEgksX79NjksdYwLgrIIyR68Z/TcB0KpAKSArPhojgAACCCSJwPbaernpwxkm/D06dKHj14r+mwBoVaQUkBUfjRFAAAEEkkBgUVG5XPLGFBP+Tn5unKwv3+H4taL/JgBaFSkFZMVHYwQQQACBBBbQ2718MCVfft1trAl/+rxfvf9fIgz03wRAqzqlgKz4aIwAAgggkKAC68p3yG0fzTLBr+OzY0QP+1ZU70yYtaH/JgBaFSsFZMVHYwQQQACBBBGo2dkgMwtK5e0JK+X2j2ZJ5+7pJvyd+vw4Gbmg2HGPemuPlf6bANhejbT5PgXUJg9vIoAAAggkuIDeyPmugXO8gU/39nl+9Bm/a0q3J+Qa0n8TAK0KlwKy4qMxAggggIDDBXSPnyfwndd7ojzxVbYMnVMoBZurEm6vny81/TcB0LceQv6dAgqZjAYIIIAAAgkioBd56IUdGgC/nleY0IGvOTn9NwGweU2E9JoCComLiRFAAAEEEkggM7fEhL8zXswQPQcwmQb6bwKgVT1TQFZ8NEYAAQQQcLDAI0MWmACY9kOOg5cyvEWj/yYAhlc5P7eigKz4aIwAAggg4FABvfjjhO677u+nj3hLtoH+mwBoVdMUkBUfjRFAAAEEHCowMGuV2ft33fvTHbqEdotF/00AtKogCsiKj8YIIIAAAg4UaGpqksvfnmoC4Jez1zpwCe0Xif6bAGhVRRSQFR+NEUAAAQQcKLCwcKsJfyc9ly7bahLn6R6hUNJ/EwBDqZcW01JALUgYgQACCCCQ4AL/GbnEBMCnhy9K8DVpffHpvwmArVdHEO9QQEEgMQkCCCCAQMIIVNXWyynPjzMBcO7qLQmz3KEuKP03ATDUmvGbngLy4+AFAggggECCC+gNn/XGz13fzEyqGz833yz03wTA5jUR0msKKCQuJkYAAQQQcLjATR/OMAGw/9QChy+p3eLRfxMArSqIArLiozECCCCAgIME8jZVmvB3fLexUlJZ46Ali/yi0H8TAK2qigKy4qMxAggggICDBN6duNIEwAc/n++gpYrOotB/EwCtKosCsuKjMQIIIICAgwTu/Hi2CYBD5iTnvf98qem/CYC+9RDy7xRQyGQ0QAABBBBwoEBdfaOc2CPdBMD8kkoHLmFkF4n+mwBoVVEUkBUfjRFAAAEEHCKwYO0WE/5+02tCUl/96+Gm/yYAemohrH8poLDYaIQAAggg4DCBDzPzTQD8v8ELHLZk0Vkc+m8CoFVlUUBWfDRGAAEEEHCIwL2fzTUB8NPpqx2yRNFdDPpvAqBVhVFAVnw0RgABBBBwgEBDY5Oc9sJ4EwCXratwwBJFfxHovwmAVlVGAVnx0RgBBBBAwAECS4srTPg7PW28aBh0w0D/TQC0qnMKyIqPxggggAACDhAYmLXKBMD7B81zwNLEZhHovwmAVpVGAVnx0RgBBBBAwAECD30x3wTAZH/8my81/XeCBsCioiIpLi72bsu5c+fKk08+KR9//LF3XKi/vPrqq5KSkmLmE2xbCihYKaZDAAEEEHCiQGNjk5zVM8MEwIWFW524iFFZJvrvBA2AF110kQwePNgUxcaNG+XAAw+U888/Xw4//HDp2bNnyMUyb9486dSpk5xxxhkEwJD1aIAAAgggkKgCK39+/u/Jz40TvRm0WwYCYIIGwIMPPlhyc3NNnfbt21cuuOAC83tGRoYcd9xxIdVvVVWVdO7cWSZOnCiXXHIJATAkPSZGAAEEEEhkgcGz15q9f38ZODuRVyPkZScAJmgA3G+//WTNmjVmg1933XXy2muvmd8LCwtl7733DqkQ7rnnHnnqqadMGwJgSHRMjAACCCCQ4AKPf5VtAmCfiXkJviahLT4BMEED4HnnnSfPPvusZGVlmcC3ePFis+Vnz54tRx99dNBVMGzYMDn99NOlpqbGtGkvANbW1ooWjedHz0PU8wb1NQMCCCCAAAKJJNDU1CTnvjzRBMBZBWWJtOjWy0oATNAAmJmZKXoY+Fe/+pXcf//93kLo1q2b3HTTTd7Xbf2iF5IcccQRsmTJEu9k7QXAtLQ0E/g09Pn+EAC9hPyCAAIIIJAgAmtKt5vwd0L3sVKzsyFBljoyi0kATNAAqJu/oaFBtm71v2JJDwuXlJQEVR3ff/+9CXG77767eH401O22227mtc6/+cAewOYivEYAAQQQSFSB4fOKTAC8pd/MRF2FsJebAJigAXDHjh1SXV3t3fBr166Vd999V8aPH+8d194vlZWVsmzZMr+fc845R+6++24zrr32+j4FFIwS0yCAAAIIOFHgn8MXmwD4xvgVTly8qC4T/XeCBsArrrhC+vfvb4qjvLxc/vu//1uOOeYYcz5gv379wi6a9g4BN58xBdRchNcIIIAAA
okicNHrk00AnLpyc6IscsSWk/47QQPgYYcdJjk5OaYQBg4caO7f19jYKCNGjJCTTz457AIhAIZNR0MEEEAAgQQSWF++w4S/41LHSFVtfQIteWQWlQCYoAFwn332Eb3liw633XabvPjii+Z3vbBD34vVQAHFSprPQQABBBCIpMCoRetMALzu/emRnG3CzIv+O0EDYJcuXURvAK2BT58CMmvWLFN0CxYsMIeDY1WBFFCspPkcBBBAAIFICvT4fqkJgL1GL4/kbBNmXvTfCRoAR44cKXvssYe5Dczll1/uLbhXXnlFrrrqKu/raP9CAUVbmPkjgAACCERD4Nr3ppsAOHrJ+mjM3vHzpP9O0AColaXPAM7OzhY9988zzJ07V1asiN3VTBSQR55/EUAAAQQSRaC2vkH03n8dn204e50AACAASURBVB0jRVt+uaNGoix/JJaT/juBA6CnAPRpHPoTj4ECioc6n4kAAgggYCOwqKjchL+zemaIPg3EjQP9d4IGQN3r17NnT3P+nz4NRH8OOugg6dWrl98ewWgXNQUUbWHmjwACCCAQaYEvZq0xAfCeT+dGetYJMz/67wQNgKmpqdKhQwfRe/7po9z058MPPzTjunfvHrMCpIBiRs0HIYAAAghESOCZEbtuAP1WRm6E5ph4s6H/TtAAeOSRR8oPP/zQouJGjRolRx11VIvx0RpBAUVLlvkigAACCERL4Ip3ppo9gBOWb4rWRzh+vvTfCRoA99prL1m5cmWLAsvNzTVPA2nxRpRGUEBRgmW2CCCAAAJREdheWy9682e9AKRkW01UPiMRZkr/naAB8LzzzpMnnniiRY09/vjjou/FaqCAYiXN5yCAAAIIREJgzqoyE/5+13tSJGaXsPOg/07QADh16lTZb7/95JRTTpEHHnjA/Ojv+++/v2RlZcWsICmgmFHzQQgggAACERAYmLXKBMCHvpgfgbkl7izovxM0AGrJrV+/XvSCj5tvvtn89OjRwzwe7qGHHopZRVJAMaPmgxBAAAEEIiDw+FfZJgC+PzkvAnNL3FnQfydwAAxUdosXLza3hAn0XjTGUUDRUGWeCCCAAALRErj4jSkmAE5buTlaH5EQ86X/JgBaFSoFZMVHYwQQQACBGAqUV9eZ8KcXgOjvbh7ovwmAVvVPAVnx0RgBBBBAIIYCutdPw5/uBXT7QP9NALT6DlBAVnw0RgABBBCIocAHU/JNANTzAN0+0H8nWAC86aabpK2frl27cg6g27/VrD8CCCCAQEABvfJX9wAOmLYq4PtuGkkATLAAeN9990kwP7EqYgooVtJ8DgIIIICArYDe+08DoN4L0O0D/XeCBUCnFSwF5LQtwvIggAACCAQS0Kd+aPjTp4Do00DcPtB/EwCtvgMUkBUfjRFAAAEEYiSgz/3VAKjPAWYQof8mAFp9DyggKz4aI4AAAgjESODtjFwTAJ8ZsThGn+jsj6H/JgBaVSgFZMVHYwQQQACBGAnc8+lcEwC/mLUmRp/o7I+h/yYAWlUoBWTFR2MEEEAAgRgINDU1yVk9M0wAXFRUHoNPdP5H0H8TAK2qlAKy4qMxAggggEAMBIq2VJvwd0L3sVJb3xCDT3T+R9B/EwCtqpQCsuKjMQIIIIBADATGLNlgAuC1702PwaclxkfQfxMArSqVArLiozECCCCAQAwEXhn7kwmA3b9bGoNPS4yPoP8mAFpVKgVkxUdjBBBAAIEYCFzdN8sEwOHzi2LwaYnxEfTfBECrSqWArPhojAACCCAQZYH8kkoT/n7dbaxs2V4X5U9LnNnTfxMAraqVArLiozECCCCAQJQF3hy/6/5/DwyaF+VPSqzZ038TAK0qlgKy4qMxAggggEAUBfT2Lxe+NtnsAfxx8fooflLizZr+mwBoVbUUkBUfjRFAAAEEoigwf80WE/5OfX6c7Kjj9i++1PTfBEDfegj5dwooZDIaIIAAAgjESECv+tXn//5zOI9/a05O/00AbF4TIb2mgELiYmIEEEAAgRgJ1NU3ypk/P/1jel5pjD41cT6G/psAaFWtFJAVH40RQAABBKIkMGH5JrP379yXJ0pDY1OUPiVxZ0v/TQC0ql4KyIqPxggggAACURJ4dMhCEwBfGr08Sp+Q2LOl/yYAWlUwBWTFR2MEEEAAgSgIbKvZKSf2SDcBcNm6iih8QuLPkv6bAGhVxRSQFR+NEUAAAQSiIKBP/NCLPy57e6rorWAYWgrQfxMAW1ZFCGMooBCwmBQBBBBAICYCfx4w2wTAD6bkx+TzEvFD6L8JgFZ1SwFZ8dEYAQQQQCDCAhsraqRT6hgTAIu2VEd47skzO/pvAqBVNVNAVnw0RgABBBCIsMDH0wpM+Lu1/8wIzzm5Zkf/TQC0qmgKyIqPxggggAACERa4/v3pJgAOmbM2wnNOrtnRfxMArSqaArLiozECCCCAQAQFanY2yPHdxpoAuK58RwTnnHyzov8mAFpVNQVkxUdjBBBAAIEICixYu+vZv+e8PJGrf9txpf8mALZTIm2/TQG17cO7CCCAAAKxExiYtcrs/Xvw8/mx+9AE/ST6bwKgVelSQFZ8NEYAAQQQiKDA419lmwDI7V/aR6X/JgC2XyVtTEEBtYHDWwgggAACMRX4w+tTTACcnlca089NxA+j/yYAWtUtBWTFR2MEEEAAgQgJlFXVmvCnTwCp2LEzQnNN3tnQfxMAraqbArLiozECCCCAQIQEpqwoMQHwj29lRmiOyT0b+m8CoFWFU0BWfDRGAAEEEIiQwNsTVpoA+PTwRRGaY3LPhv6bAGhV4RSQFR+NEUAAAQQiJHDPp3NNABw8a02E5pjcs6H/JgBaVTgFZMVHYwQQQACBCAg0NTXJmT0zTABcUlwegTkm/yzovwmAVlVOAVnx0RgBBBBAIAICa0q3m/DXuUe61NU3RmCOyT8L+m8CoFWVU0BWfDRGAAEEEIiAwKhF60wAvPHDGRGYmztmQf9NALSqdArIio/GCCCAAAIREHjxxxwTANN+yInA3NwxC/pvAqBVpVNAVnw0RgABBBCIgIDu+dP7/32fvS4Cc3PHLOi/CYBWlU4BWfHRGAEEEEDAUkDP+dNz/zQA6rmADMEJ0H8TAIOrlFamooBagWE0AggggEBMBJYWV5jwp1cB69XADMEJ0H8TAIOrlFamooBagWE0AggggEBMBPS+f7r3T+8DyBC8AP03ATD4agkwJQUUAIVRCCCAAAIxE/jn8MUmAOqTQBiCF6D/dnkA7Nevn3Tp0kUOOOAA8/P73/9e0tPTg64gCihoKiZEAAEEEIiCgD77V/cATl6xKQpzT95Z0n+7PAD++OOPMnbsWMnLy5OVK1dK9+7dZY899pCcnOAupaeAkvePA2uGAAIIOF1gW81OE/40AJZV1Tp9cR21fPTfLg+AgarxkEMOkU8++STQWy3GUUAtSBiBAAIIIBAjgRn5pSYAXvT65Bh9YvJ8DP03AdBbzQ0NDTJs2DDZc889Zfny5d7xbf1CAbWlw3sIIIAAAtEU+GBKvgmAj3+VHc2PScp5038TAGXp0qWy3377
ye677y4HHXSQOSTcWrXX1taKFo3np7i4WFJSUszr1towHgEEEEAAgWgI/O2L+SYADsxaFY3ZJ/U8CYAEQKmrq5P8/HxZsGCBpKamyuGHH97qHsC0tDQT+DT0+f5oITEggAACCCAQK4EddQ3ym14TTACcv2ZLrD42aT6HAEgAbFHMl112mTz88MMtxusI9gAGZGEkAggggECMBd7KyDXh74JXJ4s+DYQhNAECIAGwRcV07dpV7r333hbjA42ggAKpMA4BBBBAIJoCq0u3S+fuux7/Nm7Zxmh+VNLOm/7b5QFQD/lOmzZN1qxZY84F1Ne77babTJgwIaiip4CCYmIiBBBAAIEICejj3u79bK7Z+/fXT+fy+LcwXem/XR4AH3jgAenYsaO58rdDhw6ih3+DDX9acxRQmN88miGAAAIIhCUwPmejCX+6B1D3BDKEJ0D/7fIAGF7Z/NKKAvrFgt8QQAABBKIroBd+6Dl/euPnN8aviO6HJfnc6b8JgFYlTgFZ8dEYAQQQQCAEAd8LP6rr6kNoyaTNBei/CYDNayKk1xRQSFxMjAACCCAQpsAavws/NoQ5F5p5BOi/CYCeWgjrXwooLDYaIYAAAgiEKHAfF36EKNb25PTfBMC2K6SddymgdoB4GwEEEEDAWqCkssac93dc6hhZtbnKen7MgIs4tQZSKITwBQiA4dvREgEEEEAgOIHZq8pMALz4jSnBNWCqdgXovwmA7RZJWxNQQG3p8B4CCCCAQCQEhs0tNAHwnk/nRmJ2zIPbuJkaYA+gxVeBAGiBR1MEEEAAgaAEXkn/yQTAF0YtC2p6JmpfgP6bPYDtV0kbU1BAbeDwFgIIIIBARAQeHjzfBMDPZqyOyPyYCecAag2wB9Dim0AAtMCjKQIIIIBAUAJXvjPNBMApuSVBTc9E7QvQfxMA26+SNqaggNrA4S0EEEAAAWuBxsYmObFHugmAei9AhsgI0H8TAK0qiQKy4qMxAggggEA7AuvLd5jw9+tuY6W+obGdqXk7WAH6bwJgsLUScDoKKCALIxFAAAEEIiQwM7/UBMBL38yM0ByZjQrQfxMArb4JFJAVH40RQAABBNoRGDJnrQmA+iQQhsgJ0H8TAK2qiQKy4qMxAggggEA7Ai+PWW4C4Is/5rQzJW+HIkD/TQAMpV5aTEsBtSBhBAIIIIBABAUe/HzXLWC+mLUmgnNlVvTfBECrbwEFZMVHYwQQQACBdgQue3uq2QM4beXmdqbk7VAE6L8JgKHUS4tpKaAWJIxAAAEEEIiQQENjk3TuvusWMEVbqiM0V2ajAvTfBECrbwIFZMVHYwQQQACBNgSKt1abvX8aAjUMMkROgP6bAGhVTRSQFR+NEUAAAQTaEJiet+sWMH98i1vAtMEU1lv03wTAsArH04gC8kjwLwIIIIBApAUGz951C5gHP58X6Vm7fn703wRAqy+BEwtozqoyue2jWfL3LxfIxooaq/WjMQIIIIBA/AR6jd51C5iXRi+P30Ik6Sc7sf+ONXVKrD8wmT7PSQWk54o8OmShOV+k47NjzL9nvJghPyxe3yp5U1OTbNleJ3py8fL122Temi2iDxufsHyTLFi7VQrLqqW6rr7V9pF8Q593qc+5zFlfIZu21UhdfeiPPNL10XZVtfVmvXQ+68p3xGwdIunBvBBAAIEHBs0zf8u/nL0WjAgLOKn/jvCqBT07AmDQVC0njFYBfbOgWG7uN1P0/k//GblEXk1fIQOmrRIdP2VFiSwuKjehbXttvQk3b2fkeh8WflzqGHn2myVy7XvTvWHw0aELZev2OrMCeiKx7iVM+yFHftd7kncaT2gM9O8pz4+TC1+bLFe8M1Wue3+62cP410/nysOD58s/hy8283orI1c+mlogQ+cUyugl60VvWbCoqFxWba6SzZW1UlJZYwLlyk2VsqS43CzD8HlF8sKoZXJr/5ly2gvjWyzL6WnjRR9/9L99sqTrW5lywauT5Te9Jogujz4X8/huY0XXV386pe4KvYGWX8dpm4vfmGJcdbm7f7dU1O3zmWvM8urjltR2xPwisx69x/5k1u2Jr7LlyWHZ8vTXi+SZEYvl3yMXS7fvlkrPH5fLa+NWSJ+JeWb6fpkFom3+NWKx6OEa3X5X982Suz+ZY9rqex9PK5BvFxbL5BW7AnbB5iopraqVnTzfs+WXizEIIGD+7unfrxn5pWhEWCBa/XeEFzOqsyMAWvBGq4DeGL+iRRhqLdj4Bp87Pp5l9uTpKmmoeHfiShOStO05L080YfK3L01sMe+TnxsnOv6SN6bINe9lyfXvTzeB76Tndt1+oLXPjvT4zj3SzXJosIvEvHU+J3SPzLwisTxtzUMDrYbg3740wQRdvffXXQPnmKCqwXF8zkZZsXGbbKvZKbqnkwEBBJJboL6h0fv3S4/wMERWIFr9d2SXMrpzIwBa+EargHSvWfrSDaLPgHxvUp7oI4D+MSzb7E3SvUrnvzLJu8dPQ4XunRu3bEPAYKB72/QKMt/w0SVtvNm7NXH5JqnZ2dCqgAYNPZyqh2YXFm4V3Uume6/GLt1g9kbqYYn+UwtEA+vzo5bJU18vMnu/9BzEP707zQQZ3Yunn61BVYOm7sHT5e/6Zqbc+fFs0XNbvssultyNlaJ/8HTQw8G6xzK/pMrsKczMLZHZq8rMnk+dTg9N6/mNeoi3RH8qd/3o3rSKHTvNOnlumeBZh9Wl280hbnXVO+q/M2Gl9Ph+qTlX8rb+s4yR2uqeTV0PXa4PM/Pl0+mrZWDWKrP3TvfyfTAl3wRr3fun20X3Bj49fJHx1Db6/ldzC8328OxR1HZ6Lo9uQw11GrIven2y6Hbw3S7B/q57M3XP6O0fzRLdQ6l7I7VOdHvotplVUGb2EBMUWy1t3kDA8QL6d07/Juh/jPVvIkNkBaLVf0d2KaM7NwKghW88C0g7dz0/b335Dm9wam1VNORpMNGQpmEqnPPrWpt3MOP1jxdhJLCUBtWK6p0m0GrQ1r18euh8ZkGpORz95vhcefyrbHPoPdTAqHsT9TQC3fY6Pw3zDAggkBgCU1duNgHw8renJsYCJ9hSxrP/dgoVAdBiS1BAFng0DUtAQ78GRT2PUy/w0b2TujdSz/t86Iv55nxK3eMb6NC37oXVQ8t6PuOgGavNhT476lrfAxzWAtIIAQQiIqDnJ+sewL99MT8i82Mm/gL039wGxr8iQnxFAYUIxuQxE9C9vnoltwZEvQhID7u3dohZD8tf1SdL7vtsrqR+u9QcTtbTAzZU7GDPbcy2GB+EgL+AXqin31m9gIwh8gL03wRAq6qigKz4aBxjAT1XUs/h1IuD9PYSgS4Iah4Sz+41wZx7qnsZta1ehMKAAALRF7j3s7kmAOqdFRgiL0D/TQC0qioKyIqPxnEW0PMy9WKbnzZsM/d/HDa30Fwco4eI9SKeQFdj6y139BZDL49ZLpN+2iR6KyIGBBCIvIDelUH/Q6bn7zJEXoD+mwBoVVUUkBUfjR0uoIeR9Z6TejW63o/S0yH57iU8sUe6OUdJ71GpF7MwIICAvYDexsv
zHzA9FYMh8gL03wRAq6qigKz4aJyAAnr7nVGL1knqt0vMrWx8w6Dey1BvfP3J9NXmfpTcuiIBNzCL7AgBvW2Vfrf0Xqx8j6KzSei/CYBWlUUBWfHROMEF9BCyHj7Weype+c4002H5BsIze2aYp8XoFcf5JZVcUJLg25vFj52A3kNUv0t6KgZDdATovwmAVpVFAVnx0TjJBPQG5vo4wHs+nWsevecbBvX3P7w+xTw2MCtvc8zvRZlk1KxOkgt8NmO1CYD/N3hBkq9p/FaP/psAaFV9FJAVH42TWEDPYdKnx+jTVPSwcOfu/o8VPPX5ceYG1/qEF24SnsSFwKqFJaDPSNf/NOlz4BmiI0D/TQC0qiwKyIqPxi4S0KuF9XnGejFJ89vP6JMO9Ka33GLGRQXBqrYpoI+k1AD49TxuAdMmlMWb9N8EQIvyEaGArPho7FIBPak9u3Cruem0Ph/ac6hYn3GsF5csW1fhUhlWG4FdAnq6hH4v9Ik/DNERoP8mAFpVFgVkxUdjBMxeP937p3sBPUFQ/73hgxkyckGx6K1oGBBwk4A+q13vt6nfg5JtNW5a9ZiuK/03AdCq4CggKz4aI+AV0PMAdW/H419l+z3HWK8k1ptOF22p9k7LLwgks0DepkoT/nSPOOfHRm9L038TAK2qiwKy4qMxAgEFNlfWygdT8uWCVyd79wrqHpFHhiwwzzcO2IiRCCSJgF44pXv//jxgdpKskTNXg/6bAGhVmRSQFR+NEWhToKGxyTx/WK8i9j08fOOHM2TMkg2i7zMgkGwCeu8/rXd9NCND9ATovwmAVtVFAVnx0RiBoAVWbNwm/x652O92Mlf1yZJpKzcHPQ8mRMDpAp7Dvyd0Hyvl1XVOX9yEXj76bwKgVQFTQFZ8NEYgZAE9PPx2Rq6cnjbeu1dQ9xDmrOfK4ZAxaeA4gbcyck1dPzBonuOWLdkWiP6bAGhV0xSQFR+NEQhbYOv2Ouk1ern3gpFOqWPk6a8XyfryHWHPk4YIxFNAL/i4+I1dt3/R520zRFeA/psAaFVhFJAVH40RsBYoLKs2Vw57zhHU+wr2nZTH7WOsZZlBrAWWFJebvX8nPZcueuN0hugK0H8TAK0qjAKy4qMxAhETWFxULrf2n+k9LKxXEI9duoHbaERMmBlFW+Cl0ctN/T42dGG0P4r5Cw9y0CJIoRLCFyAAhm9HSwQiLaCH0H5YvF5+/8okbxC84+NZsnz9tkh/FPNDIKIC+nSc3/XeVbcZORsjOm9mFliA/psAGLgyghxLAQUJxWQIxFCguq5e3p6wUk7skW6CoJ4fqFcQb+KpCjHcCnxUKAKzV5WZWu2SNl5q63n6TSh24U5L/00ADLd2TDsKyIqPxghEVUCfHvLo0IXevYF6fuC7E1eKBkQGBJwk0O27paZO9T8qDLERoP8mAFpVGgVkxUdjBGIisGDtVrnpwxneIHhe74kyYn6R6GE3BgTiLbCzoVHO6plh6nN6Xmm8F8c1n0//TQC0KnYKyIqPxgjETEDPD9Snh1z0+i+Pl7v+/ek8Wi5mW4APak1gyooSE/5++9JEnm7TGlIUxtN/EwCtyooCsuKjMQIxF9Dzqz6aWiCnvfDLjaSf+nqRbKyoifmy8IEIqIDWn97GKO2HHEBiKED/TQC0KjcKyIqPxgjETaCkssZcGKIXiGjnq+cHvj+5nfsHNjWJlJaKrFmz6199zYCAhcCOugY59flxpgb1VAWG2AnQfxMAraqNArLiozECcRfQm+/e3O+X+wde+NpkSW9+/8DycpE+fUR+/WuRlJRffvS1jtf3GRAIQ0BrTf8Dovet1NMUGGInQP9NALSqNgrIio/GCDhCQDteffRW8/sH/rRhm8j48SL77Sey2267fnwDoGecvq/TMSAQooDn8O/LY5aH2JLJbQXovwmAVjVEAVnx0RgBRwk0v3/gPbf3lMZf/UqafvWrX/b6+QZAz+/6/u67EwIdtTWdvzB69a/e90/3AM5bs8X5C5xkS0j/TQC0KmkKyIqPxgg4UqB4a7U8M2CqbN9jb2lI2a3t8OcbAnVPIIeDHblNnbhQM/JLTfg7u9cErv6Nwwai/yYAWpUdBWTFR2MEnCvQp4806SFeT8AL5l+dvm9f564TS+YogRdGLTMB8D8jlzhqudyyMPTfLg+Ar7zyipxzzjmy//77S4cOHeSGG26Q3NzcoOufAgqaigkRSBwBPRlfL/AIJwBqO07mT5xtHacl1fNOPeecTl6xKU5L4e6Ppf92eQD805/+JIMGDZKcnBxZvHixXH311XLsscfK9u3bg/pmUEBBMTERAokloLd6CWaPX2vTlJUl1vqytDEXWFpcYfb+nfL8OKnZybN/Y74BRIT+2+UBsHnRbd68WVJSUmTatGnN3wr4mgIKyMJIBBJbQO/z11q4C2a8tmdAoA2BtzJyTQB8ZMiCNqbirWgK0H8TAP3qKz8/3wTAZcuW+Y1v7QUF1JoM4xFIYAH2ACbwxkuMRb/ynWkmAH6fvS4xFjgJl5L+mwDoLevGxka55ppr5MILL/SOa/5LbW2t2W2shaM/xcXFJjDq7wwIIJAkAmGeA6gXjTRxDmCSFEH0VmNN6XYT/n7dbaxUVO+M3gcx5zYFCIAEQG+B/P3vf5eOHTuaUOcd2eyXtLQ0E/j0MLHvDwGwGRQvEUh0AX3CR4gXgTSm7CYDbn1SctZXJPras/xRFBgwbZUJgHcNnBPFT2HW7QkQAAmApkYee+wxOeaYY2T16tVt1gx7ANvk4U0EkkdA7+en9/Vr7ybQP58T2Ljbr6R6j72ly5Nfy3GpY6TX6OWyvbY+eTxYk4gJ3PLzowe/mMW5ohFDDWNGBECXB0C9FF/D31FHHSV5eXkhlxAFFDIZDRBIHAF9vJs+4aO9EPjzk0C2fjdaHh2y0Ozd0ac76G0+vllQzE1+E2eLR31JN1fWSqfUMaZG1pfviPrn8QGtC9B/uzwAPvLII3LQQQfJ1KlTZePGjd6fHTuC+2JSQK1/uXgHgaQQCPZZwBkZ3tWdklsiF70+2RsE9YT/ics3if6Hk8HdAsPmFpq6uO796e6GcMDa03+7PAD6nsfn+7veGzCYgQIKRolpEEhwAT0crE/40As8fG8Do691fEXLc/521DVIv8wC77NedY+gHvrjma8JXguWi3//oHkmAL4/OfQjTpYfTfNmAvTfLg+Azeoh5JcUUMhkNEAgcQV0D57e5Fnv86f/BrFHT6/yfDV9hZzYI927R1Dv/bahIrijDImLxZI3F6iqrZfOP9fByk2Vzd/mdYwF6L8JgFYlRwFZ8dEYAdcIbKyokdRvl5oLRHRvoD4B4qOpBVJX3+gaA7ev6KhF68x/Ai55YwqnAzigGOi/CYBWZUgBWfHRGAHXCSxfv80cCtYQqD+XvT1VZhXw6LhkL4SdDY3S9a1Ms83fzgj+efPJ7hLP9aP/JgBa1R8FZMVHYwRcKdDY2CQjFxTL2b0meA8LP/X1IimrqnWlhxtW+rMZq8221m1eWcPNn52wzem/CYBWdUgBWfHRGAFXC+j5gc+PWu
a9LciZPTNk+PwiDg8mWVWUV9fJGS9mmAA4dE5hkq1d4q4O/TcB0Kp6KSArPhojgICILC4ql6v6ZHn3Bt7x8SxZtbkKmyQRSPshx2zbP707jXtCOmib0n8TAK3KkQKy4qMxAgj8LFDf0CgfTyuQk57bdbVw5+7p8sb4FVKxg8OFiVwkBZurRJ/5q+d7Ts8rTeRVSbplp/8mAFoVNQVkxUdjBBBoJlC0pVru+XSud29gl7Tx8sGUfB4r18wpUV4+8PN9/x78fF6iLLJrlpP+mwBoVewUkBUfjRFAIICAPjFkfM5G0SeIeK4W/u1LE+ST6aulZmdDgBaMcqJAVt5ms/10DyCH9J23hei/CYBWVUkBWfHRGAEE2hBoaGwSvXec3jfOEwTP6z1R9IpSgmAbcA54Sw/pewL8iz/mOGCJWITmAvTfBMDmNRHSawooJC4mRgCBMAT0HnL6DNnzX5nkDYLnvjyRPYJhWMaqycCsVWZb6ZXdehUwg/ME6L8JgFZVSQFZ8dEYAQRCEKitb5AvZ6+VC16d7A2Cv31pomjY0GcPMzhDYNJPm7xPfBk8e60zFoqlaCFA/00AbFEUoYyggELRYloEEIiEgD4+Tu8n5x8EJ8iAaaukuq4+Eh/BPMIUWFpcISc/N84E9P+MXMI9HcN0jEUz+m8CoFWdUUBWfDRGAAELAQ2CzY3rMAAAFkdJREFUemj4wtd+2SOoT5rQZwwTBC1gw2y6rnyHnPPyRBP+7v5kjuihewbnCtB/EwCtqpMCsuKjMQIIREBAg8bweUXyh9d/uVjkN70mSL/MAm4fEwHfYGaxrWanXPHOVBP+9IbP+prB2QL03wRAqwqlgKz4aIwAAhEU0CA4Yn6RXOxz1fBZPTPMfQSrajk0HEFqv1npnti/DJxtwp9enLO+fIff+7xwpgD9NwHQqjIpICs+GiOAQBQE9BYk3ywolkvfzDShRG8ho1ej9p2UxxWpEfbW0P3Y0IXG+dTnx8mydRUR/gRmFy0B+m8CoFVtUUBWfDRGAIEoCmgQ/D57nXR965cgqBcovDBqmRSWVUfxk90xa70Xoz7hQwP2Cd3HypTcEneseJKsJf03AdCqlCkgKz4aI4BADAQ8N5S+qk+Wd4/gcalj5JEhC2Rh4dYYLEHyfYQeUr/z412HfU/skU74S8BNTP9NALQqWwrIio/GCCAQQwF9xNyM/FK/Zw3r3qvr3p8uw+cX8XSRILeF3tj5hg9mmDB92gvjZfaqsiBbMpmTBOi/CYBW9UgBWfHRGAEE4iSwYuM2eWbEYuncPd27V/CMFzPkpdHLZXXp9jgtlfM/dnNlrehVvp7zKhcXlTt/oVnCgAL03wTAgIUR7EgKKFgppkMAAScKlFXVmtvF+N5UWsPN376YL4sIN36bLLtwq/fm23q/v9yNlX7v8yKxBOi/CYBWFUsBWfHRGAEEHCKg5wlOXrFJ7vtsrnRKHePdK6i3N5mZX+rqJ1roofPPZqw2F3poOL7kjSmyhr2kDqnc8BeD/psAGH71iAgFZMVHYwQQcKBAfkml/HP4Yjm+21hvELz+gxny7sSV8uPi9bJ8/TbXPHtYb+isF8to8NOfv3+5gJs8O7Bmw1kk+m8CYDh1421DAXkp+AUBBJJMoHhrtblljF7l6glAvv/qYePb+s+SR4culBd/zDGHkvX+g3roWG+OnOiDPtdX9/bpOuttXnQvoO4NZEgOAfpvAqBVJVNAVnw0RgCBBBDQCx8GZq2Sf49cLDf3m2luKu0bBAP93rlHuplWLyoZs2SDlFbVJsCa7lpEvbBDz4H0rJcGXT3/jyG5BOi/CYBWFU0BWfHRGAEEElRgy/Y6mb9mi4xesl4+nb5aXk1fIU8PX2QeiaaPn/OEJ8+/ejj5gUHzZNyyDY7cO6h79mYVlMndn8zxLrueC/n4V9mydXtdgm4lFrstAfpvAmBb9dHuexRQu0RMgAACLhPQMKW3kvl2YbH0+H6p97YpnjCoATHthxzRQ6yxOqSqn6OHtPUcxl6jl8sTX2XL/YPmyW0fzZKr+2Z5r+7VZdSwqmFWz4VkSF4B+m8CoFV1U0BWfDRGAAGXCOSXVJm9hOe+PNG7h03Dlh5e1TCoe9/00XWRGvTiDb3p9YeZ+fLQF/NFb9viCaCt/auHrTWwFm3hMXmR2g5Ong/9NwHQqj4pICs+GiOAgMsENOTpM3P1whF9LrFvGDuzZ4Y8OSzbXGyhh5er6+rb1dFHsulNrScu32TOU9T2Xd/85dnHvvP/dbexcv37082FLZ9MXy1fzys0h7Azc0vM4Ww9rM3gHgH6bwKgVbVTQFZ8NEYAARcL7KhrkAnLN5knkgQ6b1CfV3z521PNrVf+b/ACc49CvS/hrf1nmsO2gdr4Br4LX5tsbuEyYNoqE/Bqdja4WJtVby5A/00AbF4TIb2mgELiYmIEEEAgoIDuGdTDwO9MWGkuFjmvd/uHbD1hT/ccXvNelgmKfSbmmT2M+oQTBgTaEqD/JgC2VR/tvkcBtUvEBAgggEBYAiWVNebpJHr/vcGz18rweUXyffY6Gbt0gxn/04ZtUlmzM6x50wgB+m8CoNW3gAKy4qMxAggggAACcRGg/yYAWhUeBWTFR2MEEEAAAQTiIkD/TQC0KjwKyIqPxggggAACCMRFgP6bAGhVeBSQFR+NEUAAAQQQiIsA/TcB0KrwKCArPhojgAACCCAQFwH6bwKgVeFRQFZ8NEYAAQQQQCAuAvTfBECrwqOArPhojAACCCCAQFwE6L8JgFaFRwFZ8dEYAQQQQACBuAjQfxMArQqPArLiozECCCCAAAJxEaD/JgBaFR4FZMVHYwQQQAABBOIiQP9NALQqPArIio/GCCCAAAIIxEWA/psAaFV4FJAVH40RQAABBBCIiwD9NwHQqvAoICs+GiOAAAIIIBAXAfpvAqBV4VFAVnw0RgABBBBAIC4C9N8EQKvCq6iokJSUFCkuLhYtJn4woAaoAWqAGqAGnF8D2m9r/639uFuHFLeueCTW21NAWkT8YEANUAPUADVADSRWDWg/7taBAGix5RsbG83eP/0fhP6P78QTT2yxF7C9cc3f97z2hEv9NxL/m/TMN5h5tTdta+8HGt/euObve16z/rv+d8r2T+z6D/R3wVPjnu+i72vP74lY/4HWNdA4zzqy/rv2kvl6eH5n+0f/75/22+qs/bhbBwJgBLf8Kaec0mJu7Y1r/r7ntf5x1P9J6r+RGDzzDWZe7U3b2vuBxrc3rvn7ntesP9s/Gepfv2+emvZ899p67XkvEes/0LoGGudZx0AenvdY/8T7/gfa1oHGebaxU7e/Z7nc8C8BMIJb+YMPPmgxt/bGNX/f8zrSfwA9822xgAFGtDdta+8HGt/euObve16z/pHtADyuATZ3i1HtTdva+4HGtzeu+fue18my/RXXs04e6LZee95LxPUPtK6BxnnWMZCH5z3WP/G+/4G2daBxnm3s1O3vWS43/EsAdOhWjvQfQIeuZquLxfpHtgNoFdqhb7D92f6R3APs0DJvdbGof3fXf6uFEeE3CIARB
o3U7GprayUtLU30XzcOrD/bn/rn+8/fP/7+u7H/i9U6EwBjJc3nIIAAAggggAACDhEgADpkQ7AYCCCAAAIIIIBArAQIgLGS5nMQQAABBBBAAAGHCBAAHbIhWAwEEEAAAQQQQCBWAgTAWEnzOQgggAACCCCAgEMECIAO2RAsBgIIIIAAAgggECsBAmCspKP4Oe+8846ceuqp5okDTzzxhDQ1NUXx05w169zcXDnzzDO9P3vvvbd8//33zlrIKC/N6tWr5dJLLzXb//TTT5ft27dH+ROdNfuOHTtKly5dTA2ogxuH6upqOfbYY+WZZ55x1eqXl5fLb3/7W7PtTzvtNBkwYICr1r+oqEguueQS893X78CIESNctf66sjfeeKMcfPDBcsstt7hu3W1XmABoKxjn9ps3b5bjjz9eampqpKGhQS644AKZNWtWnJcqPh9fVVUlhx12mOsC0MUXXyxZWVkGfcuWLVJfXx+fDRCnT9UAqNvezUP37t3l9ttvd10A1L95Gn510P/4dOrUScrKylxTChs2bJBFixaZ9d24caMcddRRrvv7l5mZKT/++CMBMIyqJwCGgeakJhoA9X/++j9hDYHnnnuuFBQUOGkRY7YsQ4cONZ1gzD7QAR+Uk5Mjl112mQOWJH6L4PYAmJeXJzfffLMMGjTIdQHQt+r0Pz9aC6Wlpb6jXfX7GWecIbpX0G2DhkD2AIa+1QmAoZuF1GLatGly7bXXypFHHin6aKNAhyf12Yj6h2uvvfaS8847T+bOnRvSZ7z33ntywAEHyCGHHCLdunULqW20J47F+nvW4YYbbpBvv/3W89IR/0Z7/bWedL21xn7zm99I7969HbHenoWI9vrr5+hen7PPPlvOOeccGTJkiOejHfFvLNb/+uuvl5UrVzoyAMZi/fU/vxp89tlnnxbPXY53EcRi/T3ruGDBAtHD4E4aYrX+BMDwtjoBMDy3oFulp6dLjx495LvvvgsYAL/++mvZc8895bPPPpPly5fLQw89ZM5nKCkp8X6GnuOmX+zmP+vXr5etW7fKlVdeKfq/3x07dpjzQfRL55Qh2uvvWU99dmaHDh3MXlDPOCf8G+31HzlypBx66KHmf/362Cw9B27ChAlOWHWzDNFef/2QdevWmc/Sw2F6LuySJUtcs/6jRo2Sf/3rX2Z9nbgHMBbb37OxN23aZE6B0X+dMsRq/fXvv9b+zJkznbLqZjlitf4EwPA2OwEwPLewWgXaA6h7/B577DHv/BobG815HK+++qp3XFu/6Em/jz76qHeSN954Q15//XXvayf9Eo3196zf4MGD5a677vK8dOS/0Vh/Pd9T/wPgGXT7648Th2isf/P11DCkQciJQzTWPzU1VY455hhzBEHPfz3wwAOlZ8+eTlz9gP8Btv3713xFH3nkEdH/FDlxiMb21/XU//j94Q9/EP0b6OQhWuuv60wADG/LEwDDcwurVfMvQF1dney+++4tDgvfc889ood1ghlmz54tZ511lvcikKuvvlp0r4ATh2isv2c99RCongjs5CEa668XfOj21z3B+p8HdRg9erQjGaKx/nrif2VlpVlfvRBEDwXPmzfPNevvu6JO3APou3zR2P66t8+z/SsqKsxRkqVLl/p+rGN+j8b66x0f7rzzTklLS3PMera2INFYf89nEQA9EqH9SwAMzctq6uZfAD2Eq+OaX7X773//25wLGOyH6RWAJ598sjkE4OTbwERr/fUP/xFHHCEaqJ08RGv99TCL3v5FTxF4+umnHUsQjfVftWqVOf9LzwHT9e/Tp4+r1t93ZRMtAEbi75+eL62nyOj219ugfPTRR74kjvo9GvU/ffp02W233by3wVKLRAnAkdj+uoH1IrjDDz/cnAN69NFHt+hPHVUEDlsYAmAMN0g0/gDEcPGtP4r1978IKFJ/AK03TIxmwPZn+/teBEf9R2YHQIy+vtYf4/bvvzVgFGZAAIwCamuzbP4FiMQh4NY+y4njWX//AMD2tz8Fwol13toyUf/Uv28A5vvvru9/a38X4jmeABhD/eYdgH60ngT9+OOPe5dCz+PS3djBXgTibZgAv7D+/h0g25/65/vP3z/+/ruj/3NiF00AjPJW0RPT9U7t+qMBSB/bpr8XFhaaT9bbwOj9/z7//HP56aef5OGHHza3gXHSrQxsiFh/tj/1z/efv3/8/Xdj/2fTd8aiLQEwysp6dZIWfvOfe++91/vJ77//vnmah94PUPcIzJkzx/teov/C+rP9m9e+vqb++f57/rbx94+///o0q2Ts/zw17tR/CYBO3TIsFwIIIIAAAgggECUBAmCUYJktAggggAACCCDgVAECoFO3DMuFAAIIIIAAAghESYAAGCVYZosAAggggAACCDhVgADo1C3DciGAAAIIIIAAAlESIABGCZbZIoAAAggggAACThUgADp1y7BcCCCAAAIIIIBAlAQIgFGCZbYIIIAAAggggIBTBQiATt0yLBcCCCCAAAIIIBAlAQJglGCZLQIIJIZAx44d5d13302MhWUpEUAAgQgJEAAjBMlsEECgdQF99NsNN9zQ+gRxfGfz5s1SXV0dxyVo+6OdbNf2kvMuAgg4WYAA6OStw7IhkCQC8QgxO3fudLResMsXDztHw7FwCCAQEQECYEQYmQkCCLQl0F6IWbZsmVx11VWy3377yRFHHCF33323lJaWemc5btw4ufDCC+Wggw6SQw89VK655hopKCjwvr9mzRpJSUmRr7/+Wi6++GLZa6+9ZNCgQeL53DfffFP+53/+x7R99NFHxTd8NT8ErPMZOHCg3HjjjbLPPvvICSecID/88IP3s/QXfa3j9XMuvfRS+fzzz83nl5eX+03n+0Ln269fP7nuuutk3333lbS0NGloaJAHHnhAOnXqJHvvvbeceOKJ0qdPH28znUbb+f5kZmaa94uKiuS2224zJocccohcf/31og4MCCCAQDACBMBglJgGAQSsBDxBLNBMNDR16NBBunXrJitWrJDs7Gy54oorpGvXrt7Jv/nmG/n2228lPz9fFi1aZEJUly5dpLGx0UzjCYAapHS61atXy4YNG0wAPPDAA+Xvf/+7mffo0aNN+BowYIB33oEC4DHHHCNfffWV+bx//OMfsv/++8uWLVtMG533HnvsIf/6178kNzdXhg0bJkcffXRQAVDD7WeffSarVq2SwsJCE0RfeOEFmT9/vlnmIUOGmOUbPny4+ayqqiq5/fbbTTjeuHGj6E9dXZ1pd8opp5jwuHTpUvnpp5/kL3/5i5x00knmfe/K8QsCCCDQigABsBUYRiOAQOQE2gqAL730klx55ZV+H1ZcXGwC1cqVK/3Ge17o3kHdK6Z7DnXwBEDfvWc6Xj9XA57uafMMutfsjjvu8Lw07/teBKLzfe6557zvb9++3XyW7oXU4dlnn5XTTz/d+77+0qNHj6AC4FNPPeXXLtCLxx57TG655RbvW4HsvvzySxP2mpqavNNpMNQ9lhkZGd5x/IIAAgi0JkAAbE2G8QggEDGBQCHGM/Nbb73V7FHTw7++PxrE0tPT
zWR5eXly5513ynHHHScHHHCAmU7fHzt2rHnfEwBnzJjhma35Vz/36quv9hune/R89y4G2gM4YsQIvza6F/GLL74w4/TQ8P333+/3vh4S1uVp7xCw7uFrPnzwwQdy9tlny+GHH27WS/cunnvuud7JAtnp3sfdd9/dz0vtdtttN3OY2duYXxBAAIFWBAiArcAwGgEEIicQKMR45q7n/t18883mcKse4vX90b1vOuihTd1LOGnSJHO4MycnxwSu77//3rzvCYB6eNh3CPS5Tz75pFxyySXeyQIFQM98PRPpuYd6TqEONgGw+Xz18LGe+/fhhx+aQ9+67g8//LCceeaZno/2nsfoHSFiDmmfd955flYet4qKCt9J+R0BBBAIKEAADMjCSAQQiKRAoCDmmX/37t1NwKuvr/eM8vu3rKzMhL2srCzv+OnTp8ctAOohYD3/0HfQQ8bB7AFsHgAff/xx+eMf/+g7K7nsssv8AuBDDz0k1157rd80eg6jXvixbds2v/G8QAABBIIVIAAGK8V0CCAQtoAGQL1aVvfQ+f7olazr1683F4HooeB58+aZq3vHjx8v9913nzl3Ty/0OOyww8yVwbqXa/LkyeYQqQYuT6CK5R5Az0Ug//nPf0TPUdQLNvSiEV2etva++S6vB7Jv376ih5d1fXVeGiT1te8ewN69e8uxxx5rLjjRcx/1Cma9b2Hnzp2NqQZjXSa9OviJJ54QPX+SAQEEEGhPgADYnhDvI4CAtYAGQA1AzX8efPBBM289x++mm26Sgw8+2FzIcPLJJ4teMOG5yGHixImiV73qbVfOOOMMmTp1qplXPAKgLnDz28D079/fLE9NTU2rVoECYG1trQm6eohZ1/2RRx6R1NRUvwCoN6rWq6L1SmSdh+c2MHpF8D333GPOHVSX448/XnRvIXsFW90EvIEAAj4CBEAfDH5FAAEEwhF4+eWXzV7AcNrSBgEEEIiHAAEwHup8JgIIJLSAXrShh6v1fn6DBw82N2PWW8EwIIAAAokiQABMlC3FciKAgGME9PD0kUceaQ5J67l4vXr1ktYuYnHMQrMgCCCAgI8AAdAHg18RQAABBBBAAAE3CBAA3bCVWUcEEEAAAQQQQMBHgADog8GvCCCAAAIIIICAGwQIgG7YyqwjAggggAACCCDgI0AA9MHgVwQQQAABBBBAwA0C/w+ELQeExqjNywAAAABJRU5ErkJggg==)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tn1RV-jfOjt1" - }, - "source": [ - "# `benchmark`\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rsmTl5zfwjM3" - }, - "source": [ - "You can try to speed your system by setting `benchmark=True`, which enables cudnn.benchmark. This flag is likely to increase the speed of your system if your input sizes don’t change. This flag makes cudnn auto-tuner look for the optimal set of algorithms for the given hardware configuration. This usually leads to faster runtime.\n", - "But if your input sizes changes at each iteration, then cudnn will benchmark every time a new size appears, possibly leading to worse runtime performances." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dWr-OCBgQCeb" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=1, benchmark=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qwAvSKYGa24K" - }, - "source": [ - "# `deterministic`\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tl5mfmafwmat" - }, - "source": [ - "PyTorch does not guarantee reproducible results, even when using identical seeds. To guarentee reproducible results, you can remove most of the randomness from your process by setting the `deterministic` flag to True.\n", - "\n", - "Note that it might make your system slower." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mhv5LZ3HbNCK" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gpus=1, deterministic=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u_5eJSvTf60f" - }, - "source": [ - "# Exploding and vanishing gradients" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B6drjh4pq6Jv" - }, - "source": [ - "## track_grad_norm\n", - "\n", - "You can debug your grad norm to identify exploding or vanishing gradients using the `track_grad_norm` flag.\n", - "\n", - "Set value to 2 to track the 2-norm. or p to any p-norm." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2taHUir8rflR" - }, - "outputs": [], - "source": [ - "# track the 2-norm\n", - "trainer = pl.Trainer(track_grad_norm=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3vHKxmruk62f" - }, - "source": [ - "May be set to ‘inf’ infinity-norm." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "g7TbD6SxlAjP" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(track_grad_norm='inf')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TcMlRe7ywpe6" - }, - "source": [ - "## Gradient clipping\n", - "\n", - "\n", - "Exploding gradients refer to the problem that the gradients get too large and overflow in training, making the model unstable. Gradient clipping will ‘clip’ the gradients or cap them to a Threshold value to prevent the gradients from getting too large. To avoid this, we can set `gradient_clip_val` (default is set to 0.0).\n", - "\n", - "[when to use it, what are relevant values]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jF9JwmbOgOWF" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(gradient_clip_val=0.1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ggb4MkkQrr1h" - }, - "source": [ - "# truncated_bptt_steps\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s1Iu6PyAw9_r" - }, - "source": [ - "If you have a large recurrent model, you can use truncated_bptt_steps flag to split up the backprop over portions of the sequence. 
This flag will automatically truncate your batches and the trainer will apply Truncated Backprop to it.\n", - "\n", - "Make sure your batches have a sequence dimension.\n", - "\n", - "Lightning takes care of splitting your batch along the time-dimension.\n", - "```\n", - "# we use the second as the time dimension\n", - "# (batch, time, ...)\n", - "sub_batch = batch[0, 0:t, ...]\n", - "Using this feature requires updating your LightningModule’s pytorch_lightning.core.LightningModule.training_step() to include a hiddens arg with the hidden\n", - "\n", - "# Truncated back-propagation through time\n", - "def training_step(self, batch, batch_idx, hiddens):\n", - " # hiddens are the hiddens from the previous truncated backprop step\n", - " out, hiddens = self.lstm(data, hiddens)\n", - "\n", - " return {\n", - " \"loss\": ...,\n", - " \"hiddens\": hiddens # remember to detach() this\n", - " }\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WiTF1VMtruMU" - }, - "outputs": [], - "source": [ - "# backprop every 5 steps in a batch\n", - "trainer = pl.Trainer(truncated_bptt_steps=5)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8XI_kEWkS-nT" - }, - "source": [ - "To modify how the batch is split, override pytorch_lightning.core.LightningModule.tbptt_split_batch():\n", - "\n", - "```\n", - "class LitMNIST(LightningModule):\n", - " def tbptt_split_batch(self, batch, split_size):\n", - " # do your own splitting on the batch\n", - " return splits\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oLbEmbmupwQ8" - }, - "source": [ - "# reload_dataloaders_every_epoch\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CLdNGVv9xD_L" - }, - "source": [ - "Set to True to reload dataloaders every epoch (instead of loading just once in the beginning of training).\n", - "\n", - "```\n", - "# if False (default)\n", - "train_loader = model.train_dataloader()\n", - "for epoch in epochs:\n", - " for batch in train_loader:\n", - " ...\n", - "\n", - "# if True\n", - "for epoch in epochs:\n", - " train_loader = model.train_dataloader()\n", - " for batch in train_loader:\n", - "\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "10AXthXxp311" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(reload_dataloaders_every_epoch=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f513EYl0bmmL" - }, - "source": [ - "# Callbacks\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pt7iGh4xNs5" - }, - "source": [ - "\n", - "Lightning Callbacks are self-contained programs that can be reused across projects.\n", - "Callbacks should capture NON-ESSENTIAL logic that is NOT required for your LightningModule to run. Lightning includes some a few built-in callbacks that can be used with flags like early stopping and Model Checkpointing, but you can also create your own callbacks to add any functionality to your models.\n", - "\n", - "The callback API includes hooks that allow you to add logic at every point of your training:\n", - "setup, teardown, on_epoch_start, on_epoch_end, on_batch_start, on_batch_end, on_init_start, on_keyboard_interrupt etc. 
\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1t84gvDNsUuh" - }, - "source": [ - "## callbacks\n", - "\n", - "Use **callbacks=** to pass a list of user defined callbacks. These callbacks DO NOT replace the built-in callbacks (loggers or EarlyStopping). \n", - "\n", - "In this example, we create a dummy callback that prints a message when training starts and ends, using on_train_start and on_train_end hooks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oIXZYabub3f0" - }, - "outputs": [], - "source": [ - "from pytorch_lightning.callbacks import Callback\n", - "\n", - "class PrintCallback(Callback):\n", - " def on_train_start(self, trainer, pl_module):\n", - " print(\"Training is started!\")\n", - " def on_train_end(self, trainer, pl_module):\n", - " print(\"Training is done.\")\n", - "\n", - "# a list of callbacks\n", - "callbacks = [PrintCallback()]\n", - "trainer = pl.Trainer(callbacks=callbacks)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cNF74CLYfJJu" - }, - "source": [ - "# Model checkpointing\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2blgquBrxLtS" - }, - "source": [ - "Checkpoints capture the exact value of all parameters used by a model.\n", - "\n", - "Checkpointing your training allows you to resume a training process in case it was interrupted, fine-tune a model or use a pre-trained model for inference without having to retrain the model.\n", - "\n", - "Lightning automates saving and loading checkpoints so you restore a training session, saving all the required parameters including: \n", - "* 16-bit scaling factor (apex)\n", - "* Current epoch\n", - "* Global step\n", - "* Model state_dict\n", - "* State of all optimizers\n", - "* State of all learningRate schedulers\n", - "* State of all callbacks\n", - "* The hyperparameters used for that model if passed in as hparams (Argparse.Namespace)\n", - "\n", - "By default Lightning will save a checkpoint in the working directory, which will be updated every epoch.\n", - "\n", - "### Automatic saving\n", - "By default Lightning will save a checkpoint in the end of the first epoch in the working directory, which will be updated every epoch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XGu0JULrg9l7" - }, - "outputs": [], - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(default_root_dir=os.getcwd())\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3s9OjkGuhq1W" - }, - "source": [ - "To change the checkpoint path pass in **default_root_dir=**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DgdxkrIQhvfw" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(default_root_dir='/your/path/to/save/checkpoints')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Qyvj_bkWrJiE" - }, - "source": [ - "\n", - "You can also have Lightning update your checkpoint based on a specific metric that you are logging (using self.log), by passing the key to `monitor=`. 
For example, if we want to save checkpoint based on the validation loss, logged as `val_loss`, you can pass:\n", - "\n", - "\n", - "```\n", - "checkpoint_callback = ModelCheckpoint(\n", - " filepath=os.getcwd(),\n", - " save_top_k=1,\n", - " verbose=True,\n", - " monitor='val_loss',\n", - " mode='min',\n", - " prefix=''\n", - ")\n", - "```\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YzYMivw1rO1O" - }, - "outputs": [], - "source": [ - "from pytorch_lightning.callbacks import ModelCheckpoint\n", - "\n", - "trainer = pl.Trainer(callbacks=[ModelCheckpoint(monitor='val_loss')])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5hYs_FV8iDMn" - }, - "source": [ - "You can modify the behavior of checkpointing by creating your own callback, and passing it to the trainer. \n", - "You can control\n", - "* filepath- where logs are saved\n", - "* save_top_k- save k top models\n", - "* verbose\n", - "* monitor- the metric to monitor\n", - "* mode\n", - "* prefix\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Tb1K2VYDiNTu" - }, - "outputs": [], - "source": [ - "from pytorch_lightning.callbacks import ModelCheckpoint\n", - "\n", - "# DEFAULTS used by the Trainer\n", - "checkpoint_callback = ModelCheckpoint(\n", - " filepath=os.getcwd(),\n", - " save_top_k=3,\n", - " verbose=True,\n", - " monitor='val_loss',\n", - " mode='min',\n", - " prefix='',\n", - ")\n", - "\n", - "trainer = Trainer(callbacks=[checkpoint_callback])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YKhZ6xRojJcl" - }, - "source": [ - "You can disable checkpointing it by passing\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Yt8zd2ZFjOXX" - }, - "outputs": [], - "source": [ - "trainer = Trainer(checkpoint_callback=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HcLy8asCjrj9" - }, - "source": [ - "### Manual saving\n", - "\n", - "You can manually save checkpoints and restore your model from the checkpointed state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kZSkMJf0jR4x" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.save_checkpoint(\"example.ckpt\")\n", - "new_model = LitAutoEncoder.load_from_checkpoint(checkpoint_path=\"example.ckpt\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X2d9cjVPj7CP" - }, - "source": [ - "### Checkpoint Loading\n", - "To load a model along with its weights, biases and module_arguments use following method:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BpAFfg5zkFmH" - }, - "outputs": [], - "source": [ - "model = LitAutoEncoder.load_from_checkpoint(PATH)\n", - "\n", - "print(model.learning_rate)\n", - "# prints the learning_rate you used in this checkpoint\n", - "\n", - "model.eval()\n", - "y_hat = model(x)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jTQ3mxSJkhFN" - }, - "source": [ - "But if you don’t want to use the values saved in the checkpoint, pass in your own here" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IoMcOh9-kfUP" - }, - "outputs": [], - "source": [ - "class LitAutoEncoder(LightningModule):\n", - "\n", - " def __init__(self, in_dim, out_dim):\n", - " 
super().__init__()\n", - " self.save_hyperparameters()\n", - " self.l1 = nn.Linear(self.hparams.in_dim, self.hparams.out_dim)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ITPVY8mNknut" - }, - "source": [ - "you can restore the model like this\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H7XeRJzVkuY8" - }, - "outputs": [], - "source": [ - "# if you train and save the model like this it will use these values when loading\n", - "# the weights. But you can overwrite this\n", - "LitAutoEncoder(in_dim=32, out_dim=10)\n", - "\n", - "# uses in_dim=32, out_dim=10\n", - "model = LitAutoEncoder.load_from_checkpoint(PATH)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "14WwGpnVk0a4" - }, - "outputs": [], - "source": [ - "# uses in_dim=128, out_dim=10\n", - "model = LitAutoEncoder.load_from_checkpoint(PATH, in_dim=128, out_dim=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bY5s6wP_k1CU" - }, - "source": [ - "\n", - "\n", - "## Restoring Training State (resume_from_checkpoint)\n", - "If your training was cut short for some reason, you can resume exactly from where you left off using the `resume_from_checkpoint` flag, which will automatically restore model, epoch, step, LR schedulers, apex, etc..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9zfhHtyrk3rO" - }, - "outputs": [], - "source": [ - "model = LitAutoEncoder()\n", - "trainer = pl.Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')\n", - "\n", - "# automatically restores model, epoch, step, LR schedulers, apex, etc...\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xkKdvALFsmT2" - }, - "source": [ - "## weights_save_path\n", - "You can specify a directory for saving weights file using `weights_save_path`.\n", - "\n", - "(If you are using a custom checkpoint callback, the checkpoint callback will override this flag)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9OwHHFcCsrgT" - }, - "outputs": [], - "source": [ - "# save to your custom path\n", - "trainer = pl.Trainer(weights_save_path='my/path')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PbNtlJ9Wsscf" - }, - "outputs": [], - "source": [ - "# if checkpoint callback used, then overrides the weights path\n", - "# **NOTE: this saves weights to some/path NOT my/path\n", - "checkpoint = ModelCheckpoint(filepath='some/path')\n", - "trainer = pl.Trainer(\n", - " callbacks=[checkpoint],\n", - " weights_save_path='my/path'\n", - ")\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uDdxCuyHdWQt" - }, - "source": [ - "# Early stopping\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fqAy3ihRxTfR" - }, - "source": [ - "The EarlyStopping callback can be used to monitor a validation metric and stop the training when no improvement is observed, to help you avoid overfitting.\n", - "\n", - "To enable Early Stopping you can init the EarlyStopping callback, and pass it to `callbacks=` trainer flag. 
The callback will look for a logged metric to early stop on.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lFx976CheH93" - }, - "outputs": [], - "source": [ - "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", - "\n", - "trainer = pl.Trainer(callbacks=[EarlyStopping('val_loss')])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MwpJfTvjeOwF" - }, - "source": [ - "You can customize the callback using the following params:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "V6I9h6HteK2U" - }, - "outputs": [], - "source": [ - "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", - "\n", - "early_stop_callback = EarlyStopping(\n", - " monitor='val_accuracy',\n", - " min_delta=0.00,\n", - " patience=3,\n", - " verbose=False,\n", - " mode='max'\n", - ")\n", - "trainer = pl.Trainer(callbacks=[early_stop_callback])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7TAIerPYe_Q1" - }, - "source": [ - "The EarlyStopping callback runs at the end of every validation check, which, under the default configuration, happens after every training epoch. However, the frequency of validation can be modified by setting various parameters on the Trainer, for example check_val_every_n_epoch and val_check_interval. It must be noted that the patience parameter counts the number of validation checks with no improvement, and not the number of training epochs. Therefore, with parameters check_val_every_n_epoch=10 and patience=3, the trainer will perform at least 40 training epochs before being stopped." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VoKrX2ENh9Fg" - }, - "source": [ - "# Logging" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-CQTPKd7iKLm" - }, - "source": [ - "Lightning has built in integration with various loggers such as TensorBoard, wandb, commet, etc.\n", - "\n", - "\n", - "You can pass any metrics you want to log during training to `self.log`, such as loss or accuracy. Similarly, pass in to self.log any metric you want to log during validation step.\n", - "\n", - "These values will be passed in to the logger of your choise. 
simply pass in any supported logger to logger trainer flag.\n", - "\n", - "\n", - "\n", - "Use the as`logger=` trainer flag to pass in a Logger, or iterable collection of Loggers, for experiment tracking.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ty5VPS3AiS8L" - }, - "outputs": [], - "source": [ - "from pytorch_lightning.loggers import TensorBoardLogger\n", - "\n", - "# default logger used by trainer\n", - "logger = TensorBoardLogger(\n", - " save_dir=os.getcwd(),\n", - " version=1,\n", - " name='lightning_logs'\n", - ")\n", - "trainer = pl.Trainer(logger=logger)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jc5oWNpoiuuc" - }, - "source": [ - "Lightning supports the use of multiple loggers, just pass a list to the Trainer.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BlYwMRRyivp_" - }, - "outputs": [], - "source": [ - "from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger\n", - "logger1 = TensorBoardLogger('tb_logs', name='my_model')\n", - "logger2 = TestTubeLogger('tb_logs', name='my_model')\n", - "trainer = pl.Trainer(logger=[logger1, logger2])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7EyspQPh7iQ" - }, - "source": [ - "## flush_logs_every_n_steps\n", - "\n", - "Use this flag to determine when logging to disc should happen." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Em_XvsmyiBbk" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(flush_logs_every_n_steps=100)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_vDeKE98qsl1" - }, - "source": [ - "## log_every_n_steps\n", - "How often to add logging rows (does not write to disk)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HkqD7D_0w1Tt" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(log_every_n_steps=1000)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9uw0gfe422CT" - }, - "source": [ - "# info logging" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dQXpt0aatDGo" - }, - "source": [ - "### default_root_dir\n", - "\n", - "---\n", - "\n", - "\n", - "\n", - "Default path for logs and weights when no logger or pytorch_lightning.callbacks.ModelCheckpoint callback passed. On certain clusters you might want to separate where logs and checkpoints are stored. If you don’t then use this argument for convenience. Paths can be local paths or remote paths such as s3://bucket/path or ‘hdfs://path/’. Credentials will need to be set up to use remote filepaths." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CMmID2Bts5W3" - }, - "source": [ - "## weights_summary\n", - "Prints a summary of the weights when training begins. Default is set to `top`- print summary of top level modules.\n", - "\n", - "Options: ‘full’, ‘top’, None." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KTl6EdwDs6j2" - }, - "outputs": [], - "source": [ - "\n", - "# print full summary of all modules and submodules\n", - "trainer = pl.Trainer(weights_summary='full')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R57cSLl9w9ma" - }, - "outputs": [], - "source": [ - "# don't print a summary\n", - "trainer = Trainer(weights_summary=None)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bSc2hU5AotAP" - }, - "source": [ - "# progress bar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GgvbyDsBxcH6" - }, - "source": [ - "## process_position\n", - "\n", - "Orders the progress bar. Useful when running multiple trainers on the same node.\n", - "\n", - "(This argument is ignored if a custom callback is passed to callbacks)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6ekz8Es8owDn" - }, - "outputs": [], - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(process_position=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "itivQFgEphBU" - }, - "source": [ - "## progress_bar_refresh_rate\n", - "\n", - "How often to refresh the progress bar (in steps). In notebooks, faster refresh rates (lower number) is known to crash them because of their screen refresh rates, so raise it to 50 or more." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GKe6eVxmplL5" - }, - "outputs": [], - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(progress_bar_refresh_rate=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8rDHJOJbxNtf" - }, - "outputs": [], - "source": [ - "# disable progress bar\n", - "trainer = Trainer(progress_bar_refresh_rate=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NCNvYLwjpWne" - }, - "source": [ - "# profiler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pRknrG_zpY6M" - }, - "outputs": [], - "source": [ - "# to profile standard training events\n", - "trainer = pl.Trainer(profiler=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ji6aWpU73kMM" - }, - "source": [ - "You can also use Lightning AdvancedProfiler if you want more detailed information about time spent in each function call recorded during a given action. The output is quite verbose and you should only use this if you want very detailed reports.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "layG55pt316C" - }, - "outputs": [], - "source": [ - "from pytorch_lightning.profiler import AdvancedProfiler\n", - "\n", - "trainer = Trainer(profiler=AdvancedProfiler())\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "

Congratulations - Time to Join the Community!

\n", - "
\n", - "\n", - "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", - "\n", - "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", - "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", - "\n", - "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", - "\n", - "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)!\n", - "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", - "\n", - "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", - "\n", - "* Please, star [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "\n", - "### Contributions !\n", - "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", - "\n", - "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* [Bolt good first issue](https://github.com/PyTorchLightning/lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* You can also contribute your own notebooks with useful examples !\n", - "\n", - "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", - "\n", - "" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "05-trainer-flags-overview.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/06-mnist-tpu-training.ipynb b/notebooks/06-mnist-tpu-training.ipynb deleted file mode 100644 index ba5ebc98134cc..0000000000000 --- a/notebooks/06-mnist-tpu-training.ipynb +++ /dev/null @@ -1,368 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "06-mnist-tpu-training.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "TPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "WsWdLFMVKqbi" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qXO1QLkbRXl0" - }, - "source": [ - "# TPU training with PyTorch Lightning ⚡\n", - "\n", - "In this notebook, we'll train a model on TPUs. 
Changing one line of code is all you need to that.\n", - "\n", - "The most up to documentation related to TPU training can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/advanced/tpu.html).\n", - "\n", - "---\n", - "\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)\n", - " - Ask a question on our [GitHub Discussions](https://github.com/PyTorchLightning/pytorch-lightning/discussions/)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UmKX0Qa1RaLL" - }, - "source": [ - "### Setup\n", - "\n", - "Lightning is easy to install. Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "vAWOr0FZRaIj" - }, - "source": [ - "! pip install pytorch-lightning -qU" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zepCr1upT4Z3" - }, - "source": [ - "### Install Colab TPU compatible PyTorch/TPU wheels and dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AYGWh10lRaF1" - }, - "source": [ - "! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "SNHa7DpmRZ-C" - }, - "source": [ - "import torch\n", - "from torch import nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import random_split, DataLoader\n", - "\n", - "# Note - you must have torchvision installed for this example\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rjo1dqzGUxt6" - }, - "source": [ - "### Defining The `MNISTDataModule`\n", - "\n", - "Below we define `MNISTDataModule`. You can learn more about datamodules in [docs](https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html) and [datamodule notebook](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/notebooks/02-datamodules.ipynb)." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "pkbrm3YgUxlE" - }, - "source": [ - "class MNISTDataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './'):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # self.dims is returned when you call dm.size()\n", - " # Setting default dims here because we know them.\n", - " # Could optionally be assigned dynamically in dm.setup()\n", - " self.dims = (1, 28, 28)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=32)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nr9AqDWxUxdK" - }, - "source": [ - "### Defining the `LitModel`\n", - "\n", - "Below, we define the model `LitMNIST`." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "YKt0KZkOUxVY" - }, - "source": [ - "class LitModel(pl.LightningModule):\n", - " \n", - " def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " self.save_hyperparameters()\n", - "\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " self.log('train_loss', loss, prog_bar=False)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)\n", - " return optimizer" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Uxl88z06cHyV" - }, - "source": [ - "### TPU Training\n", - "\n", - "Lightning supports training on a single TPU core or 8 TPU cores.\n", - "\n", - "The Trainer parameters `tpu_cores` defines how many TPU cores to train on (1 or 8) / Single TPU core to train on [1].\n", - "\n", - "For Single TPU training, Just pass the TPU core ID [1-8] in a list. Setting `tpu_cores=[5]` will train on TPU core ID 5." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UZ647Xg2gYng" - }, - "source": [ - "Train on TPU core ID 5 with `tpu_cores=[5]`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "bzhJ8g_vUxN2" - }, - "source": [ - "# Init DataModule\n", - "dm = MNISTDataModule()\n", - "# Init model from datamodule's attributes\n", - "model = LitModel(*dm.size(), dm.num_classes)\n", - "# Init trainer\n", - "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=[5])\n", - "# Train\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "slMq_0XBglzC" - }, - "source": [ - "Train on single TPU core with `tpu_cores=1`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "31N5Scf2RZ61" - }, - "source": [ - "# Init DataModule\n", - "dm = MNISTDataModule()\n", - "# Init model from datamodule's attributes\n", - "model = LitModel(*dm.size(), dm.num_classes)\n", - "# Init trainer\n", - "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=1)\n", - "# Train\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_v8xcU5Sf_Cv" - }, - "source": [ - "Train on 8 TPU cores with `tpu_cores=8`. You might have to restart the notebook to run it on 8 TPU cores after training on single TPU core." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "EFEw7YpLf-gE" - }, - "source": [ - "# Init DataModule\n", - "dm = MNISTDataModule()\n", - "# Init model from datamodule's attributes\n", - "model = LitModel(*dm.size(), dm.num_classes)\n", - "# Init trainer\n", - "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=8)\n", - "# Train\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m2mhgEgpRZ1g" - }, - "source": [ - "\n", - "

Congratulations - Time to Join the Community!

\n", - "
\n", - "\n", - "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", - "\n", - "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", - "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", - "\n", - "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", - "\n", - "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)!\n", - "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", - "\n", - "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", - "\n", - "* Please, star [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "\n", - "### Contributions !\n", - "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", - "\n", - "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* [Bolt good first issue](https://github.com/PyTorchLightning/lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* You can also contribute your own notebooks with useful examples !\n", - "\n", - "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", - "\n", - "" - ] - } - ] -} diff --git a/notebooks/07-cifar10-baseline.ipynb b/notebooks/07-cifar10-baseline.ipynb deleted file mode 100644 index c96f473c4bacf..0000000000000 --- a/notebooks/07-cifar10-baseline.ipynb +++ /dev/null @@ -1,394 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "07-cifar10-baseline.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "qMDj0BYNECU8" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ECu0zDh8UXU8" - }, - "source": [ - "# PyTorch Lightning CIFAR10 ~94% Baseline Tutorial ⚡\n", - "\n", - "Train a Resnet to 94% accuracy on Cifar10!\n", - "\n", - "Main takeaways:\n", - "1. Experiment with different Learning Rate schedules and frequencies in the configure_optimizers method in pl.LightningModule\n", - "2. 
Use an existing Resnet architecture with modifications directly with Lightning\n", - "\n", - "---\n", - "\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HYpMlx7apuHq" - }, - "source": [ - "### Setup\n", - "Lightning is easy to install. Simply `pip install pytorch-lightning`.\n", - "Also check out [bolts](https://github.com/PyTorchLightning/lightning-bolts/) for pre-existing data modules and models." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ziAQCrE-TYWG" - }, - "source": [ - "! pip install pytorch-lightning lightning-bolts -qU" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "L-W_Gq2FORoU" - }, - "source": [ - "# Run this if you intend to use TPUs\n", - "# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py\n", - "# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "wjov-2N_TgeS" - }, - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "from torch.optim.lr_scheduler import OneCycleLR\n", - "from torch.optim.swa_utils import AveragedModel, update_bn\n", - "import torchvision\n", - "\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning.callbacks import LearningRateMonitor\n", - "from pytorch_lightning.metrics.functional import accuracy\n", - "from pl_bolts.datamodules import CIFAR10DataModule\n", - "from pl_bolts.transforms.dataset_normalizations import cifar10_normalization" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "54JMU1N-0y0g" - }, - "source": [ - "pl.seed_everything(7);" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FA90qwFcqIXR" - }, - "source": [ - "### CIFAR10 Data Module\n", - "\n", - "Import the existing data module from `bolts` and modify the train and test transforms." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "S9e-W8CSa8nH" - }, - "source": [ - "batch_size = 32\n", - "\n", - "train_transforms = torchvision.transforms.Compose([\n", - " torchvision.transforms.RandomCrop(32, padding=4),\n", - " torchvision.transforms.RandomHorizontalFlip(),\n", - " torchvision.transforms.ToTensor(),\n", - " cifar10_normalization(),\n", - "])\n", - "\n", - "test_transforms = torchvision.transforms.Compose([\n", - " torchvision.transforms.ToTensor(),\n", - " cifar10_normalization(),\n", - "])\n", - "\n", - "cifar10_dm = CIFAR10DataModule(\n", - " batch_size=batch_size,\n", - " train_transforms=train_transforms,\n", - " test_transforms=test_transforms,\n", - " val_transforms=test_transforms,\n", - ")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SfCsutp3qUMc" - }, - "source": [ - "### Resnet\n", - "Modify the pre-existing Resnet architecture from TorchVision. The pre-existing architecture is based on ImageNet images (224x224) as input. So we need to modify it for CIFAR10 images (32x32)." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "GNSeJgwvhHp-" - }, - "source": [ - "def create_model():\n", - " model = torchvision.models.resnet18(pretrained=False, num_classes=10)\n", - " model.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", - " model.maxpool = nn.Identity()\n", - " return model" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HUCj5TKsqty1" - }, - "source": [ - "### Lightning Module\n", - "Check out the [`configure_optimizers`](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#configure-optimizers) method to use custom Learning Rate schedulers. The OneCycleLR with SGD will get you to around 92-93% accuracy in 20-30 epochs and 93-94% accuracy in 40-50 epochs. Feel free to experiment with different LR schedules from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "03OMrBa5iGtT" - }, - "source": [ - "class LitResnet(pl.LightningModule):\n", - " def __init__(self, lr=0.05):\n", - " super().__init__()\n", - "\n", - " self.save_hyperparameters()\n", - " self.model = create_model()\n", - "\n", - " def forward(self, x):\n", - " out = self.model(x)\n", - " return F.log_softmax(out, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = F.log_softmax(self.model(x), dim=1)\n", - " loss = F.nll_loss(logits, y)\n", - " self.log('train_loss', loss)\n", - " return loss\n", - "\n", - " def evaluate(self, batch, stage=None):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - "\n", - " if stage:\n", - " self.log(f'{stage}_loss', loss, prog_bar=True)\n", - " self.log(f'{stage}_acc', acc, prog_bar=True)\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " self.evaluate(batch, 'val')\n", - "\n", - " def test_step(self, batch, batch_idx):\n", - " self.evaluate(batch, 'test')\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.lr, momentum=0.9, weight_decay=5e-4)\n", - " steps_per_epoch = 45000 // batch_size\n", - " scheduler_dict = {\n", - " 'scheduler': OneCycleLR(optimizer, 0.1, epochs=self.trainer.max_epochs, steps_per_epoch=steps_per_epoch),\n", - " 'interval': 'step',\n", - " }\n", - " return {'optimizer': optimizer, 'lr_scheduler': scheduler_dict}" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "3FFPgpAFi9KU" - }, - "source": [ - "model = LitResnet(lr=0.05)\n", - "model.datamodule = cifar10_dm\n", - "\n", - "trainer = pl.Trainer(\n", - " progress_bar_refresh_rate=20,\n", - " max_epochs=40,\n", - " gpus=1,\n", - " logger=pl.loggers.TensorBoardLogger('lightning_logs/', name='resnet'),\n", - " callbacks=[LearningRateMonitor(logging_interval='step')],\n", - ")\n", - "\n", - "trainer.fit(model, cifar10_dm)\n", - "trainer.test(model, datamodule=cifar10_dm);" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lWL_WpeVIXWQ" - }, - "source": [ - "### Bonus: Use [Stochastic Weight Averaging](https://arxiv.org/abs/1803.05407) to get a boost on performance\n", - "\n", - "Use SWA from torch.optim to get a quick performance boost. 
Also shows a couple of cool features from Lightning:\n", - "- Use `training_epoch_end` to run code after the end of every epoch\n", - "- Use a pretrained model directly with this wrapper for SWA" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "bsSwqKv0t9uY" - }, - "source": [ - "class SWAResnet(LitResnet):\n", - " def __init__(self, trained_model, lr=0.01):\n", - " super().__init__()\n", - "\n", - " self.save_hyperparameters('lr')\n", - " self.model = trained_model\n", - " self.swa_model = AveragedModel(self.model)\n", - "\n", - " def forward(self, x):\n", - " out = self.swa_model(x)\n", - " return F.log_softmax(out, dim=1)\n", - "\n", - " def training_epoch_end(self, training_step_outputs):\n", - " self.swa_model.update_parameters(self.model)\n", - "\n", - " def validation_step(self, batch, batch_idx, stage=None):\n", - " x, y = batch\n", - " logits = F.log_softmax(self.model(x), dim=1)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - "\n", - " self.log(f'val_loss', loss, prog_bar=True)\n", - " self.log(f'val_acc', acc, prog_bar=True)\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.SGD(self.model.parameters(), lr=self.hparams.lr, momentum=0.9, weight_decay=5e-4)\n", - " return optimizer\n", - "\n", - " def on_train_end(self):\n", - " update_bn(self.datamodule.train_dataloader(), self.swa_model, device=self.device)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "cA6ZG7C74rjL" - }, - "source": [ - "swa_model = SWAResnet(model.model, lr=0.01)\n", - "swa_model.datamodule = cifar10_dm\n", - "\n", - "swa_trainer = pl.Trainer(\n", - " progress_bar_refresh_rate=20,\n", - " max_epochs=20,\n", - " gpus=1,\n", - " logger=pl.loggers.TensorBoardLogger('lightning_logs/', name='swa_resnet'),\n", - ")\n", - "\n", - "swa_trainer.fit(swa_model, cifar10_dm)\n", - "swa_trainer.test(swa_model, datamodule=cifar10_dm);" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "RRHMfGiDpZ2M" - }, - "source": [ - "# Start tensorboard.\n", - "%reload_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RltpFGS-s0M1" - }, - "source": [ - "\n", - "

Congratulations - Time to Join the Community!

\n", - "
\n", - "\n", - "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", - "\n", - "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", - "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", - "\n", - "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", - "\n", - "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ)!\n", - "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", - "\n", - "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", - "\n", - "* Please, star [Bolt](https://github.com/PyTorchLightning/lightning-bolts)\n", - "\n", - "### Contributions !\n", - "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", - "\n", - "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* [Bolt good first issue](https://github.com/PyTorchLightning/lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", - "* You can also contribute your own notebooks with useful examples !\n", - "\n", - "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", - "\n", - "" - ] - } - ] -} diff --git a/notebooks/08-Domain-specific-demos.ipynb b/notebooks/08-Domain-specific-demos.ipynb deleted file mode 100644 index a7b6a0dcc903a..0000000000000 --- a/notebooks/08-Domain-specific-demos.ipynb +++ /dev/null @@ -1,7415 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "GatZ6ZiXFzVh" - }, - "source": [ - 
"![image.png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAgAAAADwCAYAAAB2ddzKAAAgAElEQVR4Ae2dB3hbRdaGj5p7jUt6J4UQSAgQaoBQQ0LosHQIhN6XXXb52aX33svCwhZYOoSls5AQSAi9JZSQ3nvc4m7J//ONfG1Z1i2SJVu695s8jqR75055Z6Q5M3PmHFdzc3OzMJAACZAACZAACTiKgNtRtWVlSYAESIAESIAEFAEKAOwIJEACJEACJOBAAhQAHNjorDIJkAAJkAAJUABgHyABEiABEiABBxKgAODARmeVSYAESIAESIACAPsACZAACZAACTiQAAUABzY6q0wCJEACJEACFADYB0iABEiABEjAgQQoADiw0VllEiABEiABEqAAwD5AAiRAAiRAAg4kQAHAgY3OKpMACZAACZAABQD2ARIgARIgARJwIAEKAA5sdFaZBEiABEiABCgAsA+QAAmQAAmQgAMJUABwYKOzyiRAAiRAAiRAAYB9gARIgARIgAQcSIACgAMbnVUmARIgARIgAS8RkAAJRE9g2riHpLHeL7XbGiXgD4jL1awSaQ6+RJ+gQ59wuYIVb252idvjlswcn/jSPfLMt5c4lAirTQJdR4ACQNexZk42IoDBv6kxIBIy4oe8tVFNE1sVMNOEALBUTBObJVMnARJoIUABgF2BBGIggJk/BqxAAFP+5lA5IIbUnP1IUHACS5HGuiZprG9ZFnA2FtaeBBJOgAJAwhEzAzsSwLJ/MHDNP37tGxQCIFAxkAAJJJ4ABYDEM2YONiTAPf/ENKrGNTGpM1USIIFQAjwFEEqD70kgCgLc848ClsWoZGoRFKORQBwIUACIA0Qm4TwCqT5QuVo075pbKtKqiJcETZnqbJMAIYtAApYIUACwhImRSMA+BDDoNzX5Ba9ud/AnIKjMaJ86siYkQALmBKgDYM6IMUjANgQw6Hu8bklP80hGVpqkZ/hkW0Wd1G6rt00dWRESIAFrBCgAWOPEWCSQ0gSw5O8PBKSxsUl69SmUIaN6Sr+hRVLcJ0++nbVUfv1mjdTXNEhTU0Dcbh7DS+nGZuFJwCIBCgAWQTEaCaQygUAgIF6fW7LysmTgiBIZO2GQDBhRIkW982TdsjJZOn+DNNQ1qm0BEZdAYND0A1K53iw7CZCAPgEKAPpseIcEUppAULHPJRj8Gxv9klecJcPH9pYx+w6WXQ4YKlm5GdIcaBZvmkcZM0rpyrLwJEACUROgEmDUyPgACSQ/AU2rH4O/x+eWoj65MmR0TxkzYbCM2LmvlPYrUAM/9v9rttVLfW2jsmqoLf9z9p/8bcwSkkBnCXAFoLME+TwJJCWB4BI+9vwz87Jk+Jg+MmbCINnlwKHSozRH3B6XVG6pkdWLt8jmdZVSVVkrXq9XnQrg4J+UDcpCkUDcCVAAiDtSJkgC3UsAs3+/PyBur0t6DiiQgSNL1LL/iJ37SGFJjqRn+rDNLxWbq2XVb5vVK5T/PJ5mcbnc8GxAa7zd24TMnQS6hAAFgC7BzExIoGsIaPv+Tf4mSU/3yaDte8q4/bDnP0RK+uarQmCGj/P/5ZtrZOXCTVJVVhvUAaDyf9c0EnMhgSQhQAEgSRqCxSCBzhDQ9vwx84crnd4DC6X/8GIZt/8QGblLX8ntkSVur1vN7KH41+T3q5n/umXlAs+GHo8HiwKc/XemEfgsCaQYAQoAKdZgLC4JhBPQBn+s6ys/em5pHfzH7jtY+m7XQ2n746bL7RJ/U0Aa6pqkfFONbFhZLnXVQQFA7QvQEV84Xn4mAdsSoABg26ZlxZxAQFvy9/v9yrxv70GY+ZfILhOHyA57DJD84iyBbX38aYJCTVW9bFlbKVs3VMm2qjqRQLN4PMH1fyoAOqHXsI4kECRAAYA9gQRSlIA2oKs9fWXe1y39hwWX/XfYfYAMGtVTGfMJHdQxzFdX1snaZWWyZUOV1FbXS1qaV3w+T0vcFIXBYpMACURNgAJA1Mj4AAkkC4E2Iz89B+SrAX/cfkMEy/6Y+WNmDyFBWfVT6/+iVvmh9Lfi101SvrE6zNpfUAsgWWrHcpAACSSWAAWAxPJl6iSQMALw4OfxeSQzL10GjiwVDP4jd+2r9vyVMgCW/aHaF1zdV9sAAX+zVG2tldWLtkjl1holHATvc/BPWEMxYRJIUgIUAJK0YVgsEohEQNvzbzXvW5Irw8f2lbH7DpJdJg6V3B6ZSuEPs378aYM/3uMZf2NAKrfWyrrlZVJdUS9er0fcLXb/Q7cKIuXNayRAAvYiQAHAXu3J2tiYgLbnr5n3zSvJkiGjeynHPsN37iMl/fLE7Qke9VMYWmb+GpKmBr8681+2cZvgD+Z/vV53UFDQIvGVBEjAMQQoADimqVnRVCagzfwxS2817zu2j4ydEHTsU1CarY74oY6aPX/M/nHmHwErAHXVDbJxdYVsXl8plRU1EmiE9b+gIyDO/lO5d7DsJBAbAQoAsXHjUyTQZQS0mb8/EFA2/DXzvhj8h4/rI4Wl2cq8L4Z6La629I9CwrQvdAFqtjXI2iVbZfPaKmms96soXi+OCHL/v8sakxmRQBIRoACQRI3BopCAPgGXNDU1SXqar0Xbv8W8b7/8lj38oJEf9XzY0r+WJs7/w/nPlnVVEvBj9u8OSgwwEsBAAiTgOAIUABzX5KxwKhHAjN7IvK8axEMq1LoCgJm/WhIQkYBLmf6F619Y/oP2P7YJsC3Apf8QeHxLAg4jQAHAYQ3O6qYOgeBgHm7ed6ja99fM+2KQD93z12qnTeqV9n9zQC35byurk42rKtQxQDgDUs8pK4FcAdC48ZUEnESAAoCTWpt1TRkCGLgx8/f7m6RXi2MfHPMLNe+LOK17/R2W/YMufbH731jfpJb9N62tkIqyGmmobwou/6tVAg7+KdMpWFASiDMBCgBxBsrkSCAeBLA07/a4xJPmlf7Di5RXvx326C+DRpW2Ldurs/46ubXM7CEjwPHPpjUV6m9bZZ00NvglLd2n9gi4BaDDj5dJwAEEKAA4oJFZxdQhoJbsAwFpagqoc/0DhhXLzvsNUX/KsQ+O9bWY9w3d7zeqIc77r19RLhtXV6rB3ygu75EACTiHAAUA57Q1a5oSBIIDPGb/pX3zZcw+g2XEuL7Sb1iRwPQvNPuUeV+zurRsCTQHRGqrG2X98nLZtKZSuQIObhtw6d8MIe+TgN0JUACwewuzfilBADN/LMf7/c3KOl9WdpoMGFYi4yYOkaI+ueqemvEH/2s77x+hdkEFwKCYgJWEmso6Wb+iTOkBKNe/7hBrgRGe5yUSIAFnEHA7o5qsJQmkBgGY+XV73ZKdm64M/JQOyJfsvPR2hbey9B+qEwjzwJk5aZKR5VPn/tv2/bkK0A4sP5CAwwhQAHBYg7O6yUsAM/dAQJTyX2ZumqRn+cQDW/1u2PTF1n+Lgx+TKkBAgPU/BGwl5BRkKOXBPkN7qM+wKKgFK8KEFpevJEAC9iLALQB7tSdrk8IEMBjjbD5c9tbVNKo/aPB7fR41cKshHYJA6PR
ep74QFppdzUp4gAAAZ0HYDsARwHVLtypnQIjTakNAJx1eJgESsC8BCgD2bVvWLAUJYED2NwVd9laV1SohIC3DK+len3LsE5zXYyXApHLKRAAiNUtOQabyGYBtACz5fet2yca1FeIWt6Sl4SeAWwEmNHmbBGxJgAKALZuVlUo1Atq+PAQAePCrr22QVYs2y+fvLJSRu/aVoWN6K+X94LiP0wAhRoAMKqtUAV3NyvBPj565ssOeA9RwjxMFa5eVyeY1lWqbwYNtBs18sEF6vEUCJGAfAhQA7NOWrIkNCGh2AODyFwJAoKlZsAKAY4C+NK94fW5l41/z8NdqCdCo7s3BEwa5hZmSX5SlthSQz9czF8vGNeXiCREtsLKgmRE2SpL3SIAEUp8ABYDUb0PWwFYEmpWTHq/Xq9z3rlqyWb6bvUwJAtuP7ycDRpao5X91bBBufs1WArSFguBugDpOWNQ7V8ZMGKTeQ0kQHgKxEuD1epROAIUAW3UoVoYEdAlQANBFwxsk0PUEMPvG4O7xuKShtlG2VdbKr9+slpqKerUSgBl8Vl6Geo/SKUU/s2WAFn0AdTKgWdQqQEFxtjQ1+pUwAaVD6Bs0+5sFhoMgAFAI6Pq2Z44k0NUEKAB0NXHmRwImBKAPEBQC3JLu9sm28jpZ+ssGyfwwTWq3NchO+w6SftsVqUEahwNVfAgBwW183dS1kwFQAoCeAZwMYXsBIS3dI0sWbJCtG7aJr2UlQDch3iABErAFAQoAtmhGVsJuBDQhwIuVgPomqalpkEXfr5PGOr9k5aWLL80jBaU5kpEZ/AqrlQAr2wHQB2jR+s/rkSnZ+elSV92o8DXU+5WvgIaaJgn4A0oIsRtX1ocESKCNAAWANhZ8RwJJRUCdDAiI0uD3eNxSvrlaFs1fKx6fW6or6mTXg7eTngMK1ECttvg1PwFGKwEt2wGoqFIkdLmUgmFGtk/VPS3Dp7YcsCWQ5vO2rDJQMTCpOgYLQwJxIkABIE4gmQwJJIKAphOAPXm48a3cWitLF6xXWWXnZ6hrPfsXSFqGR11Tg7qFlYCWyEp4yMnPUCcDRu7aT7CP0FDXqE4gVJfXKaNENBaUiJZlmiTQ/QQoAHR/G7AEJGBIACsBMNXj9brV8f/N66ukoaFJ/I1w9FMve08dKb70bKXBj5iWdAKUop9LXMrBYLPSARi8Q0/JzAn6HfCle2X+vBXSWN/UYizIsIi8SQIkkIIEKACkYKOxyA4kAAnAhQG7WWnrV1fUy/JfN4ovw6Mc/QzdqZdayofvAAQVXf1nwqrleKDb7ZaMLLcU98mV7XfDSgBWHJpk7dIyKduwTSXIlQATlrxNAilGgAJAijUYi+tcAkonwCXi83kElvzWry5XCoK1lQ0CnwE9BxYIPP8FnQcFVw2ClgCNmeHEAQLShxfCYTv3VkIFLn/rWSqb11YoR0Qej1fFMU6Nd0mABFKFAAWAVGkplpMEQABL9i0kvB631FU3yMrFmyR7brp4fB7Zbkwv5flPjektXgGVEGBEL0RpUB0/FJcUlubIqD0GKAdC9TWNsmFluToiCGVErBYoYcQoTd4jARJIegIUAJK+iVhAEmhPQO3xt6wEQDFw/apy5UCoamutkhB6Dy4Q7OHDiyDiBuOHjPLtkwt+ajkdAKXD5uaAaGaDlV+C6kb5zr9MNqyuaLFP0LZiECkpXiMBEkgNAhQAUqOdWEoSaEdA2esPiLhdLqWkV1vdICsWbZLvZi9VwgAcCCmzwe6g58DgaYJ2SXT80GpLqGWAl2Yp6pUjo/ceIJvWVcqS+euV0mAgEOj4LK+QAAmkHAEKACnXZCwwCQQJaDN76ARAB2BbZZ388vVqgYIgVgDyi7MlOy89aO2vxclPy3a/PkJtoQD7DDAbXJwlsBHw0+erJCPDp44dwoQwFAKxXYAyMJAACaQmAQoAqdluLDUJKAJqAA64xON2S3q6T6rK62TJz+sluyBDmhr8suPeA6Xv0B7Bgdp09NegQgoIDuwut1ttJWA7AX8Y/IODviYpaM/wlQRIINUIUABItRZjeUkgjEDrSgCUAuFAqKpOVv62SfIKMmXI6J4hsTGoRzdw+xv9ylQw7AFg6T+YV0iSfEsCJJCyBCgApGzTseAk0J6A2ucXrAYo9X91VDA4W28fz+gT0oDiHwKOE9ZU1cv6FeWyZX2VVFfVqeOAOAWAEG3a6iH+RwIkkDQEKAAkTVOwICQQHwLYm4d538LSbEnTnAVhQDdZAFDb+S17+v6mQNAQ0LIy+enzlbJueZlSLvR4PEGLgy3x4lNipkICJNAdBCgAdAd15kkCCSQABb3S/vnKHgAEARWCk/rg1r7OLoCa0Tdj5h+0Ali+qVoW/7BO5v73V7UCgJm/ZjQogcVn0iRAAl1EgAJAF4FmNiSQSAIYvPEHU8BpGV4p7p0nvQYVKot+yLdV/y/C4N9u5u8PSMO2RlmzdKv88uVqWTBvpWxcXaFOGQQ1/xNZC6ZNAiTQlQQoAHQlbeZFAgkggFk5TANDSS89wyc5uRlS1CtXSvvlS0aWr0XvL8LI31KW4My/We35B5oCUrm1Rpb8sE5mvTxfLf3XbKtXpwwgXGiCRgKqwSRJgAS6mAAFgC4GzuxIIBEEMDA3+QNSkJumfALg/H56plcN6mqAh/pemAwQHMyDpYEAUdei8PfjnBXy8xerZPPaSjXzh/lf7cw/t/4T0XpMkwS6hwAFgO7hzlxJIC4EMDBrg7Pf71fn/3sPLpS8HlnKGBA0+jHQw2JgeMBgjvt4Hkp/2PNftmCDzHt7oSz7eaPU1zUqAcLnbTMpHJ4GP5MACaQuAQoAqdt2LDkJdCBQUJwlA0YUS25hRlDhT1P+Cxn/1cDfMpXH4F9TVSeb1lTKNzOXyk8te/7+Jr8EZ/7B435B34IdsuMFEiCBFCZAASCFG49FJwEQwAwfAzkGbHjx6z+8SHIKMlvhdPAGGJQAlN4AZv4Y/Jf8sF6++3iZLPxmjTr+B+nB4/WoNDS7AK0J8g0JkIAtCFAAsEUzshJOJKAt/UMAUNr/6RlS1DNXeg/qoXwAKCYtRoFgALDdzN/tUnv+WPb/ZuYSNfivWbxFmhqb1Dl/pTmoXA9rSwhOJMw6k4C9CVAAsHf7snY2J4BB3e8PSHqmT/IKs6RHaY5aBfClB2fvqvotg39QAgjGh7Y/LPxhz/+neavUzF/Z+YdOgBdKf20WAW2OkNUjAccSoADg2KZnxe1AAHvz0P7PyEmTXoMKJL8kW9kB0FYHNNV/rBIgwMgPzvnjqB+0/T97e6Fswjn/Bsz83eL2wsxvszTT468dugfrQAKGBCgAGOLhTRJIXgIY5OG0D4M73P723a5ICkqy1XZAUPsfC/ltS/jKvG99kzLyg3P+OOq3/OeN0tTkV5WEnmBw5p+8dWbJSIAE4keAAkD8WDIlEugWAhi4cwszZeDIYsEpgLYxH0cAUSRX0LxvfZNUbKpWFv5g5Cd4zr9R3B63eNVRPy77d0sDMlMS6CYCFAC6CTyzJYHOEoDlP5jnhfU/zPz7DClS5/9b08
XgjzP+4eZ9P1uhLPw11DWpc/5tJgLaVgta0+AbEiAB2xKgAGDbpmXF7EpA29+H9T5o/2dkpEmPklzpM7iH5PbIDCrwB4ILATgE0Gbed73MenlBm3nf1pk/9vw5+Nu1v7BeJKBHgAKAHhleJ4EkJ4CZfUa2T4r75klhz2zl+Ae2AAL+4GAO3YDqynql7d/evG9ji5EfV9C2f9ueQZLXmMUjARKIJwEKAPGkybRIoIsIqON/gYD4Mr1S2j9PevTMUbb/sZ8PwQBbAzTv20WNwWxIIEUJUABI0YZjsUkABLJz06Xv0CLp0Ss3qMLvEjX411TVKyU/mvdlPyEBEtAjQAFAjwyvk0CSE/C4XZKTlyF9h/SQwp45rcZ7YNAHGv4075vkDcjikUA3E6AA0M0NwOxJIFoCUP6D5n5mVprkF2VLaf98yS/KUsf5airrpWzTNuXYB7b91yzeTPO+0QJmfBJwCAEKAA5paFYz9QkEtf9FAgG/eH0eyc7LkMKSbGX6F6aA62sblYb/sp82yIJ5K1vM+zYFXf7SvG/qdwDWgATiTIACQJyBMjkSSDQBaPm7M1xSWJKjlP9gBRDue8s31sqPny6Xee8ulI2t5n1dNO+b6AZh+iSQogQoAKRow7HYDiYA03+ClYCA1FQ1yIZVFdJY3yRrl5bJL1+tbjHvC2P+zbABSPO+Du4qrDoJGBGgAGBEh/dIIOkINCstf6wCVGypkZW/bZLsT9OlbGO1LP5hvWxZVyn1dTjn76F536RrOxaIBJKLAAWA5GoPloYEdAloHv1wxh/va7Y1qD3/xsYmZfBn8+oqaahrFOUkqGWVIMQxgG66vEECJOBMAhQAnNnurHUKEwgKAKIG+w2r62Xtyq3idrmUdT+3S3PsQ/O+KdzELDoJdAkBCgBdgpmZ2I2AcpvbTebzgx7+gkQx8MPVH2b9+IegrRSkKvM250SpWgOWmwRSgwAFgNRoJ5YyCQl0pxAAHFgJwODvVQUJDvwc/JOwo7BIJJCkBCgAJGnDsFjJTaC5Wdtk76ZlADXTh+O/ZtFWBFJ98EeLt3FN7vZn6UjADgQoANihFVmHLicApzsYeQM4bdeN3vSCg3/3CSHxBe9qWdXQhKv4ps7USIAE2hOgANCeBz+RgCUCmTk+aWoMSGNdkxICXK62mbilBBiplUBwKyU4+PsyvOL1uVvv8Q0JkEDiCPCblji2TNnGBHzpnuBAFaKxFvLWxjWPb9XaMXO5FFOwZSABEkg8AVezHTYOE8+JOZAACZAACZCArQhwBcBWzcnKkAAJkAAJkIA1AhQArHFiLBIgARIgARKwFQEKALZqTlaGBEiABEiABKwRoABgjRNjkQAJkAAJkICtCFAAsFVzsjIkQAIkQAIkYI0ABQBrnBiLBEiABEiABGxFgAKArZqTlSEBEiABEiABawQoAFjjxFgkQAIkQAIkYCsCFABs1ZysDAmQAAmQAAlYI0ABwBonxiIBEiABEiABWxGgAGCr5mRlSIAESIAESMAaAQoA1jgxFgmQAAmQAAnYigAFAFs1JytDAiRAAiRAAtYIUACwxomxSIAESIAESMBWBCgA2Ko5WRkSIAESIAESsEaAAoA1ToxFAiRAAiRAArYiQAHAVs3JypAACZAACZCANQIUAKxxYiwSIAESIAESsBUBCgC2ak5WhgRIgARIgASsEaAAYI0TY5EACZAACZCArQhQALBVc7IyJEACJEACJGCNAAUAa5wYiwRIgARIgARsRYACgK2ak5UhARIgARIgAWsEKABY48RYJEACJEACJGArAhQAbNWcrAwJkAAJkAAJWCNAAcAaJ8YiARIgARIgAVsRoABgq+ZkZUiABEiABEjAGgEKANY4MRYJkAAJkAAJ2IoABQBbNScrQwIkQAIkQALWCFAAsMaJsUiABEiABEjAVgQoANiqOVkZEiABEiABErBGgAKANU6MRQIkQAIkQAK2IkABwFbNycqQAAmQAAmQgDUCFACscWIsEiABEiABErAVAQoAtmpOVoYESIAESIAErBGgAGCNE2ORAAmQAAmQgK0IUACwVXOyMiRAAiRAAiRgjQAFAGucGIsESIAESIAEbEWAAoCtmpOVIQESIAESIAFrBCgAWOPEWCRAAiRAAiRgKwIUAGzVnKwMCZAACZAACVgjQAHAGifGIgESIAESIAFbEaAAYKvmZGVIgARIgARIwBoBCgDWODEWCZAACZAACdiKAAUAWzUnK0MCJEACJEAC1ghQALDGibFIgARIgARIwFYEKADYqjlZGRIgARIgARKwRoACgDVOjEUCJEACJEACtiJAAcBWzcnKkAAJkAAJkIA1AhQArHFiLBIgARIgARKwFQEKALZqTlaGBEiABEiABKwRoABgjRNjkQAJkAAJkICtCFAAsFVzsjIkQAIkQAIkYI0ABQBrnBiLBEiABEiABGxFgAKArZqTlSEBEiABEiABawQoAFjjxFgkQAIkQAIkYCsCFABs1ZysDAmQAAmQAAlYI0ABwBonxiIBEiABEiABWxGgAGCr5mRlSIAESIAESMAaAQoA1jgxFgmQAAmQAAnYigAFAFs1JytDAiRAAiRAAtYIUACwxomxSIAESIAESMBWBCgA2Ko5WRkSIAESIAESsEaAAoA1ToxFAiRAAiRAArYi4LVVbVgZEiCBLiOwZOlS+e77H6WyolIGDx4ke+wxXjIzMrosf2aUnATYL5KzXSKVKmECQHNzsyxdtkyamyNla+1aVlamFBYUSGZmprUHuiHW2nXrpKamthtyFvH5vDJwwIBuydsumZaVlcuWrVsjVqeoRw8pLCyIeK+7L9bW1cmaNWt1izF40EDxeDy69zt746FHHpOHHnm8XTL9+/eTxx66X4YPH9buerJ9MGKH35q+fXonW5FTpjyp3C9SBnIcC+pqxkidgPDKazPk//5yXdxSnrDPXrLP3nvJxP33k0EDk2PQW7FypRw8aWrc6hhtQvjB/ej9t6N9jPFbCKDr7zXhAF0B4MorLpPzzjkrKXld9edrZMZ/39It27w5swQCTCLCu+9/IJdd8ceISQ8aNFDemvGKpKWlRbyfDBevvOpqefOtdyIWZe+99pRnnmov2ESMyIsdCKR6v+hQIQdcSJgOwJYtkWdVsTL9dM5nctsdd8shh00V/PgtW7Y81qTi9lxVZVXc0mJCXU8gEGjWHfy7vjTR5bhp85boHohj7DcMBI/ly1fIz7/8Gsfc4p/Upk2b458oU5RU7xdObMKECQCJhImZz6FTjpRXX5+RyGyYNgmQQAQCixYviXC17dKKlavaPvCdYwiwX6ReUydMB6ArUFx9zXWycOEiufpPfxCXy9UVWTIPEnA8gQH9+8uqVat1OfQsLdW9xxvtCfw4f4EsWry4/cWWTznZOXLoIQdFvJeMF9kvkrFVjMuU0gIAqvaPfz0rpaUlMv2sM41ryrskQAJxIXDQAfvL3M/mRUwLegc7jh4V8R4vdiTw6ON/k5mzZne8ISLQp0glAYD9ImIzJvXFlNwCCCd65933yWfzPg+/zM8kQAIJIHDi7
06Qgw86IGLK99x1m2RnZ0e8x4sdCTQ1+TteTNEr7Bep13DdtgJwztnTZOyYnToQa2hokFWrV8vyFStl0aLFMn/BTx3iRLpw8613yJszXhWPp+tkmpKSErn4wvMiFSfitZraWnn6mX9FvIeLvXr1lOOOOUr3fviN3Nzc8Ev8TAIJJ4Dv2MMP3Cv/+3CmfP3Nt+L1eiU/L08On3KY9O3bJ+H5M4PkJMB+kZztYlSqbhMAMPjrzSJCC/zDj/PlxptvMxUEFi9ZKu+9/4FMmTwp9PGEvu/Zs1QuvfhCy3lUVFYaCgDDthsaVXqWM2ZEEogzAejcHHLwgeovzkkzuRQmwH6RWo3XdS8aivQAABxgSURBVNPlGLmM2WlHefmFZy0NjK/NeCPGXPgYCZAACZAACTiLQNILAGgOt9stF5x3jowbN9awdWArAFa+GNoTgMGbpqam9hdT+JPf7xe/P5DCNWDRSYAESKD7CXTbFkC0Vcf+0m033aDO/xs9O3/+Ahm/265SW1sr7773gVFUOfDAiWrv0jBS2E0cf/rq62/CrrZ9nLDP3lJSUtx2oYvfbdq8Wb744iv55NM5gnO569dvaGfsBtYDcUxrj913E1g922nH0eLz+aIqJdIF50gBCmChmstffPm1vPPue8osNM6HNzY0Sp8+veX6a6+RHUfvECmJ1msQXKAHMvezz2XuvM9l5apVAiMu1dXVKg7yAuvtR4yQ8bvtomzRDx0ypPX5RLz59rvv5e133hPUBeWB4Rtovo8Zs6MMHzZMhg/bTnbccQdbm2he8NPP8ttvi3TxHjF1itIL0I0QdgPfVbTv7E/myOrVa2TN2rUCI1vFJcUyZNAgQZ9FnwrvLxs3bpI5cz8LS63t40EHHSB5cdCTQTvPeONNWfjbIlm5arUqI3IZvcP2sv3IkYKtuxEjhqvvUlvuHd+hP78+47/tbqxdq2/OGQxee739qiYmQ4dNOkTS09PbpRPNdxIPwoT5a6//Vxb+9puqD3SuELYfOUJGjhwh2w0dIiOHDzeddIUWItZ+8cabb4tfZ4Ky2667qPbX8oHwP2v2JzJr1mxZtXqNLF+xQv3GoY+MHjVKhg3bTrbbbojsuss4KS4q0h6L+TXZ+2bMFWt5MGUEAJQXDkfQMbHfrxfwo6CCyyV/vuZavWjq+l3eW+XIqVMM44TffPHlV+VvTz0dfrn1M3QColEMbH2wk29glvjBhx/TNXGqJQ8BBn9Q3nr40SfU4HXJxRco5UOr5lu//PIrueHm27Qk271qR5c2b9kif7r6L4JVmfAA2/sbNm6UHUVfAIAAc9e9D8jChb+FP976GYIA/jAIwwwpwr4T9pELLawWtSZi8Y1ReVAfHOUKPc515eWXyvSzz0yoPX6LRY97tFkfz+7gByA0k0MOPkhycsx/WrBa99Tf/yGwHx8pgKvW/vjOQW8I36999t5TRYevEaPv+Ptjx3RKAIC10Tvuvrddu4aWE8It/rQw9fDJcu01f5b8/HztUrvXQCBgWN52kUWU4B6pfuB20u+ObxfdyncSD6xZu07uuvteeUdncoTfBfxp4aADJsqNN/zV0mAaa7/445/+T8uuw+uD99/dKgC8/Mpr8tCjj6sBPzyi9rum/Q5gcnDzDdfGrBOW7H0zvP6xfk6JLYDQym2//cjQjx3el5WXq2vwSnb0UUd0uB964ePZn4R+tPRe62B6kQ8+cKLerYRcx1L4vfc/pHwS6Nk3N8oYP7LX33iLHDTpcFNFS6N0Qu/hy3PeBZdEHPxD40V6v3VrmUybfr5MP++i1h//SPH0rmGgPvHUM+Tuex8QzBbiEZ75x7+jLs899z8oJ516pjrNEo8y2C0NDOyHTj5Cd/CPVN/vf/hRzjrnfHn8b08JZtOJDJ9/8ZUcc8LJuoN/pLzx/Zs05SiZMzeyjYRIz8RyLRBjv/7p51/kuBNO1h38I5Xlw5mz1G/LB//7KNLtLrmG7/GNt9wu11x7Q8TBP1IhMDG44g9/kst//0eBw69oQrL3zWjqYhY35QQAM0dAFRWVrXWeOmVy6/tIbz6e/ak0NjZGuhXxGmbZkDT1Ama/WArsqoDlqUuvuFL9IHY2T2wVHHvCyfL+Bx92Nim5+577YxImsAx5/Emn6hqZiaZgmDFecPFlgllXZ8Ijj/1Nbrvz7piSwIB19HEnSkVFRUzP2/Uh+Ao48dQzLf+Yh3OAwAuhNVHh8y++lNOnTW/daoomHwjUEFKMtgmjSS9ecX9duFD1RZQv2oDB9OLLft9ttlauvuZaefa556MttoqPlY4LLrnMssCY7H0zJggGD6WcAFBWbvxj6na3mQTGPreRURJ07O++/8EAT/tbcyIsZ4fGOPaoI0M/JvQ9pOLzLrxUncWOZ0aXXH5lp9LEHv2/Y/iybtiwUf1AGQlY0dYTAt5TT/8j2sda4y/46Sd54KFHWj/H8gZ9LNxtbizp2OUZ9I9TTj8rpsE1lMHzL74sb73zXuiluLyvrKyUq67+S6fTwow1XitQnS0MynHNX2/obDICWytdXad33/3A0OullUp9++338t4H/zONmux907QCMURIOQFg0SJ95SPUvzTEDjkMlJxw3DGGWD75dK7h/dCbMz+ObLJTizNp0sHa24S/YlDBTCURAT+AWO2IJWDAizbgRwXLdbE8a5YXtgKguBdLiMdqCPL917P/kd8WRbb3Hku5UvmZ62+6JW7t/NLLr8YdBQyPYTWsswHLyK++lhzOyiBUWzWoZlRv6F69/OrrRlHifs9sy9VqhvAki61Jo5DsfdOo7LHeM9fUiTXlBDyHo2wLfvrFMOXSMA38yYcdKs/889+6z3zw4Ufyh99fpntfu7FtW7XhnjY0lAcOGKBFT+grJFrYEDcL0IyFEg9sKfToUahOBWha7EbPYiD+/R/+LK+8+FzCnCxlZWa2FuGpp//ZTvGo9UbYG9Rnj/G7qfpkZGbIihWrZOasjwV7m0bhokuukPfenqGrnGX0bPg9WGuEtnFamk8N6kYKqaHP3nv/g/L4Iw+GXnLce6zIwHqglYCVu912HacUf1esWKmU7RIhIFotCxQQi4t6yNJlyy0PprfdeY9MnTpFoI+EACM50848rV2WOKmkJ3CAwQnHd5zADB8+rF0asX6Apny/vn1k5erVyqmaFb533HWvUhjG5Kq7An4HRm0/Um3v/brwN8NtWa2MYPzqq6/LqaecpF1q95qqfbNdJWL40H2tGENhn/z7M6azh/BjYDjmhg6jt7QMDXLMds0G78+/NJ5tH3Xk1BhqFNsjjz7xpOmDhx16iNx+202tPz54YI/dx8tpp5yk/s4+90JDlpgx4PidpnFtmqFOhLOmnS6TJx0qgwcNktzcHKVzsWz5CnVcDo/U1NRY0mGYdsZpctUfruigVX/h+efICy+9IjfcdKtOCYLa1JjNn3D8sbpxzG7sPn5Xue3mG6Vfv77tomLZ8Jprrxf8gBiFUG1xo3h2vvfsf16wVL277rhVpk45TNn/0B6A0t9Hsz6WCy++XLuU8FcIe3ffcas6VhyaWX19vdrW
MToNhPgYUFeuWNmqF4QjfFdf9YfQpGTJkmW6AgCOuIbHb/dwjB9uufE6OfaYo9rxxSoc6nPfAw8bpoo64Zim2e+lYSIx3sTv1+233ih9evdulwJW16686mpTpeH5P/3c7rnQD6nWN0PL3pn3KbMFgEY265w4IhhuixxS99FHGp8GMNvbB2CzH/hDDjqwM+1g+VkoqUDT3SicO/0sue+eO9oN/qHxx+08Vt6a8Urr8ZrQe6Hvn3jyqdCPUb3H2fgXnvun/PmPV6rz0Rj8EWBzAGfltYBz0WYzjxuu+4ty+ezxeLTHWl9x7ZSTfif333Nn67VIbzrjLOrC88+Vf/z9yQ6DP/LBj/QTjz4kZ55+aqRsW6+hjjga6dSA8/1m/RYz3pee/7c6movBMjTge4zVrDdff1n5zAi9l4j3E/bZS95649UOgz/ywhl8rBrCH4JZgL2IZAn4Tr7/9hty/HHHtBv8UT58j2Bs7aknzHVeVq1a0+VVOu/cs+WZp57oMPijIPg9efXF59TZf6OCLV68JOLtVOubESsR48X237IYE0n0YziCcsa0c0yzmXxYZD8AUw471PBZs719aJJ/9NEs3TQwO4RfgK4IMEJjFLDaccVlF3f4goc/A0Hp6j+1n42Ex8GsFeeGYwkw9ANBwyy8YrJPipWM8DPPkdLEVs8xR+srYc7+dG5M1gNxquOSi843dDKFwenii843VDhFmVcm0WAQiWEir83+xHiFBHk/cO+dER2EhZYL7dEVWyk333idqQ0B+EKAoGAUYKgmWcK1f71abakYlQd2NPBnFGLVDzJK0+geJnaXX3KR4XcQNkywQmgUsH0TKaRa34xUh1ivJaUAUFdXL0uWLpVZH38ip087Rx1BMTu+gtkDZoKRAgwI7TBq+0i31DUYqzGahWLWbZT/EVMP10073jfMZrLnnzu9wzK5XhkO2H8/5XNc7z6ufx/FKQktHSzVhVoD1K6Hv1ZWVZnu3597zlnhj+l+NhIU0L44ChVtgGGXSCsP4enA4ty506eFX273GVbknBrg1Mso4PsJK5pWAvZ/sRqQqHD5pRdJ7169LCWPWbNRwBZjMgTs9086xJqS8tnTTjcsMnQyujL81eJ3EHoaRubi8RsQ6Xc8lfpmvLl3mw7ARZdeEXHwqaur090TM6r8BedNl8LCAt0oxx59pOFg8/mXX8mBE/eP+LzZ0mUif4xCC2Q2YGKJ74jDjW0fhKaHZVZYLfzDVfqWuL786uuorWlZ8fKIcuCcvFHATMRIcAt/FoqYMG9c3mIMKvQ+th4gJEYTEB+mSK0GM+WsjRs3Wk3KdvG+MTmJcdEF50alcArdDxipSUSI5vuM2alRgMXLZAioE1aqrATYMzEKXd2Px++2m1Fx2t0bMWyYQElaL8DQGH4nQ0Mq9c3QcsfjfbcJACh8vKRjLMOdPe0MQx4wT4qzuXoBe/x6AsBHMz/We0wOmLifoeCh+2AMN8wkb2xFhNsIN8vGbIBbaHLsMlL6Awf0j3S5w7Uli/VNOiPyxP337fCM0QUINM889bhRlKjujRgRnbZ1vz59okrfKZGhYKanhKsx2H+/6Np69OgdDJV7tXRjee1vsf8i7YKCAiVYGq0gxlKGeD9jNqiH5ldaUhL6sVvfY8sHfmCshnAlXbPnUq1vmtUn2vvWyUabchfFhwR+3z13mi7TlpaWGO7XffjhzIhW4+BbwOgMrZm1wXhiCLVyGCndvn3ba6hHihN+zezLDok52tC/Xz9Lj2hmm/Uih2v76sVL1PWhgwdHlXRRceedj0SVYYpE3mZiGwLa9rEcKzOzChoLHpRFO7Zn9fnudP5ltYwDohBqsOUVPku2mk+845mtsITnh+PO0YRU6pvR1Mtq3JQWADD7fuE//zJV1tFgGO3VY28Ie/3hwWzPff/9JoQ/krDPZiZl+/S2tm8ZWkB82fGjpxdwzC3aYNWpkFl9evfWL1e0ZYolfk5O8OSC1WetLrFaTc8u8bZVbTOsyqCBxkvOeg9bFTT1no90PSfKbSKk4Y1wOiVS2t15LVqhpjvLGpp3qL2Q0Ot6711ibZtDez6V+qZW5ni+pqQAgL1ZaLA/+tD9lgd/QIPSm1H4dE5Hq4BQRNQLUyZPinpfWS8tK9erthn/kJbEuHQH97x6AUubiXK8YiYAFBd3n1tlPR68Hj2B6poaw4d6xyC4IsGuOnljWHjeTGkCTu+bKSUAYDkI528/nfU/gWGY8LPCZj0RZ9ExaOuFD8OO+jU0NLS6mY30zOFTDot0OWHXfD5jlY26emNTl3oFq6+r17ulridqZmu2UmBWLsNC82bSEDBr520mgq1eRaqrjQULved4nQQ0Ak7vm8YjikYpAa9Q3BtosPSHhZyCgnzpUVgomAnuPHaMYB+/swF79npn6bHXjyVvbU/PyIY8ViH23jPol7yzZbL6fH5enmHUdetis2EOL3x6IZF7gYWFxvt16zds6GDYSa+cvJ68BDQjUHoljPV45OrVzj1WqceS16Mj4PS+2W0CwIknHC9Wj4tF16TGsWHaFoO3ntbu3Hmfy1FHBM/1f2xgvGTypEMkIyPdOLM4383PzzdMcd266I32gIMeC2RWHOZbwbAAUd7MMxFo9GykR5kNo3czAbN9dTjOiSUkk5W9WMrPZ7qfgNP7ZkptAcSju2DJ58ipU3STmjWrzeOfkeMSo60E3cQ7ecNMwzWWH0QzS389Q7wrdrL4HR4vMdGaN1qZ6JBYywV4/ILjpkh/jY2Neo/xegIJ4GgqhG6jEG1bV1VtM7TrYZQX75GARsDpfdNxAgAa3mjvHu4nsfe/bNly3bPL+DHbfbx14xRaZ+vs66CBgwx/SOd+Nk+3zHp5v/Hft/RuqevYeklUwFluo/Dsc8+rtjCKE3oP3iL3mnCAjBu/V8S/GW+8GRqd77uQAGxUGIWnnv6H0e0O95573ppjoQ4P8gIJhBFwct90pACw89ixhudcsfc/57N5Yd2k7ePxxx5tanegLXb83sEgxp67jzdM8Jl/PWt4P/QmLObBw6JR2GXczka3O3VvhIlbUxzNfPPtdyzn8fkXXxluZ4wcOcJyWowYXwIw02oUXnr5VYFTFisBFjGfePJpK1FTOg6M1DAknoCT+6YjBQAMpPCIpRc++XSuod/ySYdas6mtl35nru+91x6Gj2PWDONFVsKz/3nRNNqYnUabxok1AmwQmPkMePTxJwW+IcwCfiyNvBdi1WbU9vr+IMzS5/3OEdhzj91NE7j+pluVe2ijiNjGueW2Ow0FPaPnU+kerCdSCEh8izm5bzpSAECXMvIQ+PyLL8vnX3wZsefBaI6ZxBjxwThdnGqgv6BlceqZZ4uZx65//vs5efDhR7VHIr5OO/M0yczMjHgvXhdPPfkkw6TwI3jK6dPU6QyjiHfec5/Ae6FegNJmNCZF9dLh9dgIjNlpR4FZV6MAnxu/O+UMWbNmbcRomzZvltOmTRe4kLZLyM83Ptkz+xNj19924dCd9XBy3+y2UwDd2eDIGz9GsCuweElHe/RGWvFHHTE1avsD8awrvM6
dd85Zhkug8LFw1LEnKveqe+25RzszqzDt+8BDjwiEHLNw+qmnmEXp9H3sv8GJj5G5Zdw74ujj5Y7bb5ZxY8dKTk6bQhmsN95+5z26AptWwGj9CmjPperruedfLEVF7Z2exFKXI6cebmg7I5o0zzjtFPm/v1xn+AhOBBx+1HFqZQg/zFlZWVJbWyvz5y+QmbNmR/TmZphgkt80M2d8zV+vF7jxhZMrfPfr6+uj9veR5AiSonhO7ZuOFQDQ64495ii54657o+qAkw87JKr4iYg87YzT5ZVXZxj+GEKImX7eRSp7uAINNAfULNrMKYtW3nOnnyV9DSwEavHi8QqrjiefZuxKF/oA08+9UGUHwS0vP08WLlxkaSkYwt7eexv7bY9HPZIpDSOBKppyxnO16+gjp8rzL7xkKOyhbOi7r73+hvqLpqypGNfMnDH6/WVX/LFd1Z549KGoHWW1S4AfOhBwat907BYAesCkQ6MbzOFRa+SI7lckw3HARx68r0Mn1rvw9TffKheZVgd/+NSGT/SuChBQrrz8UsvZYdUGLj+NVmq0xLD3/9hD90ft4EV7nq/xIwCdjztuuzluCfbvb83pVNwyTEBC0Tjp0bJfuzbyFol2n6/RE3Bq33S0AIAZLgYfq+GYo46wGjXh8TBI33T9X+OeD35UH7z37nbbBnHPJEKC08+eJlMPnxzhTucuPfbw/RKti9DO5cinjQhg9ebxRx40imLp3r4T9pFzzzZeNbKUUDdHwjFbs2No3VxEx2TvxL7paAEAPfvoKAb1aFcMEv3N+d0JxwmWA+MV8EP02kv/iYvJ5WjLBAW9u26/Rc4/d3q0j+rGv/2WG2UPk2OTug/zRsIIwIvnU38zVkA1yvyYo4+Uhx+4t1t1cYzKF809+Nm48br4C/LRlIFx2wg4rW86XgA4+MCJba1v8G6HUduLmcKOweMJuwXltnfefF0645YYy+RwsvT0k4+LmbnhhFVERP2g//7yS+SxRx4QbLfEGk763fEyd/ZHgoGCITkJ7LvP3jLn4w8NrXKGlxwncG656XqBYNfVZrjDyxLPz4MHD5JHH7rP0DZJPPNjWsYEnNQ3Ha0EiG5QUFAgcEz06ZzPDHvFUUdONbwfj5vQeI4lYOnqb489LDBgBCWrD2d+bGl/HMpxhx16sJxy8oli5mgotFwZmRmhHzu8z8nN6XAtmgsHTtxf9t93grz51jvyxptvCywcWgkHHTBRLr/sYhk+bDsr0bs8TnFRUZfn2dkMI5XZ6GgohMn09DRL2cK511133CoXX3S+0vCH/Y2Vq1a1WrNEWkMGD1J/kw+bJPtO2LudAa6y8nLDfFzuyPMbM/vvholGedOqS+uDDjxAxo/fTTkqmzP3M/ls3het32EIPqF+MSLxT/R3MrTapTrmwSOVS3vOqF/A4RiUHSOFWF2cR0oL1wpM/Kloz3VX39Ty76pXV3OinL13VQ3ikM9pZ55teIYcWXwy8wPBFzEVgt8fkF9++UVgXx0/klu3blUmhGEqF94VexT1kDE77tjq9TDZ61RTUyPf//Cj+hHcWlYm5eUVAtfIAwcMEChRQW+hpLg42avB8kVBAAZwoJhlFK686molJOrF+ezTmRJJgNGLn2zXwaCyskp8Pp8ykATBKicnx5RLstXDbuWxU990/AoAjIsYGZBB58UZ3FQZ/FFe7KfDzr6Zrf1U+WJiZQT2DBhSmwBOo5SVRZ61Dx++nRLotBqaDf6BQEAJhVr8SK9WZ3uRnk2Ga2BQWFigihJq+yIZyma3Mji1bzpeAHj1tRmmffmE4/XNBps+zAgkQALKO6ORrQdsR8145QXLs9vnnn+xdasgEl54/PR6Hf/zFgkNr4URgOdQp/bNyJtkYYDs+hF75vfeb6xFj72rA/bfz64IWC8S6BICmMFCw1ovwALgLbffZcn2/YIFP8lNt9yul5S6Dp0BBhKwQsDJfdMRAgD2bGAbH4omeMV+8m133i0nnnKGaf+47JILaXrTlBIjkIA5gaOPNLajAUdW06afL199/Y1EUk2qqKyU2+64W4454WTDzCC077WnufMhw0R401EEnNo3HaEECO94++x/UNQdGtqpMz98l1bkoibHB0igIwF4ddxzwsRW7faOMdquQLFz9KhRUlxcJFD8hPVHrBJYCbfceJ2ht08raTCOswg4tW9SADDo5zBWgjOhDCRAAvEh8MOP8+X4E0+NT2IRUoEPC9i0YCCBaAk4sW86Ygsg2o6A+PC4x8E/FnJ8hgT0CcDD30vP/1s/QifuHHboIQJDUgwkEAsBJ/ZNCgAReso5Z0+T30fhnCZCErxEAiSgQwAeBl947p9xtXx36cUXyt133moL88A62Hi5Cwg4rW9yCyCkUx16yEFy3jlny+gdRoVc5VsSIIFEEGhoaJB33n1fHn3iSVm+fEVMWUyedIhccN45gmOEDCQQLwJO6ZuOFQA0M6P9+/WV/gP6y9Qpk5PWhGy8OjXTIYFkJACjPrM/mSPvffA/dUpnxfLgiZ1IZYVi7vjddhV4w8SSP0y2MpBAogjYvW86QgBA54B53IaGetVPYCAE5jUZSIAEkpMAtLLXb9gg5eXl6hROTm6u5ObmSF5ubnIWmKVyDAE79U3HCACO6Z2sKAmQAAmQAAlYIEAlQAuQGIUESIAESIAE7EaAAoDdWpT1IQESIAESIAELBCgAWIDEKCRAAiRAAiRgNwIUAOzWoqwPCZAACZAACVggQAHAAiRGIQESIAESIAG7EaAAYLcWZX1IgARIgARIwAIBCgAWIDEKCZAACZAACdiNAAUAu7Uo60MCJEACJEACFghQALAAiVFIgARIgARIwG4EKADYrUVZHxIgARIgARKwQIACgAVIjEICJEACJEACdiNAAcBuLcr6kAAJkAAJkIAFAhQALEBiFBIgARIgARKwG4H/BwQe9SgvOUC+AAAAAElFTkSuQmCC)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "goRmGIRI5cfC" - }, - "source": [ - "# Live DEMO\n", - "\n", - "Here are four examples using Lightning.\n", - "\n", - "1. MNIST\n", - "2. GAN\n", - "3. Finetuning a Transformer from Huggingface\n", - "4. DQN" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jKj5lgdr5j48" - }, - "source": [ - "--- \n", - "### Setup \n", - "Lightning is easy to use. 
Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 938 - }, - "id": "UGjilEHk4vb7", - "outputId": "229670cf-ec26-446f-afe5-2432c4571030" - }, - "outputs": [], - "source": [ - "! pip install pytorch-lightning==0.8.3 --upgrade --silent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zaVUShmQ5n8Y" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import torch\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "import pytorch_lightning as pl" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gEulmrbxwaYL" - }, - "source": [ - "--- \n", - "## MNIST hello world" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nbQAcRna5e_q" - }, - "source": [ - "## Simplest example\n", - "\n", - "Here's the simplest most minimal example with just a training loop (no validation, no testing).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zM15oxCH5lo6" - }, - "outputs": [], - "source": [ - "class MNISTModel(pl.LightningModule):\n", - "\n", - " def __init__(self):\n", - " super(MNISTModel, self).__init__()\n", - " self.l1 = torch.nn.Linear(28 * 28, 10)\n", - "\n", - " def forward(self, x):\n", - " return torch.relu(self.l1(x.view(x.size(0), -1)))\n", - "\n", - " def training_step(self, batch, batch_nb):\n", - " x, y = batch\n", - " loss = F.cross_entropy(self(x), y)\n", - " tensorboard_logs = {'train_loss': loss}\n", - " return {'loss': loss, 'log': tensorboard_logs}\n", - "\n", - " def configure_optimizers(self):\n", - " return torch.optim.Adam(self.parameters(), lr=0.02)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 639, - "referenced_widgets": [ - "bb305bb378774c1586a3196eb3babd29", - "ff6eead2826e4113abf7ab3a8cb31b0f", - "992f545bb6f1489493d89a59d046f63f", - "44e2c572ab1641a29156ad4ee8884e12", - "254ddfa7c23d4b6f828d515dbab38978", - "a700c003887b4d2ba134fbfcf4823cb5", - "32e47e93509043439580cd5f58dc7726", - "845415af79634da5a64b9f368280c0e3", - "a7a94847786244dd9a5cb9718957143a", - "8734c1b798ff4ba0bf77dca4f3de9cbf", - "ee0a78c01b63443f9e51470a1b1e79a4", - "632b9d0d9ffa4d479deb70f6fafb92ab", - "a9413692ae5040e6ae3c2a446dbe297c", - "ca4cd1659d73446e964f9ab36d92e3a0", - "42e787b78000472eab434fb795197a86", - "1a7680c6279d4985bd69188dd72b11d5", - "3397549a0695432990f1d3d5390941e7", - "515ef7d03ef2447e9643210b029b930e", - "ae52e3d810aa4bc5965559ed2ba2b78a", - "08b6d9269e514d228e7e94fe0299a2c5", - "7ee81979301c447bb13ff9fff5153e0f", - "ea162090fc954f0198a1d63507dfff9b", - "0cf9a61c88af45b6a6ef72640f93cbfd", - "67728556b4c9432b877d54a081657663", - "de325f4002a945b4a2a15086c2a77816", - "5012438370764b4db215d545e9414c94", - "6aafaca3c8824e2fa267f4a68d5d2ca3", - "c4200c1f957a4179af51245a797c8921", - "53b2a85381b1460d9f446390c79bfc08", - "59f02fe7f9f2433bb25f5b292c213f50", - "1dabf5740f4d44d68d06629f77b001e3", - "0f688614251d49589f320f2b2cb55344", - "c93f037dc6044d858ae1862d5b29f6f0", - "00ae53beaa9341f4826b1bdc0a6f88e0", - "4b7021f73f6b4e5193454128ccf323d7", - "6f55aa11acb14afdb2ac0a1052be1bb6", - "b5f184fbcba740999b205e34e23455d6", - "d9540ab5d2394b77a65f48b501acdc18", - "23fd97d95fae4f42bd21906f67115f8b", - 
"420e8d65e9584973a8004e8398cf430c" - ] - }, - "id": "5VEbFQp55wqo", - "outputId": "c2321d5d-bbad-4896-b41b-dbc9ed19340d" - }, - "outputs": [], - "source": [ - "train_loader = DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)\n", - "\n", - "mnist_model = MNISTModel()\n", - "trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=20) \n", - "trainer.fit(mnist_model, train_loader) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gjo55nA549pU" - }, - "source": [ - "### 1. LightningModule\n", - "Each project goes into a LightningModule.\n", - "This module houses:\n", - "1. Model definition (__init__)\n", - "2. Computations (forward)\n", - "3. What happens inside the training loop (training_step)\n", - "4. What happens inside the validation loop (validation_step)\n", - "5. What optimizer(s) to use (configure_optimizers)\n", - "6. What data to use (train_dataloader, val_dataloader, test_dataloader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x-34xKCI40yW" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import torch\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "import pytorch_lightning as pl\n", - "\n", - "class MNISTModel(pl.LightningModule):\n", - "\n", - " def __init__(self):\n", - " super(MNISTModel, self).__init__()\n", - " # not the best model...\n", - " self.l1 = torch.nn.Linear(28 * 28, 10)\n", - "\n", - " def forward(self, x):\n", - " # called with self(x)\n", - " return torch.relu(self.l1(x.view(x.size(0), -1)))\n", - "\n", - " def training_step(self, batch, batch_nb):\n", - " # REQUIRED\n", - " x, y = batch\n", - " y_hat = self(x)\n", - " loss = F.cross_entropy(y_hat, y)\n", - " tensorboard_logs = {'train_loss': loss}\n", - " return {'loss': loss, 'log': tensorboard_logs}\n", - "\n", - " def validation_step(self, batch, batch_nb):\n", - " # OPTIONAL\n", - " x, y = batch\n", - " y_hat = self(x)\n", - " return {'val_loss': F.cross_entropy(y_hat, y)}\n", - "\n", - " def validation_epoch_end(self, outputs):\n", - " # OPTIONAL\n", - " avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()\n", - " tensorboard_logs = {'val_loss': avg_loss}\n", - " return {'val_loss': avg_loss, 'log': tensorboard_logs}\n", - "\n", - " def test_step(self, batch, batch_nb):\n", - " # OPTIONAL\n", - " x, y = batch\n", - " y_hat = self(x)\n", - " return {'test_loss': F.cross_entropy(y_hat, y)}\n", - "\n", - " def test_epoch_end(self, outputs):\n", - " # OPTIONAL\n", - " avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()\n", - " logs = {'test_loss': avg_loss}\n", - " return {'test_loss': avg_loss, 'log': logs, 'progress_bar': logs}\n", - "\n", - " def configure_optimizers(self):\n", - " # REQUIRED\n", - " # can return multiple optimizers and learning_rate schedulers\n", - " # (LBFGS it is automatically supported, no need for closure function)\n", - " return torch.optim.Adam(self.parameters(), lr=0.02)\n", - "\n", - " def train_dataloader(self):\n", - " # REQUIRED\n", - " return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " # OPTIONAL\n", - " return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " # OPTIONAL\n", 
- " return DataLoader(MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor()), batch_size=32)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GROo8IDX-QCx" - }, - "source": [ - "### 2. Trainer\n", - "The trainer is where the magic happens!\n", - "Feed ANY LightningModule to a trainer to train the model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 510, - "referenced_widgets": [ - "6b3a598e7d01407aa5850b5a6620e7f0", - "e23c0d6c117246b2a0a6681008748917", - "e7a06a13ea11427ea3866cec3a55b644", - "620f45256d504f0188f40c61e23e1355", - "05f01b8813374534a4c58ac65fe2b390", - "ee3d759a4e4442288599eacfd1347c8d", - "c3587b40d9f942e98b708ff0b5fc5301", - "e64a723bcf474d6699d78ec05462f995", - "4ffe9fb35ca44358b0177723f73a35d8", - "ba3fe1aba1b04a1fbaab268dfd3d0166", - "fc3f14c4e83048aa9d6fe9963f95bf7a", - "171db7c8fa1e4f11aaff71b9f5879d58", - "4eaea330bc8e414fbf2f0e2b21af8b08", - "118c0b8da0df4ff68a90a3d500f1d1b8", - "135883097f0e428c963ae0ad320dfabd", - "85741a7765a147c4a8d69872850cb072", - "3d2e43ae9f924fbd8463ce72b44200f3", - "7c03e0a2247442ad9c11569e443de4bb", - "3a521cb700f247fd8cd345b158697f2f", - "8a40fcad728841c7ab2fd15d2c40ee5f", - "bc4dfb14c9d14499ae72d2a30fc6bc2d", - "866f880d850a4e689a1c99723f0366db", - "e7a864f4dc0f485eb045b778e981fc01", - "56ef38eff92143bcaf68b22c8dae7f98", - "a252ac32033b4e39b87a6c91bd21b5ae", - "40ce71ff339849748486ebbc73474cbe", - "f1db9d62eee44e61bb8ac26c16b3b601", - "69fa0a853cf84b9482043e12881c849f", - "d8db4d3709f34c869dedbc066e60501e", - "9b6445338a69425889a8901c192d5144", - "51b1111f5fe24042b38af809285e1b16", - "294d8142a4aa48aa8261b0b8155ef97f", - "a2990a67f92c4047b95876aae91e3de0", - "44ca9ee5c356458680a5d20c6a891c91", - "7e8cf26303ed4975b239fd43184a1dc6", - "d7406a8b15f9439fba19ec4dab086c61", - "fdf5c4a49602423184f6d94cd814177e", - "5d3c506d3f4444d8a6b7024cd11de2cd", - "42ede89dbd194eb6a603ccd7d4b96aae", - "ea13174e5b894e93b3c59d7e599de5a9", - "ee908316d227495381e8cf7dcf5526f1", - "a4b49709f7464ce491324e8aa636c152", - "8eb2086a01cf41429a5f4adff5f2359b", - "bbef89e4fd9d4cf8ae4c8fcab9bc665d", - "d01088cc378044cba4879032d74a852e", - "352d7dae131b407cb6e0238315c1b1a0", - "91a6de2063cc48b28021ef29feab7f69", - "39422514a4a04a9ba290285dc586ea9f", - "b73a326ada4d4a859e3c2c39abf5530d", - "cd942318db094680821f0d9902941977", - "29650c4a829b44ed9e1526b1dc5d2b83", - "df6521155d05459882601ba8c84f3dce", - "384a36423d154f2abcddb5094afeced3", - "a99b7813bf88496c875a818afe3b170a", - "f4a052d2223a4d4fa95ed52f94ad465d", - "7ea8ad4e10bb465aa2b6708655a2793f", - "15bb223836764207a5ac15616a41ddb7", - "46f7df7035d44bd099f60ad23f836f8a", - "296453e43f7344de8a9b5c6bc970ab1e", - "cd86997da08649d7999ade2d0e7cea96", - "7d15fc81537a449cb6b6afd7ccc65dac", - "bef041a9f5a942f68b4a8488a371d3da", - "e10c94b1fdf84a9186ab7d87fd83f87f", - "19c7460c565d494abbb8b9731a34294d" - ] - }, - "id": "HOk9c4_35FKg", - "outputId": "a07e65a7-7452-478d-f80e-179272b26b8a" - }, - "outputs": [], - "source": [ - "mnist_model = MNISTModel()\n", - "\n", - "# most basic trainer, uses good defaults (1 gpu)\n", - "trainer = pl.Trainer(gpus=1) \n", - "trainer.fit(mnist_model) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IV77NP_Ywkzb" - }, - "source": [ - "### Testing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 134, - "referenced_widgets": [ - 
"bcf69c2a0b694e0498beadb6f4509395", - "2e20c741cf8a401cb90e8e230a23026a", - "a7bcd18049d8493b9d3d9f17d86f0429", - "3c99401bde8641c19978c11c9abb906a", - "cd84335fb7234f3aa54dafe045614e56", - "f261b8aab86b4d6e94984bf658c1b74d", - "fd8ec919352046dd84057e9763bb235a", - "f778d9ef70ca4f5898c423109cf82ed2" - ] - }, - "id": "-Bnkq97qhe2x", - "outputId": "9db00280-ef5b-4ae4-8a6d-174590ae6d0c" - }, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Q-qxNrXvKAlN" - }, - "source": [ - "### Plotting\n", - "\n", - "Plot the results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xF9-ouAEGFlZ" - }, - "source": [ - "By using the trainer you automatically get:\n", - "1. Tensorboard logging\n", - "2. Model checkpointing\n", - "3. Training and validation loop\n", - "4. early-stopping" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "18STRwHg-kW8" - }, - "source": [ - "### Bonus\n", - "In fact, if you keep calling fit, it'll keep training the model where it left off!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 83, - "referenced_widgets": [ - "6aa5e292e2094c239e7418994a31ff51", - "555443a6fa564d10a3a7901cf15a79a3", - "1f9c48164702427fb3aca2a26b2651e5", - "d8bd5c9b233b41008109d14cffc89aaa", - "61c71d4f1c2848b1813aebc0b2db5e25", - "a1e2c38bb40642168cc9d44abf645a54", - "d0590d65433c4478af6a0762421f9f7a", - "76c916d634c644a4a0d8f12e183822fd", - "a38242d3231442e2a259067d6a1355c2", - "9a9ebf052d914a8881882da8d2fa9cd8", - "a56deb884719491090a4146e72be3868", - "7633820adf9a4757ae73b472e43031d6", - "b7a073dfdeaf48fc9f3e6352b0ea2ba7", - "8aab627e715a44ada2af81b74bece257", - "fc262db2a53948488092a77209081319", - "11db4a94a4534fc2b503aad28be631be" - ] - }, - "id": "U2d1gc4N5IJX", - "outputId": "f68aaf1f-dfa9-4f30-de7e-d4fdab9eb089" - }, - "outputs": [], - "source": [ - "trainer.fit(mnist_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "P0bSmCw57aV5" - }, - "source": [ - "---\n", - "## GAN Example\n", - "\n", - "How to train a GAN!\n", - "\n", - "Main takeaways:\n", - "1. Generator and discriminator are arbitraty PyTorch modules.\n", - "2. training_step does both the generator and discriminator training." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pBhBR3QJ7mhx" - }, - "source": [ - "#### A. 
Generator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mesU_huG-rr6" - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "To run this template just do:\n", - "python gan.py\n", - "After a few epochs, launch tensorboard to see the images being generated at every batch.\n", - "tensorboard --logdir default\n", - "\"\"\"\n", - "import os\n", - "from argparse import ArgumentParser\n", - "from collections import OrderedDict\n", - "\n", - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torchvision\n", - "import torchvision.transforms as transforms\n", - "from torch.utils.data import DataLoader\n", - "from torchvision.datasets import MNIST\n", - "\n", - "import pytorch_lightning as pl\n", - "\n", - "\n", - "class Generator(nn.Module):\n", - " def __init__(self, latent_dim, img_shape):\n", - " super(Generator, self).__init__()\n", - " self.img_shape = img_shape\n", - "\n", - " def block(in_feat, out_feat, normalize=True):\n", - " layers = [nn.Linear(in_feat, out_feat)]\n", - " if normalize:\n", - " layers.append(nn.BatchNorm1d(out_feat, 0.8))\n", - " layers.append(nn.LeakyReLU(0.2, inplace=True))\n", - " return layers\n", - "\n", - " self.model = nn.Sequential(\n", - " *block(latent_dim, 128, normalize=False),\n", - " *block(128, 256),\n", - " *block(256, 512),\n", - " *block(512, 1024),\n", - " nn.Linear(1024, int(np.prod(img_shape))),\n", - " nn.Tanh()\n", - " )\n", - "\n", - " def forward(self, z):\n", - " img = self.model(z)\n", - " img = img.view(img.size(0), *self.img_shape)\n", - " return img" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bt37ycLx7uO3" - }, - "source": [ - "### B. Discriminator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pcPCt8JG7tI-" - }, - "outputs": [], - "source": [ - "class Discriminator(nn.Module):\n", - " def __init__(self, img_shape):\n", - " super(Discriminator, self).__init__()\n", - "\n", - " self.model = nn.Sequential(\n", - " nn.Linear(int(np.prod(img_shape)), 512),\n", - " nn.LeakyReLU(0.2, inplace=True),\n", - " nn.Linear(512, 256),\n", - " nn.LeakyReLU(0.2, inplace=True),\n", - " nn.Linear(256, 1),\n", - " nn.Sigmoid(),\n", - " )\n", - "\n", - " def forward(self, img):\n", - " img_flat = img.view(img.size(0), -1)\n", - " validity = self.model(img_flat)\n", - "\n", - " return validity" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TyYOdg8g77P0" - }, - "source": [ - "### C. 
GAN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ArrPXFM371jR" - }, - "outputs": [], - "source": [ - "class GAN(pl.LightningModule):\n", - "\n", - " def __init__(self, hparams):\n", - " super(GAN, self).__init__()\n", - " self.hparams = hparams\n", - "\n", - " # networks\n", - " mnist_shape = (1, 28, 28)\n", - " self.generator = Generator(latent_dim=hparams.latent_dim, img_shape=mnist_shape)\n", - " self.discriminator = Discriminator(img_shape=mnist_shape)\n", - "\n", - " # cache for generated images\n", - " self.generated_imgs = None\n", - " self.last_imgs = None\n", - "\n", - " def forward(self, z):\n", - " return self.generator(z)\n", - "\n", - " def adversarial_loss(self, y_hat, y):\n", - " return F.binary_cross_entropy(y_hat, y)\n", - "\n", - " def training_step(self, batch, batch_nb, optimizer_idx):\n", - " imgs, _ = batch\n", - " self.last_imgs = imgs\n", - "\n", - " # train generator\n", - " if optimizer_idx == 0:\n", - " # sample noise\n", - " z = torch.randn(imgs.shape[0], self.hparams.latent_dim)\n", - "\n", - " # match gpu device (or keep as cpu)\n", - " if self.on_gpu:\n", - " z = z.cuda(imgs.device.index)\n", - "\n", - " # generate images\n", - " self.generated_imgs = self(z)\n", - "\n", - " # log sampled images\n", - " # sample_imgs = self.generated_imgs[:6]\n", - " # grid = torchvision.utils.make_grid(sample_imgs)\n", - " # self.logger.experiment.add_image('generated_images', grid, 0)\n", - "\n", - " # ground truth result (ie: all fake)\n", - " # put on GPU because we created this tensor inside training_loop\n", - " valid = torch.ones(imgs.size(0), 1)\n", - " if self.on_gpu:\n", - " valid = valid.cuda(imgs.device.index)\n", - "\n", - " # adversarial loss is binary cross-entropy\n", - " g_loss = self.adversarial_loss(self.discriminator(self.generated_imgs), valid)\n", - " tqdm_dict = {'g_loss': g_loss}\n", - " output = OrderedDict({\n", - " 'loss': g_loss,\n", - " 'progress_bar': tqdm_dict,\n", - " 'log': tqdm_dict\n", - " })\n", - " return output\n", - "\n", - " # train discriminator\n", - " if optimizer_idx == 1:\n", - " # Measure discriminator's ability to classify real from generated samples\n", - "\n", - " # how well can it label as real?\n", - " valid = torch.ones(imgs.size(0), 1)\n", - " if self.on_gpu:\n", - " valid = valid.cuda(imgs.device.index)\n", - "\n", - " real_loss = self.adversarial_loss(self.discriminator(imgs), valid)\n", - "\n", - " # how well can it label as fake?\n", - " fake = torch.zeros(imgs.size(0), 1)\n", - " if self.on_gpu:\n", - " fake = fake.cuda(imgs.device.index)\n", - "\n", - " fake_loss = self.adversarial_loss(\n", - " self.discriminator(self.generated_imgs.detach()), fake)\n", - "\n", - " # discriminator loss is the average of these\n", - " d_loss = (real_loss + fake_loss) / 2\n", - " tqdm_dict = {'d_loss': d_loss}\n", - " output = OrderedDict({\n", - " 'loss': d_loss,\n", - " 'progress_bar': tqdm_dict,\n", - " 'log': tqdm_dict\n", - " })\n", - " return output\n", - "\n", - " def configure_optimizers(self):\n", - " lr = self.hparams.lr\n", - " b1 = self.hparams.b1\n", - " b2 = self.hparams.b2\n", - "\n", - " opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))\n", - " opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))\n", - " return [opt_g, opt_d], []\n", - "\n", - " def train_dataloader(self):\n", - " transform = transforms.Compose([transforms.ToTensor(),\n", - " transforms.Normalize([0.5], [0.5])])\n", - " dataset = MNIST(os.getcwd(), 
train=True, download=True, transform=transform)\n", - " return DataLoader(dataset, batch_size=self.hparams.batch_size)\n", - "\n", - " def on_epoch_end(self):\n", - " z = torch.randn(8, self.hparams.latent_dim)\n", - " # match gpu device (or keep as cpu)\n", - " if self.on_gpu:\n", - " z = z.cuda(self.last_imgs.device.index)\n", - "\n", - " # log sampled images\n", - " sample_imgs = self(z)\n", - " grid = torchvision.utils.make_grid(sample_imgs)\n", - " self.logger.experiment.add_image(f'generated_images', grid, self.current_epoch)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-WRY6dfn8ScZ" - }, - "source": [ - "### D. Trainer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xsmHHcpP8ryX" - }, - "source": [ - "Here we fake using argparse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fIJl3phH8uEI" - }, - "outputs": [], - "source": [ - "from argparse import Namespace\n", - "\n", - "args = {\n", - " 'batch_size': 32,\n", - " 'lr': 0.0002,\n", - " 'b1': 0.5,\n", - " 'b2': 0.999,\n", - " 'latent_dim': 100\n", - "}\n", - "hparams = Namespace(**args)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 83, - "referenced_widgets": [ - "5f119b90386c499ea9caf987fecf6c06", - "6d391353197443a694f6c75147ca96df", - "47df0bc1b3d14bb7b673e0591daa4e5f", - "87e118f890dc42319e723331e1306787", - "e068e2b1c68c48a784c19fc716c043a3", - "7662324b3b924f8f9649dc409fb0d349", - "afc85a52a5d04653ae9e7168b180ff98", - "dbb9fd5429f5416ab6a4f78f0c72867c" - ] - }, - "id": "h788dCGu7_Iu", - "outputId": "bcebc504-f0fc-496b-c8d5-a0c2f3349155" - }, - "outputs": [], - "source": [ - "gan_model = GAN(hparams)\n", - "\n", - "# most basic trainer, uses good defaults (1 gpu)\n", - "trainer = pl.Trainer(gpus=1) \n", - "trainer.fit(gan_model) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7uQVI-xv9Ddj" - }, - "source": [ - "--- \n", - "## BERT example\n", - "BERT + Lightning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e2npX-Gi9uwa" - }, - "outputs": [], - "source": [ - "! 
pip install transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeLyZQ_E9o1T" - }, - "source": [ - "#### Data download + processing\n", - "\n", - "Let's grab the correct data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 164, - "referenced_widgets": [ - "5484eef7b6f247d68a89f86965b0940f", - "0c3473a16a5e4c46a6c7515e610bca7f", - "ad849800b2124195b92f3bf9dfc7681b", - "6ae5b2f9195847b5a0aa9991e14aa397", - "240764252e7c4f5ca39db14fd1c724ed", - "386ff59e3694480394253f1c24ff8e84", - "70e48d7d8e8a411a90642926db4aada8", - "1f3364ab59b541268fabcb3f9fb5c64c", - "0fad6468e3c849b380e34f674e074219", - "10a88a05740b45d4a6ea5873d4a7151a", - "d3b107acd1b1401cabe3090724e12e86", - "b3563100dd1b4a4abe14ab7193649064", - "17f0e360e85f48d9a17b84c9b7f6c9f0", - "29f35103a6e94af09c8ac9cdb2cca89c", - "e6e15d5c14134be0b4cf86fdecfef687", - "f23f02d00d424574afa29311b8d0906e", - "e918a6de59b64bd590e4f1233bbc078a", - "abeb0a773f3542c39ff724ae0674b74e", - "892246fdf6bb476abb35ec321ddf86e8", - "88c181cd21a94ec9a43df9754c1986c9", - "e4098b0091124fef8ba342783a82cc6e", - "498a50387a0742a88356a7ee9920bf7a", - "86482894cddd4956ae2fc3d9edd8ef9a", - "438d19fb8e8243ebbc658f4b1d27df99" - ] - }, - "id": "eBP6FeY18_Ck", - "outputId": "b2a5c5fd-88cf-4428-d196-9e1c1ddc7e30" - }, - "outputs": [], - "source": [ - "from transformers.data.processors.glue import MnliProcessor\n", - "import torch\n", - "from transformers import (\n", - " BertModel,\n", - " BertTokenizer\n", - ")\n", - "\n", - "tokenizer = BertTokenizer.from_pretrained('bert-base-cased')\n", - "bert = BertModel.from_pretrained('bert-base-cased', output_attentions=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vMbozzxs9xq_" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import os\n", - "import sys\n", - "import shutil\n", - "import argparse\n", - "import tempfile\n", - "import urllib.request\n", - "import zipfile\n", - "\n", - "TASKS = [\"CoLA\", \"SST\", \"MRPC\", \"QQP\", \"STS\", \"MNLI\", \"SNLI\", \"QNLI\", \"RTE\", \"WNLI\", \"diagnostic\"]\n", - "TASK2PATH = {\n", - " \"CoLA\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4\", # noqa\n", - " \"SST\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8\", # noqa\n", - " \"MRPC\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc\", # noqa\n", - " \"QQP\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP-clean.zip?alt=media&token=11a647cb-ecd3-49c9-9d31-79f8ca8fe277\", # noqa\n", - " \"STS\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5\", # noqa\n", - " \"MNLI\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce\", # noqa\n", - " \"SNLI\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df\", # noqa\n", - " \"QNLI\": 
\"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601\", # noqa\n", - " \"RTE\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb\", # noqa\n", - " \"WNLI\": \"https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf\", # noqa\n", - " \"diagnostic\": [\n", - " \"https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D\", # noqa\n", - " \"https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1\",\n", - " ],\n", - "}\n", - "\n", - "MRPC_TRAIN = \"https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt\"\n", - "MRPC_TEST = \"https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt\"\n", - "\n", - "\n", - "def download_and_extract(task, data_dir):\n", - " print(\"Downloading and extracting %s...\" % task)\n", - " data_file = \"%s.zip\" % task\n", - " urllib.request.urlretrieve(TASK2PATH[task], data_file)\n", - " with zipfile.ZipFile(data_file) as zip_ref:\n", - " zip_ref.extractall(data_dir)\n", - " os.remove(data_file)\n", - " print(\"\\tCompleted!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - }, - "id": "3CVHOXQY9yVm", - "outputId": "f06b886b-cc32-4972-918e-f4ca5828fb2c" - }, - "outputs": [], - "source": [ - "download_and_extract('MNLI', '.')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOR0Q1Yg-HmN" - }, - "outputs": [], - "source": [ - "from transformers import glue_convert_examples_to_features as convert_examples_to_features\n", - "from transformers import BertTokenizer\n", - "from torch.utils.data import TensorDataset, RandomSampler, DataLoader, random_split\n", - "\n", - "processor = MnliProcessor()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yuUwBKpn-TIK" - }, - "source": [ - "#### Data loaders\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kMdQZUjO-MI7" - }, - "outputs": [], - "source": [ - "def generate_mnli_bert_dataloaders():\n", - " # ----------------------\n", - " # TRAIN/VAL DATALOADERS\n", - " # ----------------------\n", - " train = processor.get_train_examples('MNLI')\n", - " features = convert_examples_to_features(train,\n", - " tokenizer,\n", - " label_list=['contradiction','neutral','entailment'],\n", - " max_length=128,\n", - " output_mode='classification',\n", - " pad_on_left=False,\n", - " pad_token=tokenizer.pad_token_id,\n", - " pad_token_segment_id=0)\n", - " train_dataset = TensorDataset(torch.tensor([f.input_ids for f in features], dtype=torch.long), \n", - " torch.tensor([f.attention_mask for f in features], dtype=torch.long), \n", - " 
torch.tensor([f.token_type_ids for f in features], dtype=torch.long), \n", - "                                torch.tensor([f.label for f in features], dtype=torch.long))\n", - "\n", - "    nb_train_samples = int(0.95 * len(train_dataset))\n", - "    nb_val_samples = len(train_dataset) - nb_train_samples\n", - "\n", - "    bert_mnli_train_dataset, bert_mnli_val_dataset = random_split(train_dataset, [nb_train_samples, nb_val_samples])\n", - "\n", - "    # train loader\n", - "    train_sampler = RandomSampler(bert_mnli_train_dataset)\n", - "    bert_mnli_train_dataloader = DataLoader(bert_mnli_train_dataset, sampler=train_sampler, batch_size=32)\n", - "\n", - "    # val loader\n", - "    val_sampler = RandomSampler(bert_mnli_val_dataset)\n", - "    bert_mnli_val_dataloader = DataLoader(bert_mnli_val_dataset, sampler=val_sampler, batch_size=32)\n", - "\n", - "    # ----------------------\n", - "    # TEST DATALOADERS\n", - "    # ----------------------\n", - "    dev = processor.get_dev_examples('MNLI')\n", - "    features = convert_examples_to_features(dev,\n", - "                                            tokenizer,\n", - "                                            label_list=['contradiction','neutral','entailment'],\n", - "                                            max_length=128,\n", - "                                            output_mode='classification',\n", - "                                            pad_on_left=False,\n", - "                                            pad_token=tokenizer.pad_token_id,\n", - "                                            pad_token_segment_id=0)\n", - "\n", - "    bert_mnli_test_dataset = TensorDataset(torch.tensor([f.input_ids for f in features], dtype=torch.long), \n", - "                                torch.tensor([f.attention_mask for f in features], dtype=torch.long), \n", - "                                torch.tensor([f.token_type_ids for f in features], dtype=torch.long), \n", - "                                torch.tensor([f.label for f in features], dtype=torch.long))\n", - "\n", - "    # test loader\n", - "    test_sampler = RandomSampler(bert_mnli_test_dataset)\n", - "    bert_mnli_test_dataloader = DataLoader(bert_mnli_test_dataset, sampler=test_sampler, batch_size=32)\n", - "\n", - "    return bert_mnli_train_dataloader, bert_mnli_val_dataloader, bert_mnli_test_dataloader" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iV-baDhN-U6B" - }, - "outputs": [], - "source": [ - "bert_mnli_train_dataloader, bert_mnli_val_dataloader, bert_mnli_test_dataloader = generate_mnli_bert_dataloaders()\n" - ] - },
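- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Quick sanity check (an illustrative addition, not part of the original notebook): pull one batch from the train loader and confirm the tensor shapes. With the settings above, the ids, masks and segment ids should be `[32, 128]` and the labels `[32]`." - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Illustrative addition: inspect one batch before building the model.\n", - "input_ids, attention_mask, token_type_ids, label = next(iter(bert_mnli_train_dataloader))\n", - "print(input_ids.shape, attention_mask.shape, token_type_ids.shape, label.shape)" - ] - },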
- { - "cell_type": "markdown", - "metadata": { - "id": "yr7eaxkF-djf" - }, - "source": [ - "### BERT Lightning module!\n", - "\n", - "Finally, we can create the LightningModule" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UIXLW8CO-W8w" - }, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score\n", - "\n", - "from torch import nn\n", - "import torch.nn.functional as F\n", - "\n", - "\n", - "class BertMNLIFinetuner(pl.LightningModule):\n", - "\n", - "    def __init__(self):\n", - "        super(BertMNLIFinetuner, self).__init__()\n", - "\n", - "        self.bert = bert\n", - "        self.W = nn.Linear(bert.config.hidden_size, 3)\n", - "        self.num_classes = 3\n", - "\n", - "    def forward(self, input_ids, attention_mask, token_type_ids):\n", - "        h, _, attn = self.bert(input_ids=input_ids,\n", - "                               attention_mask=attention_mask,\n", - "                               token_type_ids=token_type_ids)\n", - "        h_cls = h[:, 0]\n", - "        logits = self.W(h_cls)\n", - "        return logits, attn\n", - "\n", - "    def training_step(self, batch, batch_nb):\n", - "        # batch\n", - "        input_ids, attention_mask, token_type_ids, label = batch\n", - "\n", - "        # fwd\n", - "        y_hat, attn = self(input_ids, attention_mask, token_type_ids)\n", - "\n", - "        # loss\n", - "        loss = F.cross_entropy(y_hat, label)\n", - "\n", - "        # logs\n", - "        tensorboard_logs = {'train_loss': loss}\n", - "        return {'loss': loss, 'log': tensorboard_logs}\n", - "\n", - "    def validation_step(self, batch, batch_nb):\n", - "        # batch\n", - "        input_ids, attention_mask, token_type_ids, label = batch\n", - "\n", - "        # fwd\n", - "        y_hat, attn = self(input_ids, attention_mask, token_type_ids)\n", - "\n", - "        # loss\n", - "        loss = F.cross_entropy(y_hat, label)\n", - "\n", - "        # acc\n", - "        _, y_hat = torch.max(y_hat, dim=1)\n", - "        val_acc = accuracy_score(y_hat.cpu(), label.cpu())\n", - "        val_acc = torch.tensor(val_acc)\n", - "\n", - "        return {'val_loss': loss, 'val_acc': val_acc}\n", - "\n", - "    def validation_epoch_end(self, outputs):\n", - "        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()\n", - "        avg_val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()\n", - "\n", - "        tensorboard_logs = {'val_loss': avg_loss, 'avg_val_acc': avg_val_acc}\n", - "        return {'val_loss': avg_loss, 'progress_bar': tensorboard_logs}\n", - "\n", - "    def test_step(self, batch, batch_nb):\n", - "        input_ids, attention_mask, token_type_ids, label = batch\n", - "\n", - "        y_hat, attn = self(input_ids, attention_mask, token_type_ids)\n", - "\n", - "        _, y_hat = torch.max(y_hat, dim=1)\n", - "        test_acc = accuracy_score(y_hat.cpu(), label.cpu())\n", - "\n", - "        return {'test_acc': torch.tensor(test_acc)}\n", - "\n", - "    def test_epoch_end(self, outputs):\n", - "        avg_test_acc = torch.stack([x['test_acc'] for x in outputs]).mean()\n", - "\n", - "        tensorboard_logs = {'avg_test_acc': avg_test_acc}\n", - "        return {'avg_test_acc': avg_test_acc, 'log': tensorboard_logs, 'progress_bar': tensorboard_logs}\n", - "\n", - "    def configure_optimizers(self):\n", - "        return torch.optim.Adam([p for p in self.parameters() if p.requires_grad], lr=2e-05, eps=1e-08)\n", - "\n", - "    def train_dataloader(self):\n", - "        return bert_mnli_train_dataloader\n", - "\n", - "    def val_dataloader(self):\n", - "        return bert_mnli_val_dataloader\n", - "\n", - "    def test_dataloader(self):\n", - "        return bert_mnli_test_dataloader" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "FHt8tgwa_DcM" - }, - "source": [ - "### Trainer" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 83, - "referenced_widgets": [ - "86bedd1fc6da4b8fa0deac637628729e", - "f444ab7646444b9885cfec41b5a2236e", - "fad0b06dc57e4b4599cf43daad7106b8", - "c190999c2761453380f816372fcca608", - "a5cc9e60aff641dca27f1adf6807e5b3", - "0a96cc26343e4bb2ac2f5145be2fbacf", - "cce9ed8de0a048679453e53b71523eea", - "773fd1b84c364903bc7350630e76a825", - "0e149cc766d147aba2c05f8b0f2c69d5", - "191f483b5b0346a8a28cac37f29ac2dc", - "24b28a7423a541c0b84ba93d70416c1a", - "4820f0005e60493793e506e9f0caf5d4", - "fce1fc72006f4e84a6497a493cbbfca2", - "f220485e332d4c3cbfc3c45ce3b5fdf1", - "bf257b8a04b44a389da2e6f4c64379d4", - "7efa007fdb2d4e06b5f34c4286fe9a2f" - ] - }, - "id": "gMRMJ-Kd-oup", - "outputId": "790ab73c-b37d-4bcb-af5f-46b464e46f9b" - }, - "outputs": [], - "source": [ - "bert_finetuner = BertMNLIFinetuner()\n", - "\n", - "# most basic trainer, uses good defaults (1 gpu)\n", - "trainer = pl.Trainer(gpus=1)\n", - "trainer.fit(bert_finetuner)" - ] - },
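- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(Illustrative addition, not part of the original notebook.) Once `fit` finishes, the dev-set accuracy defined in `test_step`/`test_epoch_end` above can be computed by handing the same module to `trainer.test`:" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Illustrative addition: run the test hooks of the fine-tuned module.\n", - "trainer.test(bert_finetuner)" - ] - },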
- { - "cell_type": "markdown", - "metadata": { - "id": "NWvMLBDySQI5" - }, - "source": [ - "## DQN example\n", - "\n", - "How to train a Deep Q Network\n", - "\n", - "Main takeaways:\n", - "1. RL has the same flow as previous models we have seen, with a few additions\n", - "2. Handle unsupervised learning by using an IterableDataset where the dataset itself is constantly updated during training\n", - "3. Each training step has the agent taking an action in the environment and storing the experience in the IterableDataset" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 146 - }, - "id": "4ARIT37rDdIZ", - "outputId": "37ea5092-0db7-4e73-b507-f4be9bb0ae7e" - }, - "outputs": [], - "source": [ - "!pip install gym" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "nm9BKoF0Sv_O" - }, - "source": [ - "### DQN Network" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FXkKtnEhSaIV" - }, - "outputs": [], - "source": [ - "from torch import nn\n", - "\n", - "class DQN(nn.Module):\n", - "    \"\"\"\n", - "    Simple MLP network\n", - "\n", - "    Args:\n", - "        obs_size: observation/state size of the environment\n", - "        n_actions: number of discrete actions available in the environment\n", - "        hidden_size: size of hidden layers\n", - "    \"\"\"\n", - "\n", - "    def __init__(self, obs_size: int, n_actions: int, hidden_size: int = 128):\n", - "        super(DQN, self).__init__()\n", - "        self.net = nn.Sequential(\n", - "            nn.Linear(obs_size, hidden_size),\n", - "            nn.ReLU(),\n", - "            nn.Linear(hidden_size, n_actions)\n", - "        )\n", - "\n", - "    def forward(self, x):\n", - "        return self.net(x.float())" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "c9clSz7xTFZf" - }, - "source": [ - "### Memory" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zUmawp0ITE3I" - }, - "outputs": [], - "source": [ - "from collections import namedtuple\n", - "\n", - "# Named tuple for storing experience steps gathered in training\n", - "Experience = namedtuple(\n", - "    'Experience', field_names=['state', 'action', 'reward',\n", - "                               'done', 'new_state'])" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Zs7h_Z0LTVoy" - }, - "outputs": [], - "source": [ - "from collections import deque\n", - "from typing import Tuple\n", - "\n", - "import numpy as np\n", - "\n", - "class ReplayBuffer:\n", - "    \"\"\"\n", - "    Replay Buffer for storing past experiences allowing the agent to learn from them\n", - "\n", - "    Args:\n", - "        capacity: size of the buffer\n", - "    \"\"\"\n", - "\n", - "    def __init__(self, capacity: int) -> None:\n", - "        self.buffer = deque(maxlen=capacity)\n", - "\n", - "    def __len__(self) -> int:\n", - "        return len(self.buffer)\n", - "\n", - "    def append(self, experience: Experience) -> None:\n", - "        \"\"\"\n", - "        Add experience to the buffer\n", - "\n", - "        Args:\n", - "            experience: tuple (state, action, reward, done, new_state)\n", - "        \"\"\"\n", - "        self.buffer.append(experience)\n", - "\n", - "    def sample(self, batch_size: int) -> Tuple:\n", - "        indices = np.random.choice(len(self.buffer), batch_size, replace=False)\n", - "        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])\n", - "\n", - "        return (np.array(states), np.array(actions), np.array(rewards, dtype=np.float32),\n", - "                np.array(dones, dtype=bool), np.array(next_states))" - ] - },
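- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(Illustrative addition, not part of the original notebook.) A quick smoke test of the buffer: fill it with a few dummy `Experience` tuples and sample a batch. `sample` returns stacked numpy arrays, so the states come back with shape `(batch_size, state_dim)`." - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Illustrative addition: smoke-test the replay buffer with dummy experiences.\n", - "buf = ReplayBuffer(capacity=10)\n", - "for i in range(10):\n", - "    buf.append(Experience([float(i)], 0, 1.0, False, [float(i + 1)]))\n", - "states, actions, rewards, dones, next_states = buf.sample(4)\n", - "print(states.shape, rewards.dtype, dones.dtype)" - ] - },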
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R5UK2VRvTgS1" - }, - "outputs": [], - "source": [ - "from torch.utils.data.dataset import IterableDataset\n", - "\n", - "class RLDataset(IterableDataset):\n", - "    \"\"\"\n", - "    Iterable Dataset containing the ExperienceBuffer\n", - "    which will be updated with new experiences during training\n", - "\n", - "    Args:\n", - "        buffer: replay buffer\n", - "        sample_size: number of experiences to sample at a time\n", - "    \"\"\"\n", - "\n", - "    def __init__(self, buffer: ReplayBuffer, sample_size: int = 200) -> None:\n", - "        self.buffer = buffer\n", - "        self.sample_size = sample_size\n", - "\n", - "    def __iter__(self) -> Tuple:\n", - "        states, actions, rewards, dones, new_states = self.buffer.sample(self.sample_size)\n", - "        for i in range(len(dones)):\n", - "            yield states[i], actions[i], rewards[i], dones[i], new_states[i]" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "d7sCGSURTuQK" - }, - "source": [ - "### Agent" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dS2RpSHHTvpO" - }, - "outputs": [], - "source": [ - "import gym\n", - "import torch\n", - "\n", - "class Agent:\n", - "    \"\"\"\n", - "    Base Agent class handling the interaction with the environment\n", - "\n", - "    Args:\n", - "        env: training environment\n", - "        replay_buffer: replay buffer storing experiences\n", - "    \"\"\"\n", - "\n", - "    def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None:\n", - "        self.env = env\n", - "        self.replay_buffer = replay_buffer\n", - "        self.reset()\n", - "\n", - "    def reset(self) -> None:\n", - "        \"\"\" Resets the environment and updates the state\"\"\"\n", - "        self.state = self.env.reset()\n", - "\n", - "    def get_action(self, net: nn.Module, epsilon: float, device: str) -> int:\n", - "        \"\"\"\n", - "        Using the given network, decide what action to carry out\n", - "        using an epsilon-greedy policy\n", - "\n", - "        Args:\n", - "            net: DQN network\n", - "            epsilon: value to determine likelihood of taking a random action\n", - "            device: current device\n", - "\n", - "        Returns:\n", - "            action\n", - "        \"\"\"\n", - "        if np.random.random() < epsilon:\n", - "            action = self.env.action_space.sample()\n", - "        else:\n", - "            state = torch.tensor([self.state])\n", - "\n", - "            if device not in ['cpu']:\n", - "                state = state.cuda(device)\n", - "\n", - "            q_values = net(state)\n", - "            _, action = torch.max(q_values, dim=1)\n", - "            action = int(action.item())\n", - "\n", - "        return action\n", - "\n", - "    @torch.no_grad()\n", - "    def play_step(self, net: nn.Module, epsilon: float = 0.0, device: str = 'cpu') -> Tuple[float, bool]:\n", - "        \"\"\"\n", - "        Carries out a single interaction step between the agent and the environment\n", - "\n", - "        Args:\n", - "            net: DQN network\n", - "            epsilon: value to determine likelihood of taking a random action\n", - "            device: current device\n", - "\n", - "        Returns:\n", - "            reward, done\n", - "        \"\"\"\n", - "        action = self.get_action(net, epsilon, device)\n", - "\n", - "        # do step in the environment\n", - "        new_state, reward, done, _ = self.env.step(action)\n", - "\n", - "        exp = Experience(self.state, action, reward, done, new_state)\n", - "\n", - "        self.replay_buffer.append(exp)\n", - "\n", - "        self.state = new_state\n", - "        if done:\n", - "            self.reset()\n", - "        return reward, done" - ] - },
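- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(Illustrative addition, not part of the original notebook.) Before wiring the agent into a LightningModule, we can exercise it directly: with `epsilon=1.0` every action is random, so a few `play_step` calls should populate the replay buffer." - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Illustrative addition: random steps in CartPole-v0 fill the buffer.\n", - "env = gym.make('CartPole-v0')\n", - "buffer = ReplayBuffer(capacity=100)\n", - "agent = Agent(env, buffer)\n", - "net = DQN(obs_size=env.observation_space.shape[0], n_actions=env.action_space.n)\n", - "for _ in range(5):\n", - "    reward, done = agent.play_step(net, epsilon=1.0)\n", - "print(len(buffer))" - ] - },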
- { - "cell_type": "markdown", - "metadata": { - "id": "IAlT0-75T_Kv" - }, - "source": [ - "### DQN Lightning Module" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BS5D7s83T13H" - }, - "outputs": [], - "source": [ - "import pytorch_lightning as pl\n", - "import argparse\n", - "from collections import OrderedDict, deque\n", - "from typing import Tuple, List\n", - "import torch.optim as optim\n", - "from torch.optim import Optimizer\n", - "from torch.utils.data import DataLoader\n", - "\n", - "class DQNLightning(pl.LightningModule):\n", - "    \"\"\" Basic DQN Model \"\"\"\n", - "\n", - "    def __init__(self, hparams: argparse.Namespace) -> None:\n", - "        super().__init__()\n", - "        self.hparams = hparams\n", - "\n", - "        self.env = gym.make(self.hparams.env)\n", - "        obs_size = self.env.observation_space.shape[0]\n", - "        n_actions = self.env.action_space.n\n", - "\n", - "        self.net = DQN(obs_size, n_actions)\n", - "        self.target_net = DQN(obs_size, n_actions)\n", - "\n", - "        self.buffer = ReplayBuffer(self.hparams.replay_size)\n", - "        self.agent = Agent(self.env, self.buffer)\n", - "        self.total_reward = 0\n", - "        self.episode_reward = 0\n", - "        self.populate(self.hparams.warm_start_steps)\n", - "\n", - "    def populate(self, steps: int = 1000) -> None:\n", - "        \"\"\"\n", - "        Carries out several random steps through the environment to initially fill\n", - "        up the replay buffer with experiences\n", - "\n", - "        Args:\n", - "            steps: number of random steps to populate the buffer with\n", - "        \"\"\"\n", - "        for i in range(steps):\n", - "            self.agent.play_step(self.net, epsilon=1.0)\n", - "\n", - "    def forward(self, x: torch.Tensor) -> torch.Tensor:\n", - "        \"\"\"\n", - "        Passes in a state x through the network and gets the q_values of each action as an output\n", - "\n", - "        Args:\n", - "            x: environment state\n", - "\n", - "        Returns:\n", - "            q values\n", - "        \"\"\"\n", - "        output = self.net(x)\n", - "        return output\n", - "\n", - "    def dqn_mse_loss(self, batch: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:\n", - "        \"\"\"\n", - "        Calculates the mse loss using a mini batch from the replay buffer\n", - "\n", - "        Args:\n", - "            batch: current mini batch of replay data\n", - "\n", - "        Returns:\n", - "            loss\n", - "        \"\"\"\n", - "        states, actions, rewards, dones, next_states = batch\n", - "\n", - "        state_action_values = self.net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)\n", - "\n", - "        with torch.no_grad():\n", - "            next_state_values = self.target_net(next_states).max(1)[0]\n", - "            next_state_values[dones] = 0.0\n", - "            next_state_values = next_state_values.detach()\n", - "\n", - "        expected_state_action_values = next_state_values * self.hparams.gamma + rewards\n", - "\n", - "        return nn.MSELoss()(state_action_values, expected_state_action_values)\n", - "\n", - "    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], nb_batch) -> OrderedDict:\n", - "        \"\"\"\n", - "        Carries out a single step through the environment to update the replay buffer.\n", - "        Then calculates loss based on the minibatch received\n", - "\n", - "        Args:\n", - "            batch: current mini batch of replay data\n", - "            nb_batch: batch number\n", - "\n", - "        Returns:\n", - "            Training loss and log metrics\n", - "        \"\"\"\n", - "        device = self.get_device(batch)\n", - "        epsilon = max(self.hparams.eps_end, self.hparams.eps_start -\n", - "                      (self.global_step + 1) / self.hparams.eps_last_frame)\n", - "\n", - "        # step through environment with agent\n", - "        reward, done = self.agent.play_step(self.net, epsilon, device)\n", - "        self.episode_reward += reward\n", - "\n", - "        # calculates training loss\n", - "        loss = self.dqn_mse_loss(batch)\n", - "\n", - "        if self.trainer.use_dp or self.trainer.use_ddp2:\n", - "            loss = loss.unsqueeze(0)\n", - "\n", - "        if done:\n", - "            self.total_reward = self.episode_reward\n", - "            self.episode_reward = 0\n", - "\n", - "        # Sync the target network with the main network (hard update every `sync_rate` steps)\n", - "        if self.global_step % self.hparams.sync_rate == 0:\n",
- "            self.target_net.load_state_dict(self.net.state_dict())\n", - "\n", - "        log = {'total_reward': torch.tensor(self.total_reward).to(device),\n", - "               'reward': torch.tensor(reward).to(device),\n", - "               'train_loss': loss\n", - "               }\n", - "        status = {'steps': torch.tensor(self.global_step).to(device),\n", - "                  'total_reward': torch.tensor(self.total_reward).to(device)\n", - "                  }\n", - "\n", - "        return OrderedDict({'loss': loss, 'log': log, 'progress_bar': status})\n", - "\n", - "    def configure_optimizers(self) -> List[Optimizer]:\n", - "        \"\"\" Initialize Adam optimizer\"\"\"\n", - "        optimizer = optim.Adam(self.net.parameters(), lr=self.hparams.lr)\n", - "        return [optimizer]\n", - "\n", - "    def __dataloader(self) -> DataLoader:\n", - "        \"\"\"Initialize the Replay Buffer dataset used for retrieving experiences\"\"\"\n", - "        dataset = RLDataset(self.buffer, self.hparams.episode_length)\n", - "        dataloader = DataLoader(dataset=dataset, batch_size=self.hparams.batch_size)\n", - "        return dataloader\n", - "\n", - "    def train_dataloader(self) -> DataLoader:\n", - "        \"\"\"Get train loader\"\"\"\n", - "        return self.__dataloader()\n", - "\n", - "    def get_device(self, batch) -> str:\n", - "        \"\"\"Retrieve device currently being used by minibatch\"\"\"\n", - "        return batch[0].device.index if self.on_gpu else 'cpu'" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "JST5AN-8VFLY" - }, - "source": [ - "### Trainer" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bQEvD7gFUSaN" - }, - "outputs": [], - "source": [ - "def main(hparams) -> None:\n", - "    model = DQNLightning(hparams)\n", - "\n", - "    trainer = pl.Trainer(\n", - "        gpus=1,\n", - "        distributed_backend='dp',\n", - "        max_epochs=500,\n", - "        early_stop_callback=False,\n", - "        val_check_interval=100\n", - "    )\n", - "\n", - "    trainer.fit(model)" - ] - },
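- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(Illustrative addition, not part of the original notebook.) The exploration schedule used in `training_step` decays epsilon linearly from `eps_start` to `eps_end` over `eps_last_frame` steps and then stays clamped. A quick standalone check with the defaults used below:" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Illustrative addition: trace the epsilon-greedy schedule.\n", - "eps_start, eps_end, eps_last_frame = 1.0, 0.01, 1000\n", - "for step in [0, 250, 500, 999, 2000]:\n", - "    epsilon = max(eps_end, eps_start - (step + 1) / eps_last_frame)\n", - "    print(step, round(epsilon, 3))" - ] - },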
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 380, - "referenced_widgets": [ - "e9a6bf4eda3244c6bb17216715f36525", - "0922c5b2de554b4fa28dd531603f2709", - "c293fc4171b0438595bc9a49fbb250cf", - "819c83bf0bbd472ba417c31e957718c7", - "c24384195a074989a86217b2edc411cb", - "b3817e0ba30f449585f7641b4d3061bb", - "8591bd2136ab4bb7831579609b43ee9c", - "5a761ed145474ec7a30006bc584b26be" - ] - }, - "id": "-iV9PQC9VOHK", - "outputId": "2fd70097-c913-4d68-e80a-d240532edd19" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import argparse\n", - "\n", - "torch.manual_seed(0)\n", - "np.random.seed(0)\n", - "\n", - "parser = argparse.ArgumentParser()\n", - "parser.add_argument(\"--batch_size\", type=int, default=16, help=\"size of the batches\")\n", - "parser.add_argument(\"--lr\", type=float, default=1e-2, help=\"learning rate\")\n", - "parser.add_argument(\"--env\", type=str, default=\"CartPole-v0\", help=\"gym environment tag\")\n", - "parser.add_argument(\"--gamma\", type=float, default=0.99, help=\"discount factor\")\n", - "parser.add_argument(\"--sync_rate\", type=int, default=10,\n", - "                    help=\"number of frames between updates of the target network\")\n", - "parser.add_argument(\"--replay_size\", type=int, default=1000,\n", - "                    help=\"capacity of the replay buffer\")\n", - "parser.add_argument(\"--warm_start_size\", type=int, default=1000,\n", - "                    help=\"how many samples do we use to fill our buffer at the start of training\")\n", - "parser.add_argument(\"--eps_last_frame\", type=int, default=1000,\n", - "                    help=\"frame at which epsilon stops decaying\")\n", - "parser.add_argument(\"--eps_start\", type=float, default=1.0, help=\"starting value of epsilon\")\n", - "parser.add_argument(\"--eps_end\", type=float, default=0.01, help=\"final value of epsilon\")\n", - "parser.add_argument(\"--episode_length\", type=int, default=200, help=\"max length of an episode\")\n", - "parser.add_argument(\"--max_episode_reward\", type=int, default=200,\n", - "                    help=\"max episode reward in the environment\")\n", - "parser.add_argument(\"--warm_start_steps\", type=int, default=1000,\n", - "                    help=\"number of random steps used to prefill the replay buffer at the start of training\")\n", - "\n", - "args, _ = parser.parse_known_args()\n", - "\n", - "main(args)" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ] - } - ],
- "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Lightning-demo.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - },
"HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_67728556b4c9432b877d54a081657663", - "placeholder": "​", - "style": "IPY_MODEL_0cf9a61c88af45b6a6ef72640f93cbfd", - "value": " 1654784/? [00:01<00:00, 1304326.05it/s]" - } - }, - "0922c5b2de554b4fa28dd531603f2709": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "0a96cc26343e4bb2ac2f5145be2fbacf": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0c3473a16a5e4c46a6c7515e610bca7f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": 
null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0cf9a61c88af45b6a6ef72640f93cbfd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0e149cc766d147aba2c05f8b0f2c69d5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_24b28a7423a541c0b84ba93d70416c1a", - "IPY_MODEL_4820f0005e60493793e506e9f0caf5d4" - ], - "layout": "IPY_MODEL_191f483b5b0346a8a28cac37f29ac2dc" - } - }, - "0f688614251d49589f320f2b2cb55344": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0fad6468e3c849b380e34f674e074219": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d3b107acd1b1401cabe3090724e12e86", - "IPY_MODEL_b3563100dd1b4a4abe14ab7193649064" - ], - "layout": "IPY_MODEL_10a88a05740b45d4a6ea5873d4a7151a" - } - }, - "10a88a05740b45d4a6ea5873d4a7151a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": 
"LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "118c0b8da0df4ff68a90a3d500f1d1b8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "11db4a94a4534fc2b503aad28be631be": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "135883097f0e428c963ae0ad320dfabd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": 
null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "15bb223836764207a5ac15616a41ddb7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_296453e43f7344de8a9b5c6bc970ab1e", - "IPY_MODEL_cd86997da08649d7999ade2d0e7cea96" - ], - "layout": "IPY_MODEL_46f7df7035d44bd099f60ad23f836f8a" - } - }, - "171db7c8fa1e4f11aaff71b9f5879d58": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_85741a7765a147c4a8d69872850cb072", - "placeholder": "​", - "style": "IPY_MODEL_135883097f0e428c963ae0ad320dfabd", - "value": " 32768/? [00:01<00:00, 24569.28it/s]" - } - }, - "17f0e360e85f48d9a17b84c9b7f6c9f0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "191f483b5b0346a8a28cac37f29ac2dc": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "19c7460c565d494abbb8b9731a34294d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - 
"display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1a7680c6279d4985bd69188dd72b11d5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1dabf5740f4d44d68d06629f77b001e3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "1f3364ab59b541268fabcb3f9fb5c64c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": 
null, - "width": null - } - }, - "1f9c48164702427fb3aca2a26b2651e5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "Validation sanity check: ", - "description_tooltip": null, - "layout": "IPY_MODEL_a1e2c38bb40642168cc9d44abf645a54", - "max": 5, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_61c71d4f1c2848b1813aebc0b2db5e25", - "value": 5 - } - }, - "23fd97d95fae4f42bd21906f67115f8b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "240764252e7c4f5ca39db14fd1c724ed": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "24b28a7423a541c0b84ba93d70416c1a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "Epoch 1: 0%", - "description_tooltip": null, - "layout": "IPY_MODEL_f220485e332d4c3cbfc3c45ce3b5fdf1", - "max": 12273, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_fce1fc72006f4e84a6497a493cbbfca2", - "value": 50 - } - }, - "254ddfa7c23d4b6f828d515dbab38978": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "294d8142a4aa48aa8261b0b8155ef97f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - 
"grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "296453e43f7344de8a9b5c6bc970ab1e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "Validating: 96%", - "description_tooltip": null, - "layout": "IPY_MODEL_bef041a9f5a942f68b4a8488a371d3da", - "max": 1875, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_7d15fc81537a449cb6b6afd7ccc65dac", - "value": 1800 - } - }, - "29650c4a829b44ed9e1526b1dc5d2b83": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "Validating: ", - "description_tooltip": null, - "layout": "IPY_MODEL_a99b7813bf88496c875a818afe3b170a", - "max": 1875, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_384a36423d154f2abcddb5094afeced3", - "value": 1875 - } - }, - "29f35103a6e94af09c8ac9cdb2cca89c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2e20c741cf8a401cb90e8e230a23026a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": 
"inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "32e47e93509043439580cd5f58dc7726": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3397549a0695432990f1d3d5390941e7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_ae52e3d810aa4bc5965559ed2ba2b78a", - "IPY_MODEL_08b6d9269e514d228e7e94fe0299a2c5" - ], - "layout": "IPY_MODEL_515ef7d03ef2447e9643210b029b930e" - } - }, - "352d7dae131b407cb6e0238315c1b1a0": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "384a36423d154f2abcddb5094afeced3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "386ff59e3694480394253f1c24ff8e84": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": 
{ - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "39422514a4a04a9ba290285dc586ea9f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3a521cb700f247fd8cd345b158697f2f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_866f880d850a4e689a1c99723f0366db", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_bc4dfb14c9d14499ae72d2a30fc6bc2d", - "value": 1 - } - }, - "3c99401bde8641c19978c11c9abb906a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f778d9ef70ca4f5898c423109cf82ed2", - "placeholder": "​", - "style": 
"IPY_MODEL_fd8ec919352046dd84057e9763bb235a", - "value": " 350/? [00:01<00:00, 348.68it/s]" - } - }, - "3d2e43ae9f924fbd8463ce72b44200f3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_3a521cb700f247fd8cd345b158697f2f", - "IPY_MODEL_8a40fcad728841c7ab2fd15d2c40ee5f" - ], - "layout": "IPY_MODEL_7c03e0a2247442ad9c11569e443de4bb" - } - }, - "40ce71ff339849748486ebbc73474cbe": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "420e8d65e9584973a8004e8398cf430c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "42e787b78000472eab434fb795197a86": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - 
"42ede89dbd194eb6a603ccd7d4b96aae": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "438d19fb8e8243ebbc658f4b1d27df99": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "44ca9ee5c356458680a5d20c6a891c91": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "44e2c572ab1641a29156ad4ee8884e12": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_845415af79634da5a64b9f368280c0e3", - "placeholder": "​", - "style": "IPY_MODEL_32e47e93509043439580cd5f58dc7726", - "value": " 9920512/? 
[00:20<00:00, 998025.61it/s]" - } - }, - "46f7df7035d44bd099f60ad23f836f8a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "47df0bc1b3d14bb7b673e0591daa4e5f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "Epoch 1: 93%", - "description_tooltip": null, - "layout": "IPY_MODEL_7662324b3b924f8f9649dc409fb0d349", - "max": 1875, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e068e2b1c68c48a784c19fc716c043a3", - "value": 1750 - } - }, - "4820f0005e60493793e506e9f0caf5d4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7efa007fdb2d4e06b5f34c4286fe9a2f", - "placeholder": "​", - "style": "IPY_MODEL_bf257b8a04b44a389da2e6f4c64379d4", - "value": " 50/12273 [00:11<48:21, 4.21it/s, loss=1.083, v_num=6]" - } - }, - "498a50387a0742a88356a7ee9920bf7a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - 
"max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4b7021f73f6b4e5193454128ccf323d7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "Epoch 2: 81%", - "description_tooltip": null, - "layout": "IPY_MODEL_d9540ab5d2394b77a65f48b501acdc18", - "max": 1875, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_b5f184fbcba740999b205e34e23455d6", - "value": 1520 - } - }, - "4eaea330bc8e414fbf2f0e2b21af8b08": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "4ffe9fb35ca44358b0177723f73a35d8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_fc3f14c4e83048aa9d6fe9963f95bf7a", - "IPY_MODEL_171db7c8fa1e4f11aaff71b9f5879d58" - ], - "layout": "IPY_MODEL_ba3fe1aba1b04a1fbaab268dfd3d0166" - } - }, - "5012438370764b4db215d545e9414c94": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "515ef7d03ef2447e9643210b029b930e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "51b1111f5fe24042b38af809285e1b16": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "53b2a85381b1460d9f446390c79bfc08": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "5484eef7b6f247d68a89f86965b0940f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_ad849800b2124195b92f3bf9dfc7681b", - "IPY_MODEL_6ae5b2f9195847b5a0aa9991e14aa397" - ], - "layout": "IPY_MODEL_0c3473a16a5e4c46a6c7515e610bca7f" - } - }, - "555443a6fa564d10a3a7901cf15a79a3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": 
null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "56ef38eff92143bcaf68b22c8dae7f98": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "59f02fe7f9f2433bb25f5b292c213f50": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5a761ed145474ec7a30006bc584b26be": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - 
"overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5d3c506d3f4444d8a6b7024cd11de2cd": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5f119b90386c499ea9caf987fecf6c06": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_47df0bc1b3d14bb7b673e0591daa4e5f", - "IPY_MODEL_87e118f890dc42319e723331e1306787" - ], - "layout": "IPY_MODEL_6d391353197443a694f6c75147ca96df" - } - }, - "61c71d4f1c2848b1813aebc0b2db5e25": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "620f45256d504f0188f40c61e23e1355": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e64a723bcf474d6699d78ec05462f995", - "placeholder": "​", - "style": "IPY_MODEL_c3587b40d9f942e98b708ff0b5fc5301", - "value": " 9920512/? 
[00:20<00:00, 2030694.65it/s]" - } - }, - "632b9d0d9ffa4d479deb70f6fafb92ab": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1a7680c6279d4985bd69188dd72b11d5", - "placeholder": "​", - "style": "IPY_MODEL_42e787b78000472eab434fb795197a86", - "value": " 0/28881 [00:00<?, ?it/s]" - } - }, - "67728556b4c9432b877d54a081657663": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "69fa0a853cf84b9482043e12881c849f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_294d8142a4aa48aa8261b0b8155ef97f", - "placeholder": "​", - "style": "IPY_MODEL_51b1111f5fe24042b38af809285e1b16", - "value": " 8192/? 
[00:00<00:00, 33325.29it/s]" - } - }, - "6aa5e292e2094c239e7418994a31ff51": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_1f9c48164702427fb3aca2a26b2651e5", - "IPY_MODEL_d8bd5c9b233b41008109d14cffc89aaa" - ], - "layout": "IPY_MODEL_555443a6fa564d10a3a7901cf15a79a3" - } - }, - "6aafaca3c8824e2fa267f4a68d5d2ca3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_59f02fe7f9f2433bb25f5b292c213f50", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_53b2a85381b1460d9f446390c79bfc08", - "value": 1 - } - }, - "6ae5b2f9195847b5a0aa9991e14aa397": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1f3364ab59b541268fabcb3f9fb5c64c", - "placeholder": "​", - "style": "IPY_MODEL_70e48d7d8e8a411a90642926db4aada8", - "value": " 213k/213k [00:00<00:00, 746kB/s]" - } - }, - "6b3a598e7d01407aa5850b5a6620e7f0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e7a06a13ea11427ea3866cec3a55b644", - "IPY_MODEL_620f45256d504f0188f40c61e23e1355" - ], - "layout": "IPY_MODEL_e23c0d6c117246b2a0a6681008748917" - } - }, - "6d391353197443a694f6c75147ca96df": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - 
"min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "6f55aa11acb14afdb2ac0a1052be1bb6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_420e8d65e9584973a8004e8398cf430c", - "placeholder": "​", - "style": "IPY_MODEL_23fd97d95fae4f42bd21906f67115f8b", - "value": " 1520/1875 [00:05<00:01, 254.43it/s, loss=1.504, v_num=0]" - } - }, - "70e48d7d8e8a411a90642926db4aada8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7633820adf9a4757ae73b472e43031d6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_11db4a94a4534fc2b503aad28be631be", - "placeholder": "​", - "style": "IPY_MODEL_fc262db2a53948488092a77209081319", - "value": " 1400/3750 [00:08<00:13, 171.01it/s, loss=1.188, test_loss=1.17, v_num=2]" - } - }, - "7662324b3b924f8f9649dc409fb0d349": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "76c916d634c644a4a0d8f12e183822fd": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "773fd1b84c364903bc7350630e76a825": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7c03e0a2247442ad9c11569e443de4bb": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7d15fc81537a449cb6b6afd7ccc65dac": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "7e8cf26303ed4975b239fd43184a1dc6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "Validation sanity check: ", - "description_tooltip": null, - "layout": "IPY_MODEL_5d3c506d3f4444d8a6b7024cd11de2cd", - "max": 5, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_fdf5c4a49602423184f6d94cd814177e", - "value": 5 - } - }, - "7ea8ad4e10bb465aa2b6708655a2793f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7ee81979301c447bb13ff9fff5153e0f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "7efa007fdb2d4e06b5f34c4286fe9a2f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": 
null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "819c83bf0bbd472ba417c31e957718c7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5a761ed145474ec7a30006bc584b26be", - "placeholder": "​", - "style": "IPY_MODEL_8591bd2136ab4bb7831579609b43ee9c", - "value": " 13/? [00:00<00:00, 115.19it/s, loss=69.045, steps=6499, total_reward=200, v_num=0]" - } - }, - "845415af79634da5a64b9f368280c0e3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "85741a7765a147c4a8d69872850cb072": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8591bd2136ab4bb7831579609b43ee9c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", 
- "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "86482894cddd4956ae2fc3d9edd8ef9a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "866f880d850a4e689a1c99723f0366db": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "86bedd1fc6da4b8fa0deac637628729e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_fad0b06dc57e4b4599cf43daad7106b8", - "IPY_MODEL_c190999c2761453380f816372fcca608" - ], - "layout": "IPY_MODEL_f444ab7646444b9885cfec41b5a2236e" - } - }, - "8734c1b798ff4ba0bf77dca4f3de9cbf": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - 
"overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "87e118f890dc42319e723331e1306787": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_dbb9fd5429f5416ab6a4f78f0c72867c", - "placeholder": "​", - "style": "IPY_MODEL_afc85a52a5d04653ae9e7168b180ff98", - "value": " 1750/1875 [00:31<00:02, 55.10it/s, d_loss=0.36, g_loss=1.65, loss=0.947, v_num=5]" - } - }, - "88c181cd21a94ec9a43df9754c1986c9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_438d19fb8e8243ebbc658f4b1d27df99", - "placeholder": "​", - "style": "IPY_MODEL_86482894cddd4956ae2fc3d9edd8ef9a", - "value": " 436M/436M [01:31<00:00, 4.77MB/s]" - } - }, - "892246fdf6bb476abb35ec321ddf86e8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Downloading: 100%", - "description_tooltip": null, - "layout": "IPY_MODEL_498a50387a0742a88356a7ee9920bf7a", - "max": 435779157, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e4098b0091124fef8ba342783a82cc6e", - "value": 435779157 - } - }, - "8a40fcad728841c7ab2fd15d2c40ee5f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_56ef38eff92143bcaf68b22c8dae7f98", - "placeholder": "​", - "style": "IPY_MODEL_e7a864f4dc0f485eb045b778e981fc01", - "value": " 1654784/? 
[00:01<00:00, 1615285.92it/s]" - } - }, - "8aab627e715a44ada2af81b74bece257": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8eb2086a01cf41429a5f4adff5f2359b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "Epoch 2: 99%", - "description_tooltip": null, - "layout": "IPY_MODEL_352d7dae131b407cb6e0238315c1b1a0", - "max": 3750, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_d01088cc378044cba4879032d74a852e", - "value": 3700 - } - }, - "91a6de2063cc48b28021ef29feab7f69": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "992f545bb6f1489493d89a59d046f63f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a700c003887b4d2ba134fbfcf4823cb5", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_254ddfa7c23d4b6f828d515dbab38978", - "value": 1 - } - }, - "9a9ebf052d914a8881882da8d2fa9cd8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - 
"display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "9b6445338a69425889a8901c192d5144": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a1e2c38bb40642168cc9d44abf645a54": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a252ac32033b4e39b87a6c91bd21b5ae": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": 
"HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_f1db9d62eee44e61bb8ac26c16b3b601", - "IPY_MODEL_69fa0a853cf84b9482043e12881c849f" - ], - "layout": "IPY_MODEL_40ce71ff339849748486ebbc73474cbe" - } - }, - "a2990a67f92c4047b95876aae91e3de0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7e8cf26303ed4975b239fd43184a1dc6", - "IPY_MODEL_d7406a8b15f9439fba19ec4dab086c61" - ], - "layout": "IPY_MODEL_44ca9ee5c356458680a5d20c6a891c91" - } - }, - "a38242d3231442e2a259067d6a1355c2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_a56deb884719491090a4146e72be3868", - "IPY_MODEL_7633820adf9a4757ae73b472e43031d6" - ], - "layout": "IPY_MODEL_9a9ebf052d914a8881882da8d2fa9cd8" - } - }, - "a4b49709f7464ce491324e8aa636c152": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "a56deb884719491090a4146e72be3868": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "Epoch 1: 37%", - "description_tooltip": null, - "layout": "IPY_MODEL_8aab627e715a44ada2af81b74bece257", - "max": 3750, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_b7a073dfdeaf48fc9f3e6352b0ea2ba7", - "value": 1400 - } - }, - "a5cc9e60aff641dca27f1adf6807e5b3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "a700c003887b4d2ba134fbfcf4823cb5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a7a94847786244dd9a5cb9718957143a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_ee0a78c01b63443f9e51470a1b1e79a4", - "IPY_MODEL_632b9d0d9ffa4d479deb70f6fafb92ab" - ], - "layout": "IPY_MODEL_8734c1b798ff4ba0bf77dca4f3de9cbf" - } - }, - "a7bcd18049d8493b9d3d9f17d86f0429": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Testing: ", - "description_tooltip": null, - "layout": "IPY_MODEL_f261b8aab86b4d6e94984bf658c1b74d", - "max": 313, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_cd84335fb7234f3aa54dafe045614e56", - "value": 313 - } - }, - "a9413692ae5040e6ae3c2a446dbe297c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "a99b7813bf88496c875a818afe3b170a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "abeb0a773f3542c39ff724ae0674b74e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ad849800b2124195b92f3bf9dfc7681b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Downloading: 100%", - "description_tooltip": null, - "layout": "IPY_MODEL_386ff59e3694480394253f1c24ff8e84", - "max": 213450, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_240764252e7c4f5ca39db14fd1c724ed", - "value": 213450 - } - }, - "ae52e3d810aa4bc5965559ed2ba2b78a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ea162090fc954f0198a1d63507dfff9b", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_7ee81979301c447bb13ff9fff5153e0f", - "value": 1 - } - }, - "afc85a52a5d04653ae9e7168b180ff98": 
{ - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b3563100dd1b4a4abe14ab7193649064": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f23f02d00d424574afa29311b8d0906e", - "placeholder": "​", - "style": "IPY_MODEL_e6e15d5c14134be0b4cf86fdecfef687", - "value": " 361/361 [00:16<00:00, 22.2B/s]" - } - }, - "b3817e0ba30f449585f7641b4d3061bb": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b5f184fbcba740999b205e34e23455d6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "b73a326ada4d4a859e3c2c39abf5530d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_29650c4a829b44ed9e1526b1dc5d2b83", - "IPY_MODEL_df6521155d05459882601ba8c84f3dce" - ], - "layout": "IPY_MODEL_cd942318db094680821f0d9902941977" - } - }, - "b7a073dfdeaf48fc9f3e6352b0ea2ba7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "ba3fe1aba1b04a1fbaab268dfd3d0166": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bb305bb378774c1586a3196eb3babd29": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_992f545bb6f1489493d89a59d046f63f", - "IPY_MODEL_44e2c572ab1641a29156ad4ee8884e12" - ], - "layout": "IPY_MODEL_ff6eead2826e4113abf7ab3a8cb31b0f" - } - }, - "bbef89e4fd9d4cf8ae4c8fcab9bc665d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_39422514a4a04a9ba290285dc586ea9f", - "placeholder": "​", - "style": "IPY_MODEL_91a6de2063cc48b28021ef29feab7f69", - "value": " 3700/3750 [00:11<00:00, 310.13it/s, loss=1.160, v_num=0]" - } - }, - "bc4dfb14c9d14499ae72d2a30fc6bc2d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "bcf69c2a0b694e0498beadb6f4509395": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - 
"box_style": "", - "children": [ - "IPY_MODEL_a7bcd18049d8493b9d3d9f17d86f0429", - "IPY_MODEL_3c99401bde8641c19978c11c9abb906a" - ], - "layout": "IPY_MODEL_2e20c741cf8a401cb90e8e230a23026a" - } - }, - "bef041a9f5a942f68b4a8488a371d3da": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bf257b8a04b44a389da2e6f4c64379d4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c190999c2761453380f816372fcca608": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_773fd1b84c364903bc7350630e76a825", - "placeholder": "​", - "style": "IPY_MODEL_cce9ed8de0a048679453e53b71523eea", - "value": " 50/? 
[00:00<00:00, 286.31it/s]" - } - }, - "c24384195a074989a86217b2edc411cb": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "c293fc4171b0438595bc9a49fbb250cf": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Epoch 500: ", - "description_tooltip": null, - "layout": "IPY_MODEL_b3817e0ba30f449585f7641b4d3061bb", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_c24384195a074989a86217b2edc411cb", - "value": 1 - } - }, - "c3587b40d9f942e98b708ff0b5fc5301": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c4200c1f957a4179af51245a797c8921": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0f688614251d49589f320f2b2cb55344", - "placeholder": "​", - "style": "IPY_MODEL_1dabf5740f4d44d68d06629f77b001e3", - "value": " 8192/? 
[00:00<00:00, 18498.86it/s]" - } - }, - "c93f037dc6044d858ae1862d5b29f6f0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_4b7021f73f6b4e5193454128ccf323d7", - "IPY_MODEL_6f55aa11acb14afdb2ac0a1052be1bb6" - ], - "layout": "IPY_MODEL_00ae53beaa9341f4826b1bdc0a6f88e0" - } - }, - "ca4cd1659d73446e964f9ab36d92e3a0": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "cce9ed8de0a048679453e53b71523eea": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "cd84335fb7234f3aa54dafe045614e56": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "cd86997da08649d7999ade2d0e7cea96": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_19c7460c565d494abbb8b9731a34294d", - "placeholder": "​", - "style": "IPY_MODEL_e10c94b1fdf84a9186ab7d87fd83f87f", - "value": " 1800/1875 [00:17<00:00, 356.02it/s]" - } - }, - "cd942318db094680821f0d9902941977": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - 
"_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "d01088cc378044cba4879032d74a852e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "d0590d65433c4478af6a0762421f9f7a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d3b107acd1b1401cabe3090724e12e86": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Downloading: 100%", - "description_tooltip": null, - "layout": "IPY_MODEL_29f35103a6e94af09c8ac9cdb2cca89c", - "max": 361, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_17f0e360e85f48d9a17b84c9b7f6c9f0", - "value": 361 - } - }, - "d7406a8b15f9439fba19ec4dab086c61": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ea13174e5b894e93b3c59d7e599de5a9", - "placeholder": "​", - "style": "IPY_MODEL_42ede89dbd194eb6a603ccd7d4b96aae", - "value": " 50/? 
[00:00<00:00, 254.04it/s]" - } - }, - "d8bd5c9b233b41008109d14cffc89aaa": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_76c916d634c644a4a0d8f12e183822fd", - "placeholder": "​", - "style": "IPY_MODEL_d0590d65433c4478af6a0762421f9f7a", - "value": " 50/? [00:00<00:00, 576.16it/s]" - } - }, - "d8db4d3709f34c869dedbc066e60501e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "d9540ab5d2394b77a65f48b501acdc18": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "dbb9fd5429f5416ab6a4f78f0c72867c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - 
"de325f4002a945b4a2a15086c2a77816": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6aafaca3c8824e2fa267f4a68d5d2ca3", - "IPY_MODEL_c4200c1f957a4179af51245a797c8921" - ], - "layout": "IPY_MODEL_5012438370764b4db215d545e9414c94" - } - }, - "df6521155d05459882601ba8c84f3dce": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7ea8ad4e10bb465aa2b6708655a2793f", - "placeholder": "​", - "style": "IPY_MODEL_f4a052d2223a4d4fa95ed52f94ad465d", - "value": " 1900/? [00:05<00:00, 344.14it/s]" - } - }, - "e068e2b1c68c48a784c19fc716c043a3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "e10c94b1fdf84a9186ab7d87fd83f87f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e23c0d6c117246b2a0a6681008748917": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e4098b0091124fef8ba342783a82cc6e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": 
"1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "e64a723bcf474d6699d78ec05462f995": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e6e15d5c14134be0b4cf86fdecfef687": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e7a06a13ea11427ea3866cec3a55b644": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ee3d759a4e4442288599eacfd1347c8d", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_05f01b8813374534a4c58ac65fe2b390", - "value": 1 - } - }, - "e7a864f4dc0f485eb045b778e981fc01": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e918a6de59b64bd590e4f1233bbc078a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_892246fdf6bb476abb35ec321ddf86e8", - "IPY_MODEL_88c181cd21a94ec9a43df9754c1986c9" - ], - "layout": 
"IPY_MODEL_abeb0a773f3542c39ff724ae0674b74e" - } - }, - "e9a6bf4eda3244c6bb17216715f36525": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c293fc4171b0438595bc9a49fbb250cf", - "IPY_MODEL_819c83bf0bbd472ba417c31e957718c7" - ], - "layout": "IPY_MODEL_0922c5b2de554b4fa28dd531603f2709" - } - }, - "ea13174e5b894e93b3c59d7e599de5a9": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ea162090fc954f0198a1d63507dfff9b": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ee0a78c01b63443f9e51470a1b1e79a4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": " 0%", - 
"description_tooltip": null, - "layout": "IPY_MODEL_ca4cd1659d73446e964f9ab36d92e3a0", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a9413692ae5040e6ae3c2a446dbe297c", - "value": 0 - } - }, - "ee3d759a4e4442288599eacfd1347c8d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ee908316d227495381e8cf7dcf5526f1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_8eb2086a01cf41429a5f4adff5f2359b", - "IPY_MODEL_bbef89e4fd9d4cf8ae4c8fcab9bc665d" - ], - "layout": "IPY_MODEL_a4b49709f7464ce491324e8aa636c152" - } - }, - "f1db9d62eee44e61bb8ac26c16b3b601": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9b6445338a69425889a8901c192d5144", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_d8db4d3709f34c869dedbc066e60501e", - "value": 1 - } - }, - "f220485e332d4c3cbfc3c45ce3b5fdf1": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, 
- "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f23f02d00d424574afa29311b8d0906e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f261b8aab86b4d6e94984bf658c1b74d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": "2", - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f444ab7646444b9885cfec41b5a2236e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": "inline-flex", - "flex": null, - "flex_flow": "row wrap", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - 
"grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "100%" - } - }, - "f4a052d2223a4d4fa95ed52f94ad465d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f778d9ef70ca4f5898c423109cf82ed2": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fad0b06dc57e4b4599cf43daad7106b8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "Validation sanity check: ", - "description_tooltip": null, - "layout": "IPY_MODEL_0a96cc26343e4bb2ac2f5145be2fbacf", - "max": 5, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a5cc9e60aff641dca27f1adf6807e5b3", - "value": 5 - } - }, - "fc262db2a53948488092a77209081319": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fc3f14c4e83048aa9d6fe9963f95bf7a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_118c0b8da0df4ff68a90a3d500f1d1b8", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4eaea330bc8e414fbf2f0e2b21af8b08", - "value": 1 - } - }, - "fce1fc72006f4e84a6497a493cbbfca2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "fd8ec919352046dd84057e9763bb235a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fdf5c4a49602423184f6d94cd814177e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "ff6eead2826e4113abf7ab3a8cb31b0f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/notebooks/README.md b/notebooks/README.md deleted file mode 100644 index a72e154c36410..0000000000000 --- a/notebooks/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Lightning Notebooks ⚡ - -## Official Notebooks - -You can easily run any of the official notebooks by clicking the 'Open in Colab' links in the table below :smile: - -| Notebook | Description | Colab Link | -| :----------------------- | :----------------------------------------------------------------------------------- | 
:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| **MNIST Hello World** | Train your first Lightning Module on the classic MNIST Handwritten Digits Dataset. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/01-mnist-hello-world.ipynb) | -| **Datamodules** | Learn about DataModules and train a dataset-agnostic model on MNIST and CIFAR10. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/02-datamodules.ipynb) | -| **GAN** | Train a GAN on the MNIST Dataset. Learn how to use multiple optimizers in Lightning. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/03-basic-gan.ipynb) | -| **BERT** | Fine-tune HuggingFace Transformers models on the GLUE Benchmark | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb) | -| **Trainer Flags** | Overview of the available Lightning `Trainer` flags | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/05-trainer-flags-overview.ipynb) | -| **TPU Training** | Train a model on MNIST using TPUs with Lightning | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/06-mnist-tpu-training.ipynb) | -| **94% Baseline CIFAR10** | Establish a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/07-cifar10-baseline.ipynb) | diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py index 8ea03dabc9bdb..94e4fbfcf7ae2 100644 --- a/pl_examples/basic_examples/autoencoder.py +++ b/pl_examples/basic_examples/autoencoder.py @@ -87,6 +87,12 @@ def test_step(self, batch, batch_idx): loss = F.mse_loss(x_hat, x) self.log('test_loss', loss, on_step=True) + def predict_step(self, batch, batch_idx, dataloader_idx=None): + x, y = batch + x = x.view(x.size(0), -1) + z = self.encoder(x) + return self.decoder(z) + def configure_optimizers(self): optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) return optimizer @@ -113,10 +119,15 @@ def val_dataloader(self): def test_dataloader(self): return DataLoader(self.mnist_test, batch_size=self.batch_size) + def predict_dataloader(self): + return DataLoader(self.mnist_test, batch_size=self.batch_size) + def cli_main(): - cli = LightningCLI(LitAutoEncoder, MyDataModule, seed_everything_default=1234) + cli = LightningCLI(LitAutoEncoder, MyDataModule, seed_everything_default=1234, save_config_overwrite=True) cli.trainer.test(cli.model, datamodule=cli.datamodule) + predictions = cli.trainer.predict(cli.model, datamodule=cli.datamodule) + print(predictions[0]) if 
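Both example diffs in this patch add the same prediction path: a `predict_step` hook, a `predict_dataloader`, and a `trainer.predict(...)` call that returns one entry per batch. A minimal, self-contained sketch of how these hooks interact, using a hypothetical `TinyPredictModel` that is not part of this patch:

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class TinyPredictModel(pl.LightningModule):
    # Hypothetical module, for illustration only.

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(4, 2)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        # Called once per batch by `trainer.predict`; the return values
        # are collected into the list that `predict` hands back.
        x, _ = batch
        return self.layer(x)

    def predict_dataloader(self):
        # Used when no dataloader or datamodule is passed to `trainer.predict`.
        dataset = TensorDataset(torch.randn(8, 4), torch.zeros(8, dtype=torch.long))
        return DataLoader(dataset, batch_size=4)


trainer = pl.Trainer()
predictions = trainer.predict(TinyPredictModel())
print(predictions[0].shape)  # one tensor per batch: torch.Size([4, 2])

Since `predict` collects the per-batch returns into a list, the `predictions[0]` printed by the examples in this patch is the output for the first batch only.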
__name__ == '__main__': diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py index 57cf97be00023..381cda088ea9d 100644 --- a/pl_examples/basic_examples/backbone_image_classifier.py +++ b/pl_examples/basic_examples/backbone_image_classifier.py @@ -100,6 +100,10 @@ def test_step(self, batch, batch_idx): loss = F.cross_entropy(y_hat, y) self.log('test_loss', loss) + def predict_step(self, batch, batch_idx, dataloader_idx=None): + x, y = batch + return self.backbone(x) + def configure_optimizers(self): # self.hparams available because we called self.save_hyperparameters() return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) @@ -126,10 +130,15 @@ def val_dataloader(self): def test_dataloader(self): return DataLoader(self.mnist_test, batch_size=self.batch_size) + def predict_dataloader(self): + return DataLoader(self.mnist_test, batch_size=self.batch_size) + def cli_main(): - cli = LightningCLI(LitClassifier, MyDataModule, seed_everything_default=1234) + cli = LightningCLI(LitClassifier, MyDataModule, seed_everything_default=1234, save_config_overwrite=True) cli.trainer.test(cli.model, datamodule=cli.datamodule) + predictions = cli.trainer.predict(cli.model, datamodule=cli.datamodule) + print(predictions[0]) if __name__ == '__main__': diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py deleted file mode 100644 index 9747c4a939340..0000000000000 --- a/pl_examples/basic_examples/conv_sequential_example.py +++ /dev/null @@ -1,226 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -Example script of running the experimental DDP Sequential Plugin. -This script splits a convolutional model onto multiple GPUs, whilst using the internal built in balancer -to balance across your GPUs. - -To run: -python conv_model_sequential_example.py --accelerator ddp --gpus 4 --max_epochs 1 --batch_size 256 --use_rpc_sequential -""" -import math -from argparse import ArgumentParser - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchvision -from torchmetrics.functional import accuracy - -import pytorch_lightning as pl -from pl_examples import cli_lightning_logo -from pytorch_lightning import Trainer -from pytorch_lightning.plugins import RPCSequentialPlugin -from pytorch_lightning.utilities import _BOLTS_AVAILABLE, _FAIRSCALE_PIPE_AVAILABLE - -if _BOLTS_AVAILABLE: - import pl_bolts - from pl_bolts.transforms.dataset_normalizations import cifar10_normalization - -##################### -# Modules # -##################### - - -class Flatten(nn.Module): - - def forward(self, x): - return x.view(x.size(0), -1) - - -############################### -# LightningModule # -############################### - - -class LitResnet(pl.LightningModule): - """ - >>> LitResnet() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - LitResnet( - (sequential_module): Sequential(...) 
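For reference, the `predict_step` / `predict_dataloader` pattern added to both example scripts above boils down to: the hook returns whatever the model should emit for one batch, and `Trainer.predict` collects the per-batch returns into a list. A minimal sketch of that contract (the `TinyPredictor` module and the random data are made up for illustration; only `predict_step` and `Trainer.predict` come from the patch):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class TinyPredictor(pl.LightningModule):

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        # same shape as the examples above: unpack the batch, return the raw output
        x, _ = batch
        return self.layer(x)


if __name__ == '__main__':
    dataset = TensorDataset(torch.randn(64, 32), torch.zeros(64))
    loader = DataLoader(dataset, batch_size=16)
    predictions = pl.Trainer(weights_summary=None).predict(TinyPredictor(), dataloaders=loader)
    print(len(predictions), predictions[0].shape)  # 4 batches, each a (16, 2) tensor
```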
- ) - """ - - def __init__(self, lr=0.05, batch_size=32, manual_optimization=False): - super().__init__() - - self.save_hyperparameters() - self.sequential_module = nn.Sequential( - # Conv Layer block 1 - nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1), - nn.BatchNorm2d(32), - nn.ReLU(inplace=False), - nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1), - nn.ReLU(inplace=False), - nn.MaxPool2d(kernel_size=2, stride=2), - - # Conv Layer block 2 - nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1), - nn.BatchNorm2d(128), - nn.ReLU(inplace=False), - nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1), - nn.ReLU(inplace=False), - nn.MaxPool2d(kernel_size=2, stride=2), - nn.Dropout2d(p=0.05), - - # Conv Layer block 3 - nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1), - nn.BatchNorm2d(256), - nn.ReLU(inplace=False), - nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1), - nn.ReLU(inplace=False), - nn.MaxPool2d(kernel_size=2, stride=2), - Flatten(), - nn.Dropout(p=0.1), - nn.Linear(4096, 1024), - nn.ReLU(inplace=False), - nn.Linear(1024, 512), - nn.ReLU(inplace=False), - nn.Dropout(p=0.1), - nn.Linear(512, 10) - ) - self._example_input_array = torch.randn((1, 3, 32, 32)) - - if manual_optimization: - self.automatic_optimization = False - self.training_step = self.training_step_manual - - def forward(self, x): - out = self.sequential_module(x) - return F.log_softmax(out, dim=-1) - - def training_step_manual(self, batch, batch_idx): - opt = self.optimizers() - - def closure(): - x, y = batch - logits = self.forward(x) - loss = F.nll_loss(logits, y) - self.manual_backward(loss, opt) - self.log('train_loss', loss, prog_bar=True) - - opt.step(closure=closure) - - def training_step(self, batch, batch_idx): - x, y = batch - logits = self.forward(x) - loss = F.nll_loss(logits, y) - self.log('Training Loss', loss) - return loss - - def _evaluate(self, batch, batch_idx, stage=None): - x, y = batch - out = self.forward(x) - logits = F.log_softmax(out, dim=-1) - loss = F.nll_loss(logits, y) - preds = torch.argmax(logits, dim=-1) - acc = accuracy(preds, y) - - if stage: - self.log(f'{stage}_loss', loss, prog_bar=True) - self.log(f'{stage}_acc', acc, prog_bar=True) - - return loss, acc - - def validation_step(self, batch, batch_idx): - return self._evaluate(batch, batch_idx, 'val')[0] - - def test_step(self, batch, batch_idx): - loss, acc = self._evaluate(batch, batch_idx, 'test') - self.log_dict({'test_loss': loss, 'test_acc': acc}) - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.lr, momentum=0.9, weight_decay=5e-4) - return { - 'optimizer': optimizer, - 'lr_scheduler': { - 'scheduler': torch.optim.lr_scheduler.OneCycleLR( - optimizer, - 0.1, - epochs=self.trainer.max_epochs, - steps_per_epoch=math.ceil(45000 / self.hparams.batch_size) - ), - 'interval': 'step', - } - } - - -################################# -# Instantiate Data Module # -################################# - - -def instantiate_datamodule(args): - train_transforms = torchvision.transforms.Compose([ - torchvision.transforms.RandomCrop(32, padding=4), - torchvision.transforms.RandomHorizontalFlip(), - torchvision.transforms.ToTensor(), - cifar10_normalization(), - ]) - - test_transforms = torchvision.transforms.Compose([ - torchvision.transforms.ToTensor(), - cifar10_normalization(), - ]) - - cifar10_dm = pl_bolts.datamodules.CIFAR10DataModule( - data_dir=args.data_dir, - 
batch_size=args.batch_size, - train_transforms=train_transforms, - test_transforms=test_transforms, - val_transforms=test_transforms, - ) - - return cifar10_dm - - -if __name__ == "__main__": - cli_lightning_logo() - - assert _BOLTS_AVAILABLE, "Bolts is required for this example, install it via `pip install lightning-bolts`" - assert _FAIRSCALE_PIPE_AVAILABLE, "FairScale and PyTorch 1.6 is required for this example." - - parser = ArgumentParser(description="Pipe Example") - parser.add_argument("--use_rpc_sequential", action="store_true") - parser.add_argument("--manual_optimization", action="store_true") - parser = Trainer.add_argparse_args(parser) - parser = pl_bolts.datamodules.CIFAR10DataModule.add_argparse_args(parser) - args = parser.parse_args() - - cifar10_dm = instantiate_datamodule(args) - - plugins = None - if args.use_rpc_sequential: - plugins = RPCSequentialPlugin() - - model = LitResnet(batch_size=args.batch_size, manual_optimization=args.manual_optimization) - - trainer = pl.Trainer.from_argparse_args(args, plugins=[plugins] if plugins else None) - trainer.fit(model, cifar10_dm) - trainer.test(model, datamodule=cifar10_dm) - - if trainer.accelerator.rpc_enabled: - # Called at the end of trainer to ensure all processes are killed - trainer.training_type_plugin.exit_rpc_process() diff --git a/pl_examples/basic_examples/profiler_example.py b/pl_examples/basic_examples/profiler_example.py index c79214af93581..688eb15ef923f 100644 --- a/pl_examples/basic_examples/profiler_example.py +++ b/pl_examples/basic_examples/profiler_example.py @@ -62,6 +62,10 @@ def validation_step(self, batch, batch_idx): loss = self.criterion(outputs, labels) self.log("val_loss", loss) + def predict_step(self, batch, batch_idx, dataloader_idx: int = None): + inputs = batch[0] + return self.model(inputs) + def configure_optimizers(self): return torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9) diff --git a/pl_examples/basic_examples/simple_image_classifier.py b/pl_examples/basic_examples/simple_image_classifier.py index ffb6434352b2e..70aaa35931f8e 100644 --- a/pl_examples/basic_examples/simple_image_classifier.py +++ b/pl_examples/basic_examples/simple_image_classifier.py @@ -76,7 +76,7 @@ def configure_optimizers(self): def cli_main(): - cli = LightningCLI(LitClassifier, MNISTDataModule, seed_everything_default=1234) + cli = LightningCLI(LitClassifier, MNISTDataModule, seed_everything_default=1234, save_config_overwrite=True) cli.trainer.test(cli.model, datamodule=cli.datamodule) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index abb65ba86fd93..f906ab9bde77c 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -59,8 +59,8 @@ def run(): max_epochs=1, weights_summary=None, ) - trainer.fit(model, train_dataloader=train_data, val_dataloaders=val_data) - trainer.test(model, test_dataloaders=test_data) + trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data) + trainer.test(model, dataloaders=test_data) if __name__ == '__main__': diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py index 70726a748818c..114097df483af 100644 --- a/pl_examples/domain_templates/reinforce_learn_Qnet.py +++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py @@ -34,7 +34,7 @@ import argparse from collections import deque, namedtuple, OrderedDict -from typing import List, Tuple +from typing import Iterator, List, Tuple import gym import numpy as np @@ -139,7 +139,7 @@ 
def __init__(self, buffer: ReplayBuffer, sample_size: int = 200) -> None: self.buffer = buffer self.sample_size = sample_size - def __iter__(self) -> Tuple: + def __iter__(self) -> Iterator: states, actions, rewards, dones, new_states = self.buffer.sample(self.sample_size) for i in range(len(dones)): yield states[i], actions[i], rewards[i], dones[i], new_states[i] diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index f3453a5eb86f0..5bca67f41a7b3 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -28,7 +28,7 @@ [3] https://github.com/sid-sundrani/ppo_lightning """ import argparse -from typing import Callable, Iterable, List, Tuple +from typing import Callable, Iterator, List, Tuple import gym import torch @@ -144,7 +144,7 @@ class ExperienceSourceDataset(IterableDataset): def __init__(self, generate_batch: Callable): self.generate_batch = generate_batch - def __iter__(self) -> Iterable: + def __iter__(self) -> Iterator: iterator = self.generate_batch() return iterator @@ -413,7 +413,7 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx, opt return loss_actor - elif optimizer_idx == 1: + if optimizer_idx == 1: loss_critic = self.critic_loss(state, action, old_logp, qval, adv) self.log('loss_critic', loss_critic, on_step=False, on_epoch=True, prog_bar=False, logger=True) diff --git a/pl_examples/ipu_examples/__init__.py b/pl_examples/ipu_examples/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py new file mode 100644 index 0000000000000..37cb63c076e2e --- /dev/null +++ b/pl_examples/ipu_examples/mnist.py @@ -0,0 +1,89 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch.nn import functional as F + +import pytorch_lightning as pl +from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule + + +class LitClassifier(pl.LightningModule): + + def __init__( + self, + hidden_dim: int = 128, + learning_rate: float = 0.0001, + ): + super().__init__() + self.save_hyperparameters() + + self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim) + self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10) + + def forward(self, x): + x = x.view(x.size(0), -1) + x = torch.relu(self.l1(x)) + x = torch.relu(self.l2(x)) + return x + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + probs = self(x) + # we currently return the accuracy as the validation_step/test_step is run on the IPU devices. + # Outputs from the step functions are sent to the host device, where we calculate the metrics in + # validation_epoch_end and test_epoch_end for the test_step. 
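+        # e.g. if three steps return accuracies [0.50, 0.75, 1.00], `validation_epoch_end`
+        # below stacks them on the host and logs their mean (0.75).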
+        acc = self.accuracy(probs, y)
+        return acc
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        acc = self.accuracy(logits, y)
+        return acc
+
+    def accuracy(self, logits, y):
+        # currently IPU poptorch doesn't implicitly convert bools to tensors
+        # hence we use an explicit calculation for accuracy here. Once fixed in poptorch
+        # we can use the accuracy metric.
+        acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y)
+        return acc
+
+    def validation_epoch_end(self, outputs) -> None:
+        # since the training step/validation step and test step are run on the IPU device
+        # we must log the average accuracy outside the step functions.
+        self.log('val_acc', torch.stack(outputs).mean(), prog_bar=True)
+
+    def test_epoch_end(self, outputs) -> None:
+        self.log('test_acc', torch.stack(outputs).mean())
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
+
+
+if __name__ == '__main__':
+    dm = MNISTDataModule(batch_size=32)
+
+    model = LitClassifier()
+
+    trainer = pl.Trainer(max_epochs=2, ipus=8)
+
+    trainer.fit(model, datamodule=dm)
+    trainer.test(model, datamodule=dm)
diff --git a/pyproject.toml b/pyproject.toml
index e8a3213f2b738..1f21e1f088acb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,3 +16,28 @@ profile = "black"
 line_length = 120
 force_sort_within_sections = "False"
 order_by_type = "False"
+
+[tool.vulture]
+exclude = ['pytorch_lightning/metrics']
+make_whitelist = true
+min_confidence = 95
+paths = ["pytorch_lightning"]
+ignore_names = [
+    "*_nb",
+    "*batch",
+    "*idx",
+    "*param*",
+    "cmd_line",
+    "kw",
+    "loc",
+    "mocked_device_count*",
+    "my_path",
+    "new_device",
+    "new_dtype",
+    "prediction",
+    "root",
+    "signum",
+    "torch_save",
+    "using_lbfgs",
+]
+sort_by_size = true
diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py
index 05e15fe1f1767..2a460a27e373a 100644
--- a/pytorch_lightning/accelerators/__init__.py
+++ b/pytorch_lightning/accelerators/__init__.py
@@ -13,4 +13,5 @@
 from pytorch_lightning.accelerators.accelerator import Accelerator  # noqa F401
 from pytorch_lightning.accelerators.cpu import CPUAccelerator  # noqa F401
 from pytorch_lightning.accelerators.gpu import GPUAccelerator  # noqa F401
+from pytorch_lightning.accelerators.ipu import IPUAccelerator  # noqa F401
 from pytorch_lightning.accelerators.tpu import TPUAccelerator  # noqa F401
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 4ea017ae0c208..abfb29c149bff 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -179,10 +179,6 @@ def batch_to_device(
 
         return move_data_to_device(batch, device)
 
-    def on_train_start(self) -> None:
-        """Hook to do something upon the training start"""
-        pass
-
     def training_step(
         self,
         step_kwargs: Dict[str, Union[Any, int]],
@@ -348,14 +344,6 @@ def clip_gradients(
             model=self.model,
         )
 
-    def on_train_epoch_end(self) -> None:
-        """Hook to do something on the end of an training epoch."""
-        pass
-
-    def on_train_end(self) -> None:
-        """Hook to do something at the end of the training"""
-        pass
-
     def setup_optimizers(self, trainer: 'pl.Trainer') -> None:
        """
        Creates optimizers and schedulers
@@ -394,7 +382,7 @@ def to_device(self, step_kwargs: Dict[str, Union[Any, int]]) -> Dict[str, Union[
     def amp_backend(self) -> Optional[LightningEnum]:
         if isinstance(self.precision_plugin, ApexMixedPrecisionPlugin):
             return 
AMPType.APEX - elif isinstance(self.precision_plugin, NativeMixedPrecisionPlugin): + if isinstance(self.precision_plugin, NativeMixedPrecisionPlugin): return AMPType.NATIVE return None @@ -406,10 +394,6 @@ def precision(self) -> Union[str, int]: def scaler(self) -> Optional['GradScaler']: return getattr(self.precision_plugin, 'scaler', None) - @property - def rpc_enabled(self) -> bool: - return self.training_type_plugin.rpc_enabled - def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]: """ Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom @@ -460,6 +444,22 @@ def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[I """ return self.training_type_plugin.process_dataloader(dataloader) + def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + """Called before resetting the train dataloader.""" + return self.training_type_plugin.on_reset_train_dataloader(dataloader) + + def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + """Called before resetting the val dataloader.""" + return self.training_type_plugin.on_reset_val_dataloader(dataloader) + + def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + """Called before resetting the test dataloader.""" + return self.training_type_plugin.on_reset_test_dataloader(dataloader) + + def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + """Called before resetting the predict dataloader.""" + return self.training_type_plugin.on_reset_predict_dataloader(dataloader) + @property def results(self) -> Any: """ @@ -547,3 +547,45 @@ def setup_optimizers_in_pre_dispatch(self) -> bool: def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: return self.training_type_plugin.update_global_step(total_batch_idx, current_global_step) + + def on_train_epoch_end(self) -> None: + """Hook to do something on the end of an training epoch.""" + pass + + def on_train_start(self) -> None: + """Called when train begins.""" + return self.training_type_plugin.on_train_start() + + def on_validation_start(self) -> None: + """Called when validation begins.""" + return self.training_type_plugin.on_validation_start() + + def on_test_start(self) -> None: + """Called when test begins.""" + return self.training_type_plugin.on_test_start() + + def on_predict_start(self) -> None: + """Called when predict begins.""" + return self.training_type_plugin.on_predict_start() + + def on_validation_end(self) -> None: + """Called when validation ends.""" + return self.training_type_plugin.on_validation_end() + + def on_test_end(self) -> None: + """Called when test end.""" + return self.training_type_plugin.on_test_end() + + def on_predict_end(self) -> None: + """Called when predict ends.""" + return self.training_type_plugin.on_predict_end() + + def on_train_end(self) -> None: + """Called when train ends.""" + return self.training_type_plugin.on_train_end() + + def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + """ + Called in the training loop before anything happens for that batch. 
+        """
+        return self.training_type_plugin.on_train_batch_start(batch, batch_idx, dataloader_idx)
diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 7543a2b794b5d..1c5ff56d805a6 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -42,10 +42,7 @@ def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None:
 
     def on_train_start(self) -> None:
         # clear cache before training
-        # use context because of:
-        # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898
-        with torch.cuda.device(self.root_device):
-            torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
 
     @staticmethod
     def set_nvidia_flags(local_rank: int) -> None:
diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py
new file mode 100644
index 0000000000000..c9bee827af0e6
--- /dev/null
+++ b/pytorch_lightning/accelerators/ipu.py
@@ -0,0 +1,35 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Callable
+from typing import Any
+
+from torch.optim import Optimizer
+
+import pytorch_lightning as pl
+from pytorch_lightning.accelerators.accelerator import Accelerator
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+
+
+class IPUAccelerator(Accelerator):
+    """ Accelerator for IPUs. """
+
+    def setup_optimizers(self, trainer: 'pl.Trainer') -> None:
+        super().setup_optimizers(trainer)
+
+        if len(self.optimizers) > 1:
+            raise MisconfigurationException("IPUs currently only support one optimizer.")
+
+    def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None:
+        # Optimizer step is handled by the IPU accelerator.
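+        # only the closure (the forward and backward pass) is evaluated on the host;
+        # `optimizer.step()` is intentionally never called here, per the comment above.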
+ lambda_closure() diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index f0c1a3a95819e..6f9ea07c0716d 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -26,7 +26,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities import rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException log = logging.getLogger(__name__) @@ -88,7 +88,7 @@ class EarlyStopping(Callback): def __init__( self, - monitor: str = 'early_stop_on', + monitor: Optional[str] = None, min_delta: float = 0.0, patience: int = 3, verbose: bool = False, @@ -100,7 +100,6 @@ def __init__( check_on_train_epoch_end: bool = True, ): super().__init__() - self.monitor = monitor self.min_delta = min_delta self.patience = patience self.verbose = verbose @@ -120,6 +119,13 @@ def __init__( torch_inf = torch.tensor(np.Inf) self.best_score = torch_inf if self.monitor_op == torch.lt else -torch_inf + if monitor is None: + rank_zero_deprecation( + "The `EarlyStopping(monitor)` argument will be required starting in v1.6." + " For backward compatibility, setting this to `early_stop_on`." + ) + self.monitor = monitor or "early_stop_on" + def _validate_condition_metric(self, logs): monitor_val = logs.get(self.monitor) @@ -190,7 +196,7 @@ def _run_early_stopping_check(self, trainer) -> None: # when in dev debugging trainer.dev_debugger.track_early_stopping_history(self, current) - should_stop, reason = self._evalute_stopping_criteria(current) + should_stop, reason = self._evalute_stopping_criteria(current, trainer) # stop every ddp process if any world process decides to stop should_stop = trainer.training_type_plugin.reduce_boolean_decision(should_stop) @@ -200,7 +206,7 @@ def _run_early_stopping_check(self, trainer) -> None: if reason and self.verbose: self._log_info(trainer, reason) - def _evalute_stopping_criteria(self, current: torch.Tensor) -> Tuple[bool, str]: + def _evalute_stopping_criteria(self, current: torch.Tensor, trainer: 'pl.Trainer') -> Tuple[bool, str]: should_stop = False reason = None if self.check_finite and not torch.isfinite(current): @@ -223,7 +229,7 @@ def _evalute_stopping_criteria(self, current: torch.Tensor) -> Tuple[bool, str]: f" {self.monitor} = {current} {self.order_dict[self.mode]} {self.divergence_threshold}." " Signaling Trainer to stop." 
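             # e.g. with divergence_threshold=10.0 and mode='min', a current value of 12.3
             # lands in this branch: the reason above is logged and training stops immediately.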
             )
-        elif self.monitor_op(current - self.min_delta, self.best_score):
+        elif self.monitor_op(current - self.min_delta, self.best_score.to(trainer.lightning_module.device)):
             should_stop = False
             reason = self._improvement_message(current)
             self.best_score = current
diff --git a/pytorch_lightning/callbacks/finetuning.py b/pytorch_lightning/callbacks/finetuning.py
index a6c13d1b0c0db..fe7e5f7bc09eb 100644
--- a/pytorch_lightning/callbacks/finetuning.py
+++ b/pytorch_lightning/callbacks/finetuning.py
@@ -20,7 +20,7 @@
 from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union
 
 import torch
-from torch.nn import Module
+from torch.nn import Module, ModuleDict
 from torch.nn.modules.batchnorm import _BatchNorm
 from torch.optim.optimizer import Optimizer
 
@@ -63,7 +63,7 @@ def configure_optimizer(self):
 
         class FeatureExtractorFreezeUnfreeze(BaseFinetuning):
 
-            def __init__(self, unfreeze_at_epoch=10)
+            def __init__(self, unfreeze_at_epoch=10):
                 self._unfreeze_at_epoch = unfreeze_at_epoch
 
             def freeze_before_training(self, pl_module):
@@ -105,7 +105,8 @@ def on_load_checkpoint(
     @staticmethod
     def flatten_modules(modules: Union[Module, Iterable[Union[Module, Iterable]]]) -> List[Module]:
         """
-        This function is used to flatten a module or an iterable of modules into a list of its modules.
+        This function is used to flatten a module or an iterable of modules into a list of its leaf modules (modules
+        with no children) and parent modules that have parameters directly themselves.
 
         Args:
             modules: A given module or an iterable of modules
@@ -113,6 +114,9 @@ def flatten_modules(modules: Union[Module, Iterable[Union[Module, Iterable]]]) -
         Returns:
             List of modules
         """
+        if isinstance(modules, ModuleDict):
+            modules = modules.values()
+
         if isinstance(modules, Iterable):
             _modules = []
             for m in modules:
@@ -121,8 +125,8 @@ def flatten_modules(modules: Union[Module, Iterable[Union[Module, Iterable]]]) -
         else:
             _modules = modules.modules()
 
-        # Leaf nodes in the graph have no children, so we use that to filter
-        return [m for m in _modules if not list(m.children())]
+        # Capture all leaf modules as well as parent modules that have parameters directly themselves
+        return [m for m in _modules if not list(m.children()) or m._parameters]
 
     @staticmethod
     def filter_params(
@@ -136,7 +140,6 @@ def filter_params(
             modules: A given module or an iterable of modules
             train_bn: Whether to train BatchNorm module
             requires_grad: Whether to create a generator for trainable or non-trainable parameters.
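To sanity-check the new `flatten_modules` filter above (`not list(m.children()) or m._parameters`), here is a tiny self-contained probe; the `Parent` module is hypothetical:

```python
import torch
import torch.nn as nn


class Parent(nn.Module):

    def __init__(self):
        super().__init__()
        self.child = nn.Linear(4, 4)              # a leaf module
        self.scale = nn.Parameter(torch.ones(1))  # a parameter owned by the parent itself


model = Parent()
kept = [m for m in model.modules() if not list(m.children()) or m._parameters]
# the Linear leaf is kept, and so is Parent because it holds `scale` directly
assert any(isinstance(m, nn.Linear) for m in kept)
assert any(type(m) is Parent for m in kept)
```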
- Returns: Generator """ @@ -144,7 +147,8 @@ def filter_params( for mod in modules: if isinstance(mod, _BatchNorm) and not train_bn: continue - for param in mod.parameters(): + # recursion could yield duplicate parameters for parent modules w/ parameters so disabling it + for param in mod.parameters(recurse=False): if param.requires_grad == requires_grad: yield param @@ -158,7 +162,8 @@ def make_trainable(modules: Union[Module, Iterable[Union[Module, Iterable]]]) -> """ modules = BaseFinetuning.flatten_modules(modules) for module in modules: - for param in module.parameters(): + # recursion could yield duplicate parameters for parent modules w/ parameters so disabling it + for param in module.parameters(recurse=False): param.requires_grad = True @staticmethod @@ -178,7 +183,8 @@ def freeze(modules: Union[Module, Iterable[Union[Module, Iterable]]], train_bn: if isinstance(mod, _BatchNorm) and train_bn: BaseFinetuning.make_trainable(mod) else: - for param in mod.parameters(): + # recursion could yield duplicate parameters for parent modules w/ parameters so disabling it + for param in mod.parameters(recurse=False): param.requires_grad = False @staticmethod @@ -282,7 +288,7 @@ def _store( def on_train_epoch_start(self, trainer, pl_module): """Called when the epoch begins.""" - for opt_idx, optimizer in trainer.train_loop.get_active_optimizers(): + for opt_idx, optimizer in trainer.fit_loop.epoch_loop.batch_loop.get_active_optimizers(): num_param_groups = len(optimizer.param_groups) self.finetune_function(pl_module, trainer.current_epoch, optimizer, opt_idx) current_param_groups = optimizer.param_groups diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index 410f8b319c239..d3afcde35f55e 100644 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -19,8 +19,10 @@ Monitor and logs learning rate for lr schedulers during training. """ +from collections import defaultdict +from typing import Any, DefaultDict, Dict, List, Optional, Set, Type -from typing import Dict, List, Optional +from torch.optim.optimizer import Optimizer from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities import rank_zero_warn @@ -53,7 +55,9 @@ class LearningRateMonitor(Callback): In case of multiple optimizers of same type, they will be named ``Adam``, ``Adam-1`` etc. If a optimizer has multiple parameter groups they will be named ``Adam/pg1``, ``Adam/pg2`` etc. To control naming, pass in a - ``name`` keyword in the construction of the learning rate schdulers + ``name`` keyword in the construction of the learning rate schedulers. + A ``name`` keyword can also be used for parameter groups in the + construction of the optimizer. Example:: @@ -65,6 +69,19 @@ def configure_optimizer(self): } return [optimizer], [lr_scheduler] + Example:: + + def configure_optimizer(self): + optimizer = torch.optim.SGD( + [{ + 'params': [p for p in self.parameters()], + 'name': 'my_parameter_group_name' + }], + lr=0.1 + ) + lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, ...) 
+            return [optimizer], [lr_scheduler]
+
     """
 
     def __init__(self, logging_interval: Optional[str] = None, log_momentum: bool = False):
@@ -138,6 +155,9 @@ def on_train_epoch_start(self, trainer, *args, **kwargs):
 
     def _extract_stats(self, trainer, interval: str) -> Dict[str, float]:
         latest_stat = {}
 
+        names = self._find_names(trainer.lr_schedulers, add_lr_sch_names=False)
+        self._remap_keys(names)
+
         for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers):
             if scheduler['interval'] == interval or interval == 'any':
                 opt = scheduler['scheduler'].optimizer
@@ -145,22 +165,33 @@ def _extract_stats(self, trainer, interval: str) -> Dict[str, float]:
                 use_betas = 'betas' in opt.defaults
 
                 for i, pg in enumerate(param_groups):
-                    suffix = f'/pg{i + 1}' if len(param_groups) > 1 else ''
-                    lr = self._extract_lr(param_group=pg, name=f'{name}{suffix}')
+                    name_and_suffix = self._add_suffix(name, param_groups, i)
+                    lr = self._extract_lr(pg, name_and_suffix)
                     latest_stat.update(lr)
                     momentum = self._extract_momentum(
-                        param_group=pg, name=f'{name}-momentum{suffix}', use_betas=use_betas
+                        param_group=pg, name=name_and_suffix.replace(name, f'{name}-momentum'), use_betas=use_betas
                     )
                     latest_stat.update(momentum)
 
         return latest_stat
 
-    def _extract_lr(self, param_group, name: str) -> Dict[str, float]:
+    def _extract_lr(self, param_group: Dict[str, Any], name: str) -> Dict[str, Any]:
         lr = param_group.get('lr')
         self.lrs[name].append(lr)
         return {name: lr}
 
-    def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str, float]:
+    def _remap_keys(self, names: List[str], token: str = '/pg1') -> None:
+        """
+        This function is used to remap the keys if the number of param groups for a given optimizer increased.
+        """
+        for new_name in names:
+            old_name = new_name.replace(token, '')
+            if token in new_name and old_name in self.lrs:
+                self.lrs[new_name] = self.lrs.pop(old_name)
+            elif new_name not in self.lrs:
+                self.lrs[new_name] = []
+
+    def _extract_momentum(self, param_group: Dict[str, Any], name: str, use_betas: bool) -> Dict[str, float]:
         if not self.log_momentum:
             return {}
 
@@ -168,35 +199,65 @@ def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str
             self.last_momentum_values[name] = momentum
         return {name: momentum}
 
-    def _find_names(self, lr_schedulers) -> List[str]:
-        # Create uniqe names in the case we have multiple of the same learning
-        # rate schduler + multiple parameter groups
+    def _add_prefix(
+        self, name: str, optimizer_cls: Type[Optimizer], seen_optimizer_types: DefaultDict[Type[Optimizer], int]
+    ) -> str:
+        if optimizer_cls not in seen_optimizer_types:
+            return name
+        count = seen_optimizer_types[optimizer_cls]
+        return name + f'-{count - 1}' if count > 1 else name
+
+    def _add_suffix(self, name: str, param_groups: List[Dict], param_group_index: int, use_names: bool = True) -> str:
+        if len(param_groups) > 1:
+            if not use_names:
+                return f'{name}/pg{param_group_index+1}'
+            pg_name = param_groups[param_group_index].get('name', f'pg{param_group_index+1}')
+            return f'{name}/{pg_name}'
+        elif use_names:
+            pg_name = param_groups[param_group_index].get('name')
+            return f'{name}/{pg_name}' if pg_name else name
+        return name
+
+    def _duplicate_param_group_names(self, param_groups: List[Dict]) -> Set[str]:
+        names = [pg.get('name', f'pg{i}') for i, pg in enumerate(param_groups, start=1)]
+        unique = set(names)
+        if len(names) == len(unique):
+            return set()
+        return set(n for n in names if names.count(n) > 1)
+
+    def _find_names(self, lr_schedulers: List, add_lr_sch_names: bool = 
True) -> List[str]: + # Create unique names in the case we have multiple of the same learning + # rate scheduler + multiple parameter groups names = [] + seen_optimizers = [] + seen_optimizer_types = defaultdict(int) for scheduler in lr_schedulers: sch = scheduler['scheduler'] if scheduler['name'] is not None: name = scheduler['name'] else: - opt_name = 'lr-' + sch.optimizer.__class__.__name__ - i, name = 1, opt_name + name = 'lr-' + sch.optimizer.__class__.__name__ - # Multiple schduler of the same type - while True: - if name not in names: - break - i, name = i + 1, f'{opt_name}-{i}' + seen_optimizers.append(sch.optimizer) + optimizer_cls = type(sch.optimizer) + if scheduler['name'] is None: + seen_optimizer_types[optimizer_cls] += 1 - # Multiple param groups for the same schduler + # Multiple param groups for the same scheduler param_groups = sch.optimizer.param_groups + duplicates = self._duplicate_param_group_names(param_groups) + if duplicates: + raise MisconfigurationException( + 'A single `Optimizer` cannot have multiple parameter groups with identical ' + f'`name` values. {name} has duplicated parameter group names {duplicates}' + ) - if len(param_groups) != 1: - for i, pg in enumerate(param_groups): - temp = f'{name}/pg{i + 1}' - names.append(temp) - else: - names.append(name) + name = self._add_prefix(name, optimizer_cls, seen_optimizer_types) + + names.extend(self._add_suffix(name, param_groups, i) for i in range(len(param_groups))) - self.lr_sch_names.append(name) + if add_lr_sch_names: + self.lr_sch_names.append(name) return names diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 1bf8046dcee5b..ec2f5e0d990a5 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -26,6 +26,7 @@ from datetime import timedelta from pathlib import Path from typing import Any, Callable, Dict, Optional, Union +from weakref import proxy import numpy as np import torch @@ -101,7 +102,7 @@ class ModelCheckpoint(Callback): saved (``model.save_weights(filepath)``), else the full model is saved (``model.save(filepath)``). every_n_train_steps: Number of training steps between checkpoints. - If ``every_n_train_steps == None or every_n_train_steps == 0``, we skip saving during training + If ``every_n_train_steps == None or every_n_train_steps == 0``, we skip saving during training. To disable, set ``every_n_train_steps = 0``. This value must be ``None`` or non-negative. This must be mutually exclusive with ``train_time_interval`` and ``every_n_val_epochs``. train_time_interval: Checkpoints are monitored at the specified time interval. @@ -109,8 +110,9 @@ class ModelCheckpoint(Callback): of time it takes to process a single training batch. This is not guaranteed to execute at the exact time specified, but should be close. This must be mutually exclusive with ``every_n_train_steps`` and ``every_n_val_epochs``. + FIXME every_n_val_epochs: Number of validation epochs between checkpoints. - If ``every_n_val_epochs == None or every_n_val_epochs == 0``, we skip saving on validation end + If ``every_n_val_epochs == None or every_n_val_epochs == 0``, we skip saving on validation end. To disable, set ``every_n_val_epochs = 0``. This value must be ``None`` or non-negative. This must be mutually exclusive with ``every_n_train_steps`` and ``train_time_interval``. 
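As a usage sketch of the three mutually exclusive triggers documented here (per `__validate_init_configuration` further down, combining more than one raises a `MisconfigurationException`), each callback instance picks exactly one:

```python
from datetime import timedelta

from pytorch_lightning.callbacks import ModelCheckpoint

ckpt_by_steps = ModelCheckpoint(every_n_train_steps=1000)                   # step-count trigger
ckpt_by_time = ModelCheckpoint(train_time_interval=timedelta(minutes=30))  # wall-clock trigger
ckpt_by_epochs = ModelCheckpoint(every_n_val_epochs=2)                     # validation-epoch trigger
```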
        Setting both ``ModelCheckpoint(..., every_n_val_epochs=V)`` and
@@ -118,7 +120,7 @@ class ModelCheckpoint(Callback):
         will only save checkpoints at epochs 0 < E <= N
         where both values for ``every_n_val_epochs`` and ``check_val_every_n_epoch`` evenly divide E.
     period: Interval (number of epochs) between checkpoints.
-    save_on_train_epoch_end: TODO
+    save_on_train_epoch_end: FIXME
 
     .. warning::
        This argument has been deprecated in v1.3 and will be removed in v1.5.
@@ -203,7 +205,7 @@ def __init__(
         train_time_interval: Optional[timedelta] = None,
         every_n_val_epochs: Optional[int] = None,
         period: Optional[int] = None,
-        save_on_train_epoch_end: bool = True,
+        save_on_train_epoch_end: Optional[bool] = None,
     ):
         super().__init__()
         self.monitor = monitor
@@ -234,6 +236,10 @@ def on_pretrain_routine_start(self, trainer: 'pl.Trainer', pl_module: 'pl.Lightn
         """
         self.__resolve_ckpt_dir(trainer)
         self._save_function = trainer.save_checkpoint
+        if self._save_on_train_epoch_end is None:
+            # if the user runs validation multiple times per training epoch, we try to save the checkpoint
+            # after validation instead of at the end of the training epoch
+            self._save_on_train_epoch_end = trainer.val_check_interval == 1.0
 
     def on_train_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None:
         self._last_time_checked = time.monotonic()
@@ -275,11 +281,13 @@ def on_train_epoch_end(
     ) -> None:
         """ Save a checkpoint at the end of the training epoch. """
         if (
-            self._should_skip_saving_checkpoint(trainer) or self._save_on_train_epoch_end
-            # TODO: should every_n_val_epochs be repurposed to work for this too?
+            self._should_skip_saving_checkpoint(trainer) or not self._save_on_train_epoch_end
+            # FIXME: repurpose every_n_val_epochs to work for this hook
+            or self._every_n_val_epochs < 1 or (trainer.current_epoch + 1) % self._every_n_val_epochs != 0
         ):
             return
         # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates
+        # FIXME: last_global_step_saved wrong
         trainer.train_loop.global_step -= 1
         self.save_checkpoint(trainer)
         trainer.train_loop.global_step += 1
@@ -298,16 +306,16 @@ def on_train_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -
         Save a checkpoint at the very end of training.
 
         This will only save a checkpoint if `save_last` is also enabled
-        as the monitor metrics produced by training or validation steps or end of epochs
-        is not guaranteed to be available at this stage.
+        as the monitor metrics logged during training/validation steps or end of epochs
+        are not guaranteed to be available at this stage.
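Distilled to a pure function, the way the new `save_on_train_epoch_end=None` default is resolved in `on_pretrain_routine_start` above looks like this (a sketch, not the actual helper; the real code inlines the final comparison):

```python
def resolve_save_on_train_epoch_end(save_on_train_epoch_end, val_check_interval):
    if save_on_train_epoch_end is not None:
        return save_on_train_epoch_end  # an explicit user choice always wins
    # validation once per epoch -> saving at train epoch end is fine;
    # mid-epoch validation -> save after validation instead
    return val_check_interval == 1.0


assert resolve_save_on_train_epoch_end(None, 1.0) is True
assert resolve_save_on_train_epoch_end(None, 0.25) is False
assert resolve_save_on_train_epoch_end(False, 1.0) is False
```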
""" - if self._should_skip_saving_checkpoint(trainer) or not trainer.checkpoint_connector.has_trained: + if self._should_skip_saving_checkpoint(trainer): return if self.save_last and self.verbose: - rank_zero_info("Saving last checkpoint...") + rank_zero_info("Saving latest checkpoint...") # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates + monitor_candidates = self._monitor_candidates(trainer, trainer.current_epoch, trainer.global_step - 1) trainer.train_loop.global_step -= 1 - monitor_candidates = self._monitor_candidates(trainer) self._save_last_checkpoint(trainer, monitor_candidates) trainer.train_loop.global_step += 1 @@ -364,6 +372,10 @@ def save_checkpoint(self, trainer: 'pl.Trainer', unused: Optional['pl.LightningM # Mode 3: save last checkpoints self._save_last_checkpoint(trainer, monitor_candidates) + # notify loggers + if trainer.is_global_zero and trainer.logger: + trainer.logger.after_save_checkpoint(proxy(self)) + def _should_skip_saving_checkpoint(self, trainer: 'pl.Trainer') -> bool: from pytorch_lightning.trainer.states import TrainerFn return ( @@ -388,7 +400,7 @@ def __validate_init_configuration(self) -> None: every_n_train_steps_triggered = self._every_n_train_steps >= 1 every_n_val_epochs_triggered = self._every_n_val_epochs >= 1 train_time_interval_triggered = self._train_time_interval is not None - if (every_n_train_steps_triggered + every_n_val_epochs_triggered + train_time_interval_triggered > 1): + if every_n_train_steps_triggered + every_n_val_epochs_triggered + train_time_interval_triggered > 1: raise MisconfigurationException( f"Combination of parameters every_n_train_steps={self._every_n_train_steps}, " f"every_n_val_epochs={self._every_n_val_epochs} and train_time_interval={self._train_time_interval} " @@ -446,8 +458,11 @@ def __init_monitor_mode(self, mode: str) -> None: self.kth_value, self.mode = mode_dict[mode] def __init_triggers( - self, every_n_train_steps: Optional[int], every_n_val_epochs: Optional[int], - train_time_interval: Optional[timedelta], period: Optional[int] + self, + every_n_train_steps: Optional[int], + every_n_val_epochs: Optional[int], + train_time_interval: Optional[timedelta], + period: Optional[int], ) -> None: # Default to running once after each validation epoch if neither @@ -471,7 +486,6 @@ def __init_triggers( ' Please use `every_n_val_epochs` instead.' ) self._every_n_val_epochs = period - self._period = self._every_n_val_epochs @property @@ -512,15 +526,6 @@ def _del_model(self, trainer: 'pl.Trainer', filepath: str) -> None: log.debug(f"Removed checkpoint: {filepath}") def _save_model(self, trainer: 'pl.Trainer', filepath: str) -> None: - if trainer.training_type_plugin.rpc_enabled: - # RPCPlugin manages saving all model states - # TODO: the rpc plugin should wrap trainer.save_checkpoint - # instead of us having to do it here manually - trainer.training_type_plugin.rpc_save_model(trainer, self._do_save, filepath) - else: - self._do_save(trainer, filepath) - - def _do_save(self, trainer: 'pl.Trainer', filepath: str) -> None: # in debugging, track when we save checkpoints trainer.dev_debugger.track_checkpointing_history(filepath) @@ -679,10 +684,10 @@ def _add_backward_monitor_support(self, trainer: 'pl.Trainer') -> None: self.save_top_k = 1 if deprecation_warning: - warning_cache.warn( + warning_cache.deprecation( "Relying on `self.log('val_loss', ...)` to set the ModelCheckpoint monitor is deprecated in v1.2" " and will be removed in v1.4. 
Please, create your own `mc = ModelCheckpoint(monitor='your_monitor')`" - " and use it as `Trainer(callbacks=[mc])`.", DeprecationWarning + " and use it as `Trainer(callbacks=[mc])`.", ) def _validate_monitor_key(self, trainer: 'pl.Trainer') -> None: @@ -695,7 +700,10 @@ def _validate_monitor_key(self, trainer: 'pl.Trainer') -> None: f" {list(metrics.keys())}. " f"HINT: Did you call self.log('{self.monitor}', value) in the LightningModule?" ) - raise MisconfigurationException(m) + if not trainer.fit_loop.epoch_loop.val_loop._has_run: + warning_cache.warn(m) + else: + raise MisconfigurationException(m) def _get_metric_interpolated_filepath_name( self, diff --git a/pytorch_lightning/callbacks/prediction_writer.py b/pytorch_lightning/callbacks/prediction_writer.py index cbcff74ff0278..962877cc5a658 100644 --- a/pytorch_lightning/callbacks/prediction_writer.py +++ b/pytorch_lightning/callbacks/prediction_writer.py @@ -109,7 +109,7 @@ def on_predict_batch_end( if not self.interval.on_batch: return is_distributed = trainer.accelerator_connector.is_distributed - batch_indices = trainer.predict_loop.batch_indices if is_distributed else None + batch_indices = trainer.predict_loop.epoch_loop.current_batch_indices if is_distributed else None self.write_on_batch_end(trainer, pl_module, outputs, batch_indices, batch, batch_idx, dataloader_idx) def on_predict_epoch_end( diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 0fe05ff812e20..2fd4b8c25df19 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -200,7 +200,7 @@ def on_init_end(self, trainer): self._trainer = trainer def on_train_start(self, trainer, pl_module): - self._train_batch_idx = trainer.train_loop.batch_idx + self._train_batch_idx = trainer.fit_loop.batch_idx def on_train_epoch_start(self, trainer, pl_module): self._train_batch_idx = 0 diff --git a/pytorch_lightning/callbacks/pruning.py b/pytorch_lightning/callbacks/pruning.py index e7da752d1c844..ced8d29c14424 100644 --- a/pytorch_lightning/callbacks/pruning.py +++ b/pytorch_lightning/callbacks/pruning.py @@ -259,25 +259,26 @@ def _create_pruning_fn(self, pruning_fn: str, **kwargs: Any) -> Union[Callable, def _wrap_pruning_fn(pruning_fn: Callable, **kwargs: Any) -> Callable: return partial(pruning_fn, **kwargs) - def make_pruning_permanent(self, pl_module: LightningModule) -> None: + def make_pruning_permanent(self, module: nn.Module) -> None: """ Removes pruning buffers from any pruned modules Adapted from https://github.com/pytorch/pytorch/blob/1.7.1/torch/nn/utils/prune.py#L1176-L1180 """ - for _, module in pl_module.named_modules(): + for _, module in module.named_modules(): for k in list(module._forward_pre_hooks): hook = module._forward_pre_hooks[k] if isinstance(hook, pytorch_prune.BasePruningMethod): hook.remove(module) del module._forward_pre_hooks[k] - def _restore_original_weights(self, module: nn.Module, orig_module: nn.Module, tensor_name: str) -> None: - trained = getattr(module, tensor_name) - orig = getattr(orig_module, tensor_name) - if trained is None or orig is None: + @staticmethod + def _copy_param(new: nn.Module, old: nn.Module, name: str) -> None: + dst = getattr(new, name) + src = getattr(old, name) + if dst is None or src is None or not isinstance(dst, torch.Tensor) or not isinstance(src, torch.Tensor): return - trained.data = orig.data.to(trained.device) + dst.data = src.data.to(dst.device) def apply_lottery_ticket_hypothesis(self) -> None: r""" @@ -292,14 +293,6 @@ 
def apply_lottery_ticket_hypothesis(self) -> None: The ``resample_parameters`` argument can be used to reset the parameters with a new :math:`\theta_z \sim \mathcal{D}_\theta` """ # noqa: E501 - - def copy_param(new: nn.Module, old: nn.Module, name: str) -> None: - dst = getattr(new, name) - src = getattr(old, name) - if dst is None or src is None or not isinstance(dst, torch.Tensor) or not isinstance(src, torch.Tensor): - return - dst.data = src.data.to(dst.device) - assert self._original_layers is not None for d in self._original_layers.values(): copy = d["data"] @@ -309,7 +302,7 @@ def copy_param(new: nn.Module, old: nn.Module, name: str) -> None: copy.reset_parameters() for i, name in names: new, new_name = self._parameters_to_prune[i] - copy_param(new, copy, name) + self._copy_param(new, copy, name) def _apply_local_pruning(self, amount: float) -> None: for module, name in self._parameters_to_prune: diff --git a/pytorch_lightning/callbacks/stochastic_weight_avg.py b/pytorch_lightning/callbacks/stochastic_weight_avg.py index 3ec7774d5f8b6..0cd788c8c8647 100644 --- a/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -159,7 +159,7 @@ def on_fit_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule'): self._max_epochs = trainer.max_epochs if self._model_contains_batch_norm: # virtually increase max_epochs to perform batch norm update on latest epoch. - trainer.train_loop.max_epochs += 1 + trainer.fit_loop.max_epochs += 1 def on_train_epoch_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule'): if trainer.current_epoch == self.swa_start: @@ -220,19 +220,20 @@ def on_train_epoch_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningMo # performing only one pass over the train data-loader to compute activation statistics # Therefore, we will virtually increase `num_training_batches` by 1 and skip backward. trainer.num_training_batches += 1 - trainer.train_loop._skip_backward = True + trainer.fit_loop._skip_backward = True self._accumulate_grad_batches = trainer.accumulate_grad_batches - trainer.accumulate_grad_batches = len(trainer.train_dataloader) + + trainer.accumulate_grad_batches = trainer.num_training_batches def on_train_epoch_end(self, trainer: 'pl.Trainer', *args): - trainer.train_loop._skip_backward = False + trainer.fit_loop._skip_backward = False def on_train_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule'): if self._model_contains_batch_norm and trainer.current_epoch == self.swa_end + 1: # BatchNorm epoch update. Reset state trainer.accumulate_grad_batches = self._accumulate_grad_batches trainer.num_training_batches -= 1 - trainer.train_loop.max_epochs -= 1 + trainer.fit_loop.max_epochs -= 1 self.reset_momenta() elif trainer.current_epoch == self.swa_end: # Last SWA epoch. 
Transfer weights from average model to pl_module @@ -265,7 +266,7 @@ def reset_momenta(self): """ Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165 """ - for bn_module in self.momenta.keys(): + for bn_module in self.momenta: bn_module.momentum = self.momenta[bn_module] @staticmethod diff --git a/pytorch_lightning/callbacks/timer.py b/pytorch_lightning/callbacks/timer.py index 9b93499c82ea1..ba42419141253 100644 --- a/pytorch_lightning/callbacks/timer.py +++ b/pytorch_lightning/callbacks/timer.py @@ -170,4 +170,5 @@ def _check_time_remaining(self, trainer: 'pl.Trainer') -> None: should_stop = trainer.accelerator.broadcast(should_stop) trainer.should_stop = trainer.should_stop or should_stop if should_stop and self._verbose: - rank_zero_info(f"Time limit reached. Elapsed time is {self.time_elapsed}. Signaling Trainer to stop.") + elapsed = timedelta(seconds=int(self.time_elapsed(RunningStage.TRAINING))) + rank_zero_info(f"Time limit reached. Elapsed time is {elapsed}. Signaling Trainer to stop.") diff --git a/pytorch_lightning/core/datamodule.py b/pytorch_lightning/core/datamodule.py index 84210e9d7b667..df3fa26a24a17 100644 --- a/pytorch_lightning/core/datamodule.py +++ b/pytorch_lightning/core/datamodule.py @@ -20,7 +20,7 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks -from pytorch_lightning.utilities import rank_zero_only +from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.argparse import add_argparse_args, from_argparse_args, get_init_arguments_and_types @@ -160,7 +160,13 @@ def has_prepared_data(self) -> bool: Returns: bool: True if ``datamodule.prepare_data()`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation( + 'DataModule property `has_prepared_data` was deprecated in v1.4 and will be removed in v1.6.' + ) return self._has_prepared_data @property @@ -169,7 +175,11 @@ def has_setup_fit(self) -> bool: Returns: bool: True ``if datamodule.setup(stage='fit')`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation('DataModule property `has_setup_fit` was deprecated in v1.4 and will be removed in v1.6.') return self._has_setup_fit @property @@ -178,7 +188,13 @@ def has_setup_validate(self) -> bool: Returns: bool: True if ``datamodule.setup(stage='validate')`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation( + 'DataModule property `has_setup_validate` was deprecated in v1.4 and will be removed in v1.6.' + ) return self._has_setup_validate @property @@ -187,7 +203,13 @@ def has_setup_test(self) -> bool: Returns: bool: True if ``datamodule.setup(stage='test')`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation( + 'DataModule property `has_setup_test` was deprecated in v1.4 and will be removed in v1.6.' + ) return self._has_setup_test @property @@ -196,7 +218,13 @@ def has_setup_predict(self) -> bool: Returns: bool: True if ``datamodule.setup(stage='predict')`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation( + 'DataModule property `has_setup_predict` was deprecated in v1.4 and will be removed in v1.6.' 
+ ) return self._has_setup_predict @property @@ -205,7 +233,13 @@ def has_teardown_fit(self) -> bool: Returns: bool: True ``if datamodule.teardown(stage='fit')`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation( + 'DataModule property `has_teardown_fit` was deprecated in v1.4 and will be removed in v1.6.' + ) return self._has_teardown_fit @property @@ -214,7 +248,13 @@ def has_teardown_validate(self) -> bool: Returns: bool: True if ``datamodule.teardown(stage='validate')`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation( + 'DataModule property `has_teardown_validate` was deprecated in v1.4 and will be removed in v1.6.' + ) return self._has_teardown_validate @property @@ -223,7 +263,13 @@ def has_teardown_test(self) -> bool: Returns: bool: True if ``datamodule.teardown(stage='test')`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation( + 'DataModule property `has_teardown_test` was deprecated in v1.4 and will be removed in v1.6.' + ) return self._has_teardown_test @property @@ -232,7 +278,13 @@ def has_teardown_predict(self) -> bool: Returns: bool: True if ``datamodule.teardown(stage='predict')`` has been called. False by default. + + .. deprecated:: v1.4 + Will be removed in v1.6.0. """ + rank_zero_deprecation( + 'DataModule property `has_teardown_predict` was deprecated in v1.4 and will be removed in v1.6.' + ) return self._has_teardown_predict @classmethod @@ -329,7 +381,7 @@ def test_dataloader(): def __new__(cls, *args: Any, **kwargs: Any) -> 'LightningDataModule': obj = super().__new__(cls) # track `DataHooks` calls and run `prepare_data` only on rank zero - obj.prepare_data = cls._track_data_hook_calls(obj, rank_zero_only(obj.prepare_data)) + obj.prepare_data = cls._track_data_hook_calls(obj, obj.prepare_data) obj.setup = cls._track_data_hook_calls(obj, obj.setup) obj.teardown = cls._track_data_hook_calls(obj, obj.teardown) return obj @@ -381,8 +433,13 @@ def wrapped_fn(*args: str, **kwargs: Optional[str]) -> Any: has_run = obj._has_prepared_data obj._has_prepared_data = True - if not has_run: - return fn(*args, **kwargs) + if has_run: + rank_zero_deprecation( + f"DataModule.{name} has already been called, so it will not be called again. " + f"In v1.6 this behavior will change to always call DataModule.{name}." + ) + else: + fn(*args, **kwargs) return wrapped_fn diff --git a/pytorch_lightning/core/grads.py b/pytorch_lightning/core/grads.py index 30a2f0ae7e38f..f6a0d41035460 100644 --- a/pytorch_lightning/core/grads.py +++ b/pytorch_lightning/core/grads.py @@ -18,7 +18,7 @@ from torch.nn import Module -from pytorch_lightning.utilities.distributed import rank_zero_deprecation +from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.grads import grad_norm as new_grad_norm diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 0ad6b131ad14b..50b058c3c24c2 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -13,14 +13,13 @@ # limitations under the License. 
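The deprecated `DataModule` properties above all follow the same recipe: the attribute keeps working, but reading it now emits a deprecation message. A generic sketch of that pattern, with plain `warnings` standing in for Lightning's `rank_zero_deprecation` helper:

```python
import warnings


class Thing:

    def __init__(self):
        self._has_prepared_data = False

    @property
    def has_prepared_data(self) -> bool:
        # warn on access, then return the old value unchanged
        warnings.warn(
            "`has_prepared_data` was deprecated in v1.4 and will be removed in v1.6.",
            DeprecationWarning,
        )
        return self._has_prepared_data


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = Thing().has_prepared_data
assert caught and issubclass(caught[0].category, DeprecationWarning)
```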
"""Various hooks to be used in the Lightning code.""" -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import torch from torch.optim.optimizer import Optimizer -from torch.utils.data import DataLoader from pytorch_lightning.utilities import move_data_to_device, rank_zero_warn -from pytorch_lightning.utilities.types import STEP_OUTPUT +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, STEP_OUTPUT, TRAIN_DATALOADERS class ModelHooks: @@ -428,14 +427,13 @@ def teardown(self, stage: Optional[str] = None) -> None: stage: either ``'fit'``, ``'validate'``, ``'test'``, or ``'predict'`` """ - def train_dataloader(self) -> Union[DataLoader, List[DataLoader], Dict[str, DataLoader]]: + def train_dataloader(self) -> TRAIN_DATALOADERS: """ Implement one or more PyTorch DataLoaders for training. Return: - Either a single PyTorch :class:`~torch.utils.data.DataLoader` or a collection of these - (list, dict, nested lists and dicts). In the case of multiple dataloaders, please see - this :ref:`page ` + A collection of :class:`torch.utils.data.DataLoader` specifying training samples. + In the case of multiple dataloaders, please see this :ref:`page `. The dataloader you return will not be called every epoch unless you set :paramref:`~pytorch_lightning.trainer.Trainer.reload_dataloaders_every_epoch` to ``True``. @@ -503,7 +501,7 @@ def train_dataloader(self): """ rank_zero_warn("`train_dataloader` must be implemented to be used with the Lightning Trainer") - def test_dataloader(self) -> Union[DataLoader, List[DataLoader]]: + def test_dataloader(self) -> EVAL_DATALOADERS: r""" Implement one or multiple PyTorch DataLoaders for testing. @@ -533,7 +531,7 @@ def test_dataloader(self) -> Union[DataLoader, List[DataLoader]]: There is no need to set it yourself. Return: - Single or multiple PyTorch DataLoaders. + A :class:`torch.utils.data.DataLoader` or a sequence of them specifying testing samples. Example:: @@ -563,7 +561,7 @@ def test_dataloader(self): will have an argument ``dataloader_idx`` which matches the order here. """ - def val_dataloader(self) -> Union[DataLoader, List[DataLoader]]: + def val_dataloader(self) -> EVAL_DATALOADERS: r""" Implement one or multiple PyTorch DataLoaders for validation. @@ -584,7 +582,7 @@ def val_dataloader(self) -> Union[DataLoader, List[DataLoader]]: There is no need to set it yourself. Return: - Single or multiple PyTorch DataLoaders. + A :class:`torch.utils.data.DataLoader` or a sequence of them specifying validation samples. Examples:: @@ -614,7 +612,7 @@ def val_dataloader(self): will have an argument ``dataloader_idx`` which matches the order here. """ - def predict_dataloader(self) -> Union[DataLoader, List[DataLoader]]: + def predict_dataloader(self) -> EVAL_DATALOADERS: r""" Implement one or multiple PyTorch DataLoaders for prediction. @@ -632,7 +630,7 @@ def predict_dataloader(self) -> Union[DataLoader, List[DataLoader]]: There is no need to set it yourself. Return: - Single or multiple PyTorch DataLoaders. + A :class:`torch.utils.data.DataLoader` or a sequence of them specifying prediction samples. Note: In the case where you return multiple prediction dataloaders, the :meth:`predict` @@ -807,7 +805,8 @@ def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: else you might want to save. Args: - checkpoint: Checkpoint to be saved + checkpoint: The full checkpoint dictionary before it gets dumped to a file. + Implementations of this hook can insert additional data into this dictionary. 
Example:: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 74c1ef442f993..2478f698e659b 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -24,30 +24,31 @@ import uuid from abc import ABC from argparse import Namespace -from functools import partial from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union +import numpy as np import torch from torch import ScriptModule, Tensor from torch.nn import Module from torch.optim.optimizer import Optimizer +from torchmetrics import Metric from pytorch_lightning.core.grads import GradInformation from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, ModelIO, PRIMITIVE_TYPES -from pytorch_lightning.core.step_result import Result +from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import FxValidator from pytorch_lightning.utilities import rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin -from pytorch_lightning.utilities.distributed import sync_ddp_if_available, tpu_distributed +from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, save_hyperparameters from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature -from pytorch_lightning.utilities.types import _METRIC, EPOCH_OUTPUT, STEP_OUTPUT +from pytorch_lightning.utilities.types import _METRIC_COLLECTION, EPOCH_OUTPUT, STEP_OUTPUT from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() @@ -80,6 +81,7 @@ class LightningModule( "model_size", "automatic_optimization", "truncated_bptt_steps", + "loaded_optimizer_states_dict", ] + DeviceDtypeModuleMixin.__jit_unused_properties__ def __init__(self, *args: Any, **kwargs: Any) -> None: @@ -89,7 +91,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # torch/nn/modules/module.py#L227) torch._C._log_api_usage_once(f"lightning.module.{self.__class__.__name__}") - self.loaded_optimizer_states_dict = {} + self._loaded_optimizer_states_dict = {} #: Pointer to the trainer object self.trainer = None @@ -106,13 +108,13 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # optionally can be set by user self._example_input_array = None self._datamodule = None - self._results: Optional[Result] = None self._current_fx_name: Optional[str] = None self._running_manual_backward: bool = False self._current_dataloader_idx: Optional[int] = None self._automatic_optimization: bool = True self._truncated_bptt_steps: int = 0 self._param_requires_grad_state = dict() + self._metric_attributes: Optional[Dict[int, str]] = None def optimizers(self, use_pl_optimizer: bool = True) -> Union[Optimizer, List[Optimizer], List[LightningOptimizer]]: if use_pl_optimizer: @@ -170,12 +172,31 @@ def example_input_array(self, example: Any) -> None: @property def 
datamodule(self) -> Any:
-        rank_zero_deprecation(
+        warning_cache.deprecation(
             "The `LightningModule.datamodule` property is deprecated in v1.3 and will be removed in v1.5."
-            " Access the datamodule through using `self.trainer.datamodule` instead."
+            " Access the datamodule through `self.trainer.datamodule` instead.",
+            stacklevel=6,
         )
         return self._datamodule
 
+    @property
+    def loaded_optimizer_states_dict(self) -> dict:
+        warning_cache.deprecation(
+            "The `LightningModule.loaded_optimizer_states_dict` property is deprecated in v1.4"
+            " and will be removed in v1.6.",
+            stacklevel=6,
+        )
+        return self._loaded_optimizer_states_dict
+
+    @loaded_optimizer_states_dict.setter
+    def loaded_optimizer_states_dict(self, val: dict) -> None:
+        warning_cache.deprecation(
+            "The `LightningModule.loaded_optimizer_states_dict` property is deprecated in v1.4"
+            " and will be removed in v1.6.",
+            stacklevel=6,
+        )
+        self._loaded_optimizer_states_dict = val
+
     @datamodule.setter
     def datamodule(self, datamodule: Any) -> None:
         self._datamodule = datamodule
@@ -225,10 +246,10 @@ def _apply_batch_transfer_handler(
         if is_param_in_hook_signature(self.transfer_batch_to_device, 'dataloader_idx'):
             batch = self.transfer_batch_to_device(batch, device, dataloader_idx)
         else:
-            warning_cache.warn(
+            warning_cache.deprecation(
                 "`transfer_batch_to_device` hook signature has changed in v1.4."
                 " `dataloader_idx` parameter has been added to it. Support for"
-                " the old signature will be removed in v1.6", DeprecationWarning
+                " the old signature will be removed in v1.6"
             )
             batch = self.transfer_batch_to_device(batch, device)
 
@@ -259,19 +280,22 @@ def forward(self, x):
     def log(
         self,
         name: str,
-        value: Any,
+        value: _METRIC_COLLECTION,
         prog_bar: bool = False,
         logger: bool = True,
         on_step: Optional[bool] = None,
         on_epoch: Optional[bool] = None,
-        reduce_fx: Callable = torch.mean,
+        reduce_fx: Union[str, Callable] = 'default',  # TODO: change to 'mean' when `sync_dist_op` is removed in 1.6
         tbptt_reduce_fx: Optional = None,  # noqa: Remove in 1.6
         tbptt_pad_token: Optional = None,  # noqa: Remove in 1.6
         enable_graph: bool = False,
         sync_dist: bool = False,
-        sync_dist_op: Union[Any, str] = 'mean',
+        sync_dist_op: Optional = None,  # noqa: Remove in 1.6
         sync_dist_group: Optional[Any] = None,
         add_dataloader_idx: bool = True,
+        batch_size: Optional[int] = None,
+        metric_attribute: Optional[str] = None,
+        rank_zero_only: Optional[bool] = None,
     ) -> None:
         """
         Log a key, value
@@ -294,20 +318,25 @@ def log(
             "validation_epoch_end*", "F", "T", "F", "T"
 
         Args:
-            name: key name
-            value: value name
+            name: key to log
+            value: value to log. Can be a ``float``, ``Tensor``, ``Metric``, or a dictionary of the former.
             prog_bar: if True logs to the progress bar
             logger: if True logs to the logger
             on_step: if True logs at this step. None auto-logs at the training_step but not validation/test_step
             on_epoch: if True logs epoch accumulated metrics. None auto-logs at the val/test step but not training_step
-            reduce_fx: reduction function over step values for end of epoch. Torch.mean by default
+            reduce_fx: reduction function over step values for end of epoch. :meth:`torch.mean` by default.
             enable_graph: if True, will not auto detach the graph
             sync_dist: if True, reduces the metric across GPUs/TPUs
-            sync_dist_op: the op to sync across GPUs/TPUs
             sync_dist_group: the ddp group to sync across
             add_dataloader_idx: if True, appends the index of the current dataloader to
                 the name (when using multiple).
If False, user needs to give unique names for each dataloader to not mix values
+            batch_size: Current batch_size. This will be directly inferred from the loaded batch,
+                but some data structures might need to explicitly provide it.
+            metric_attribute: To restore the metric state, Lightning requires the reference of the
+                :class:`torchmetrics.Metric` in your model. This is found automatically if it is a model attribute.
+            rank_zero_only: Whether the value will be logged only on rank 0. This will prevent synchronization which
+                would produce a deadlock as not all processes would perform this log call.
         """
         if tbptt_reduce_fx is not None:
             rank_zero_deprecation(
@@ -321,62 +350,105 @@ def log(
                 ' Please, open a discussion explaining your use-case in'
                 ' `https://github.com/PyTorchLightning/pytorch-lightning/discussions`'
             )
+        if sync_dist_op is not None:
+            rank_zero_deprecation(
+                f"`self.log(sync_dist_op='{sync_dist_op}')` is deprecated and will be removed in v1.6."
+                f" Use `self.log(reduce_fx={sync_dist_op})` instead."
+            )
+            if reduce_fx == 'default':
+                reduce_fx = sync_dist_op
+        elif reduce_fx == 'default':
+            reduce_fx = 'mean'
+
+        # check for invalid values
+        apply_to_collection(value, dict, self.__check_not_nested, name)
+        apply_to_collection(
+            value, object, self.__check_allowed, name, value, wrong_dtype=(numbers.Number, Metric, Tensor, dict)
+        )
 
-        if self._results is not None:
-            # TODO: if logged twice fail with crash
+        # set the default depending on the fx_name
+        on_step = self.__auto_choose_log_on_step(on_step)
+        on_epoch = self.__auto_choose_log_on_epoch(on_epoch)
 
-            # set the default depending on the fx_name
-            on_step = self.__auto_choose_log_on_step(on_step)
-            on_epoch = self.__auto_choose_log_on_epoch(on_epoch)
+        results = self.trainer._results
+        assert results is not None
+        assert self._current_fx_name is not None
+        FxValidator.check_logging(self._current_fx_name, on_step=on_step, on_epoch=on_epoch)
 
-            assert self._current_fx_name is not None
-            self.trainer.logger_connector.check_logging(self._current_fx_name, on_step=on_step, on_epoch=on_epoch)
+        # make sure user doesn't introduce logic for multi-dataloaders
+        if "/dataloader_idx_" in name:
+            raise MisconfigurationException(
+                f"You called `self.log` with the key `{name}`"
+                " but it should not contain information about `dataloader_idx`"
+            )
+
+        value = apply_to_collection(value, numbers.Number, self.__to_tensor)
+
+        if self.trainer.logger_connector.should_reset_tensors(self._current_fx_name):
+            # if we started a new epoch (running its first batch) the hook name has changed
+            # reset any tensors for the new hook name
+            results.reset(metrics=False, fx=self._current_fx_name)
 
-            # make sure user doesn't introduce logic for multi-dataloaders
-            if "/dataloader_idx_" in name:
+        if metric_attribute is None and isinstance(value, Metric):
+            if self._metric_attributes is None:
+                # compute once
+                self._metric_attributes = {
+                    id(module): name
+                    for name, module in self.named_modules() if isinstance(module, Metric)
+                }
+                if not self._metric_attributes:
+                    raise MisconfigurationException(
+                        "Could not find the `LightningModule` attribute for the `torchmetrics.Metric` logged."
+                        " You can fix this by setting an attribute for the metric in your `LightningModule`."
+                    )
+            # try to find the passed metric in the LightningModule
+            metric_attribute = self._metric_attributes.get(id(value), None)
+            if metric_attribute is None:
                 raise MisconfigurationException(
-                    f"Logged key: {name} should not contain information about dataloader_idx."
+ "Could not find the `LightningModule` attribute for the `torchmetrics.Metric` logged." + f" You can fix this by calling `self.log({name}, ..., metric_attribute=name)` where `name` is one" + f" of {list(self._metric_attributes.values())}" ) - value = self.__sync( - value, - sync_fn=self.trainer.training_type_plugin.reduce, - sync_dist=sync_dist, - sync_dist_op=sync_dist_op, - sync_dist_group=sync_dist_group, - device=self.device, - ) + results.log( + self._current_fx_name, + name, + value, + prog_bar=prog_bar, + logger=logger, + on_step=on_step, + on_epoch=on_epoch, + reduce_fx=reduce_fx, + enable_graph=enable_graph, + dataloader_idx=(self._current_dataloader_idx if add_dataloader_idx else None), + batch_size=batch_size, + sync_dist=sync_dist and distributed_available(), + sync_dist_fn=self.trainer.training_type_plugin.reduce or sync_ddp, + sync_dist_group=sync_dist_group, + metric_attribute=metric_attribute, + rank_zero_only=rank_zero_only, + ) - self._results.log( - name, - value, - prog_bar=prog_bar, - logger=logger, - on_step=on_step, - on_epoch=on_epoch, - reduce_fx=reduce_fx, - enable_graph=enable_graph, - dataloader_idx=(self._current_dataloader_idx if add_dataloader_idx else None), - ) + self.trainer.logger_connector._current_fx = self._current_fx_name def log_dict( self, - dictionary: dict, + dictionary: Mapping[str, _METRIC_COLLECTION], prog_bar: bool = False, logger: bool = True, on_step: Optional[bool] = None, on_epoch: Optional[bool] = None, - reduce_fx: Callable = torch.mean, - tbptt_reduce_fx: Optional = None, # noqa: Remove in 1.6 - tbptt_pad_token: Optional = None, # noqa: Remove in 1.6 + reduce_fx: Union[str, Callable] = 'default', # TODO: change to 'mean' when `sync_dist_op` is removed in 1.6 + tbptt_reduce_fx: Optional[Any] = None, # noqa: Remove in 1.6 + tbptt_pad_token: Optional[Any] = None, # noqa: Remove in 1.6 enable_graph: bool = False, sync_dist: bool = False, - sync_dist_op: Union[Any, str] = 'mean', + sync_dist_op: Optional[Any] = None, # noqa: Remove in 1.6 sync_dist_group: Optional[Any] = None, add_dataloader_idx: bool = True, ) -> None: """ - Log a dictonary of values at once + Log a dictionary of values at once Example:: @@ -384,15 +456,15 @@ def log_dict( self.log_dict(values) Args: - dictionary: key value pairs (str, tensors) + dictionary: key value pairs. + The values can be a ``float``, ``Tensor``, ``Metric``, or a dictionary of the former. prog_bar: if True logs to the progress base logger: if True logs to the logger on_step: if True logs at this step. None auto-logs for training_step but not validation/test_step on_epoch: if True logs epoch accumulated metrics. None auto-logs for val/test step but not training_step - reduce_fx: reduction function over step values for end of epoch. Torch.mean by default + reduce_fx: reduction function over step values for end of epoch. :meth:`torch.mean` by default. enable_graph: if True, will not auto detach the graph sync_dist: if True, reduces the metric across GPUs/TPUs - sync_dist_op: the op to sync across GPUs/TPUs sync_dist_group: the ddp group sync across add_dataloader_idx: if True, appends the index of the current dataloader to the name (when using multiple). 
If False, user needs to give unique names for
@@ -417,29 +489,32 @@ def log_dict(
         )
 
     @staticmethod
-    def __sync(
-        value: _METRIC,
-        sync_fn: Optional[Callable] = None,
-        sync_dist: bool = False,
-        sync_dist_op: Union[Any, str] = 'mean',
-        sync_dist_group: Optional[Any] = None,
-        device: torch.device = None,
-    ) -> _METRIC:
-        """Sync across workers when using distributed training"""
-        if not isinstance(value, (torch.Tensor, numbers.Number)):
-            return value
-
-        sync_fn = sync_fn or sync_ddp_if_available
-        dist_available = torch.distributed.is_available() and torch.distributed.is_initialized() or tpu_distributed()
-        if not sync_dist or not dist_available:
-            return value
-
-        # TODO: Find a way to make the reduction only once, so we don't need to clone.
-        if isinstance(value, torch.Tensor):
-            value = value.clone()
-        else:
-            value = torch.tensor(value, device=device, dtype=torch.float)
-        return sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op)
+    def __check_not_nested(value: dict, name: str) -> dict:
+        # self-imposed restriction. for simplicity
+        if any(isinstance(v, dict) for v in value.values()):
+            raise ValueError(f'`self.log({name}, {value})` was called, but nested dictionaries cannot be logged')
+        return value
+
+    @staticmethod
+    def __check_allowed(v: Any, name: str, value: Any) -> None:
+        raise ValueError(f'`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged')
+
+    def __to_tensor(self, value: numbers.Number) -> torch.Tensor:
+        return torch.tensor(value, device=self.device)
+
+    def log_grad_norm(self, grad_norm_dict: Dict[str, torch.Tensor]) -> None:
+        """Override this method to change the default behaviour of ``log_grad_norm``.
+
+        Args:
+            grad_norm_dict: Dictionary containing current grad norm metrics
+
+        Example::
+
+            # DEFAULT
+            def log_grad_norm(self, grad_norm_dict):
+                self.log_dict(grad_norm_dict, on_step=True, on_epoch=True, prog_bar=True, logger=True)
+        """
+        self.log_dict(grad_norm_dict, on_step=True, on_epoch=True, prog_bar=True, logger=True)
 
     def write_prediction(
         self, name: str, value: Union[torch.Tensor, List[torch.Tensor]], filename: str = 'predictions.pt'
@@ -468,7 +543,7 @@ def write_prediction(
             ' and will be removed in v1.5.'
         )
 
-        self.trainer.evaluation_loop.predictions._add_prediction(name, value, filename)
+        self.trainer._evaluation_loop.predictions._add_prediction(name, value, filename)
 
     def write_prediction_dict(self, predictions_dict: Dict[str, Any], filename: str = 'predictions.pt'):
         """
@@ -535,8 +610,7 @@ def all_gather(
         group = group if group is not None else torch.distributed.group.WORLD
         all_gather = self.trainer.accelerator.all_gather
         data = convert_to_tensors(data, device=self.device)
-        all_gather = partial(all_gather, group=group, sync_grads=sync_grads)
-        return apply_to_collection(data, torch.Tensor, all_gather)
+        return apply_to_collection(data, torch.Tensor, all_gather, group=group, sync_grads=sync_grads)
 
     def forward(self, *args, **kwargs) -> Any:
         r"""
@@ -1093,6 +1167,29 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int]
         By default, it calls :meth:`~pytorch_lightning.core.lightning.LightningModule.forward`.
         Override to add any processing logic.
 
+        The :meth:`~pytorch_lightning.core.lightning.LightningModule.predict_step` is used
+        to scale inference on multiple devices.
+
+        To prevent an OOM error, it is possible to use the :class:`~pytorch_lightning.callbacks.BasePredictionWriter`
+        callback to write the predictions to disk or database after each batch or on epoch end.
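+
+        For illustration, a minimal batch-level writer could look like the sketch below. It assumes the
+        ``BasePredictionWriter`` hook signatures from ``pytorch_lightning.callbacks``; the output directory
+        and file naming are hypothetical::
+
+            import os
+            import torch
+
+            from pytorch_lightning.callbacks import BasePredictionWriter
+
+            class SimpleWriter(BasePredictionWriter):
+
+                def __init__(self, output_dir, write_interval="batch"):
+                    super().__init__(write_interval)
+                    self.output_dir = output_dir
+
+                def write_on_batch_end(
+                    self, trainer, pl_module, prediction, batch_indices, batch, batch_idx, dataloader_idx
+                ):
+                    # persist each batch of predictions instead of accumulating them in memory
+                    torch.save(prediction, os.path.join(self.output_dir, f"{dataloader_idx}_{batch_idx}.pt"))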
+
+        When using a spawn-based accelerator, for example ``Trainer(accelerator="ddp_spawn")`` or
+        training on 8 TPU cores with ``Trainer(tpu_cores=8)``, predictions won't be returned, so the
+        :class:`~pytorch_lightning.callbacks.BasePredictionWriter` should be used to persist them.
+
+        Example::
+
+            class MyModel(LightningModule):
+
+                def predict_step(self, batch, batch_idx, dataloader_idx):
+                    return self(batch)
+
+            dm = ...
+            model = MyModel()
+            trainer = Trainer(gpus=2)
+            predictions = trainer.predict(model, dm)
+
+
         Args:
             batch: Current batch
             batch_idx: Index of current batch
@@ -1324,7 +1421,7 @@ def training_step(...):
 
         # backward
         self._running_manual_backward = True
-        self.trainer.train_loop.backward(loss, optimizer=None, opt_idx=None, *args, **kwargs)
+        self.trainer.fit_loop.epoch_loop.batch_loop.backward(loss, optimizer=None, opt_idx=None, *args, **kwargs)
         self._running_manual_backward = False
 
     def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None:
@@ -1417,13 +1514,14 @@ def optimizer_step(
         Override this method to adjust the default way the
         :class:`~pytorch_lightning.trainer.trainer.Trainer` calls each optimizer.
         By default, Lightning calls ``step()`` and ``zero_grad()`` as shown in the example
-        once per optimizer.
+        once per optimizer. This method (and ``zero_grad()``) won't be called during the
+        accumulation phase when ``Trainer(accumulate_grad_batches != 1)``.
 
         Warning:
             If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter
             to ``optimizer.step()`` function as shown in the examples. This ensures that
             ``training_step()``, ``optimizer.zero_grad()``, ``backward()`` are called within
-            :meth:`~pytorch_lightning.trainer.training_loop.TrainLoop.run_training_batch`.
+            :meth:`~pytorch_lightning.loops.training_batch_loop.TrainingBatchLoop.advance`.
 
         Args:
             epoch: Current epoch
@@ -1563,15 +1661,24 @@ def tbptt_split_batch(self, batch, split_size):
 
         return splits
 
-    def summarize(self, mode: Optional[str] = ModelSummary.MODE_DEFAULT) -> Optional[ModelSummary]:
+    def summarize(self, mode: Optional[str] = "top", max_depth: Optional[int] = None) -> Optional[ModelSummary]:
         model_summary = None
 
-        if mode in ModelSummary.MODES:
-            model_summary = ModelSummary(self, mode=mode)
-            log.info("\n" + str(model_summary))
-        elif mode is not None:
-            raise MisconfigurationException(f"`mode` can be None, {', '.join(ModelSummary.MODES)}, got {mode}")
+        # temporary mapping from mode to max_depth
+        if max_depth is None:
+            if mode in ModelSummary.MODES:
+                max_depth = ModelSummary.MODES[mode]
+                rank_zero_deprecation(
+                    f"Argument `mode` in `LightningModule.summarize` is deprecated in v1.4"
+                    f" and will be removed in v1.6. Use `max_depth={max_depth}` to replicate `mode={mode}` behavior."
+                )
+                model_summary = ModelSummary(self, max_depth=max_depth)
+            elif mode is not None:
+                raise MisconfigurationException(f"`mode` can be None, {', '.join(ModelSummary.MODES)}, got {mode}")
+        else:
+            model_summary = ModelSummary(self, max_depth=max_depth)
+        log.info("\n" + str(model_summary))
 
         return model_summary
 
     def freeze(self) -> None:
@@ -1628,7 +1735,7 @@ def get_progress_bar_dict(self):
 
         Dictionary with the items to be displayed in the progress bar.
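+
+        For illustration, a minimal override that hides the version number could look like this
+        sketch (``v_num`` is the key under which Lightning reports the logger version)::
+
+            def get_progress_bar_dict(self):
+                # don't show the version number in the progress bar
+                items = super().get_progress_bar_dict()
+                items.pop("v_num", None)
+                return items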
""" # call .item() only once but store elements without graphs - running_train_loss = self.trainer.train_loop.running_loss.mean() + running_train_loss = self.trainer.fit_loop.running_loss.mean() avg_training_loss = None if running_train_loss is not None: avg_training_loss = running_train_loss.cpu().item() @@ -1642,7 +1749,7 @@ def get_progress_bar_dict(self): module_tbptt_enabled = self.truncated_bptt_steps > 0 trainer_tbptt_enabled = self.trainer.truncated_bptt_steps is not None and self.trainer.truncated_bptt_steps > 0 if module_tbptt_enabled or trainer_tbptt_enabled: - tqdm_dict["split_idx"] = self.trainer.train_loop.split_idx + tqdm_dict["split_idx"] = self.trainer.fit_loop.split_idx if self.trainer.logger is not None and self.trainer.logger.version is not None: version = self.trainer.logger.version @@ -1926,3 +2033,30 @@ def model_size(self) -> float: size_mb = os.path.getsize(tmp_name) / 1e6 os.remove(tmp_name) return size_mb + + def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: + """Appends the :attr:`trainer.callback_metrics` dictionary to the given queue. + + To avoid issues with memory sharing, we cast the data to numpy. + + Args: + queue: the instance of the queue to append the data. + """ + callback_metrics: dict = apply_to_collection( + self.trainer.callback_metrics, torch.Tensor, lambda x: x.cpu().numpy() + ) # send as numpy to avoid issues with memory sharing + queue.put(callback_metrics) + + def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: + """Retrieve the :attr:`trainer.callback_metrics` dictionary from the given queue. + + To preserve consistency, we cast back the data to ``torch.Tensor``. + + Args: + queue: the instance of the queue from where to get the data. + """ + # NOTE: `add_to_queue` needs to be called before + callback_metrics: dict = queue.get() + self.trainer.callback_metrics.update( + apply_to_collection(callback_metrics, np.ndarray, lambda x: torch.tensor(x)) + ) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 2908064e3c68f..bba42d6997be3 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -21,9 +21,14 @@ import numpy as np import torch import torch.nn as nn +from torch import Tensor from torch.utils.hooks import RemovableHandle from pytorch_lightning.utilities import AMPType, DeviceType +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 +from pytorch_lightning.utilities.warnings import WarningCache + +warning_cache = WarningCache() PARAMETER_NUM_UNITS = [" ", "K", "M", "B", "T"] UNKNOWN_SIZE = "?" @@ -118,7 +123,7 @@ def layer_type(self) -> str: @property def num_parameters(self) -> int: """ Returns the number of parameters in this module. """ - return sum(np.prod(p.shape) for p in self._module.parameters()) + return sum(np.prod(p.shape) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()) class ModelSummary(object): @@ -126,11 +131,17 @@ class ModelSummary(object): Generates a summary of all layers in a :class:`~pytorch_lightning.core.lightning.LightningModule`. Args: - model: The model to summarize (also referred to as the root module) + model: The model to summarize (also referred to as the root module). 
mode: Can be one of - - `top` (default): only the top-level modules will be recorded (the children of the root module) - - `full`: summarizes all layers and their submodules in the root module + - `top` (default): only the top-level modules will be recorded (the children of the root module) + - `full`: summarizes all layers and their submodules in the root module + + .. deprecated:: v1.4 + This parameter was deprecated in v1.4 in favor of `max_depth` and will be removed in v1.6. + + max_depth: Maximum depth of modules to show. Use -1 to show all modules or 0 to show no + summary. Defaults to 1. The string representation of this summary prints a table with columns containing the name, type and number of parameters for each layer. @@ -155,7 +166,7 @@ class ModelSummary(object): ... return self.net(x) ... >>> model = LitModel() - >>> ModelSummary(model, mode='top') # doctest: +NORMALIZE_WHITESPACE + >>> ModelSummary(model, max_depth=1) # doctest: +NORMALIZE_WHITESPACE | Name | Type | Params | In sizes | Out sizes ------------------------------------------------------------ 0 | net | Sequential | 132 K | [10, 256] | [10, 512] @@ -164,7 +175,7 @@ class ModelSummary(object): 0 Non-trainable params 132 K Total params 0.530 Total estimated model params size (MB) - >>> ModelSummary(model, mode='full') # doctest: +NORMALIZE_WHITESPACE + >>> ModelSummary(model, max_depth=-1) # doctest: +NORMALIZE_WHITESPACE | Name | Type | Params | In sizes | Out sizes -------------------------------------------------------------- 0 | net | Sequential | 132 K | [10, 256] | [10, 512] @@ -177,14 +188,28 @@ class ModelSummary(object): 0.530 Total estimated model params size (MB) """ - MODE_TOP = "top" - MODE_FULL = "full" - MODE_DEFAULT = MODE_TOP - MODES = [MODE_FULL, MODE_TOP] + MODES = dict(top=1, full=-1) # TODO: remove in v1.6 - def __init__(self, model, mode: str = MODE_DEFAULT): + def __init__(self, model, mode: Optional[str] = None, max_depth: Optional[int] = 1): self._model = model - self._mode = mode + + # temporary mapping from mode to max_depth + if max_depth is None or mode is not None: + if mode in ModelSummary.MODES: + max_depth = ModelSummary.MODES[mode] + from pytorch_lightning.utilities import rank_zero_deprecation + rank_zero_deprecation( + f"Argument `mode` in `ModelSummary` is deprecated in v1.4" + f" and will be removed in v1.6. Use `max_depth={max_depth}` to replicate `mode={mode}` behaviour." + ) + else: + from pytorch_lightning.utilities.exceptions import MisconfigurationException + raise MisconfigurationException(f"`mode` can be {', '.join(ModelSummary.MODES)}, got {mode}.") + + if not isinstance(max_depth, int) or max_depth < -1: + raise ValueError(f"`max_depth` can be -1, 0 or > 0, got {max_depth}.") + + self._max_depth = max_depth self._layer_summary = self.summarize() # 1 byte -> 8 bits # TODO: how do we compute precisin_megabytes in case of mixed precision? 
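As a quick reference for the `mode` to `max_depth` migration implemented above, a hedged usage sketch (reusing the `LitModel` doctest model defined earlier in this file; the printed tables are omitted):

    from pytorch_lightning.core.memory import ModelSummary

    model = LitModel()
    ModelSummary(model, mode='full')   # deprecated: warns and maps to max_depth=-1
    ModelSummary(model, max_depth=-1)  # equivalent replacement for mode='full'
    ModelSummary(model, max_depth=0)   # new option: record no modules at all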
@@ -193,14 +218,14 @@ def __init__(self, model, mode: str = MODE_DEFAULT): @property def named_modules(self) -> List[Tuple[str, nn.Module]]: - if self._mode == ModelSummary.MODE_FULL: - mods = self._model.named_modules() - mods = list(mods)[1:] # do not include root module (LightningModule) - elif self._mode == ModelSummary.MODE_TOP: + if self._max_depth == 0: + mods = [] + elif self._max_depth == 1: # the children are the top-level modules mods = self._model.named_children() else: - mods = [] + mods = self._model.named_modules() + mods = list(mods)[1:] # do not include root module (LightningModule) return list(mods) @property @@ -225,11 +250,13 @@ def param_nums(self) -> List[int]: @property def total_parameters(self) -> int: - return sum(p.numel() for p in self._model.parameters()) + return sum(p.numel() if not _is_lazy_weight_tensor(p) else 0 for p in self._model.parameters()) @property def trainable_parameters(self) -> int: - return sum(p.numel() for p in self._model.parameters() if p.requires_grad) + return sum( + p.numel() if not _is_lazy_weight_tensor(p) else 0 for p in self._model.parameters() if p.requires_grad + ) @property def model_size(self) -> float: @@ -242,6 +269,12 @@ def summarize(self) -> Dict[str, LayerSummary]: self._forward_example_input() for layer in summary.values(): layer.detach_hook() + + if self._max_depth >= 1: + # remove summary entries with depth > max_depth + for k in [k for k in summary if k.count(".") >= self._max_depth]: + del summary[k] + return summary def _forward_example_input(self) -> None: @@ -438,3 +471,15 @@ def get_human_readable_count(number: int) -> str: return f"{int(number):,d} {labels[index]}" return f"{number:,.1f} {labels[index]}" + + +def _is_lazy_weight_tensor(p: Tensor) -> bool: + if _TORCH_GREATER_EQUAL_1_8: + from torch.nn.parameter import UninitializedParameter + if isinstance(p, UninitializedParameter): + warning_cache.warn( + "A layer with UninitializedParameter was found. " + "Thus, the total number of parameters detected may be inaccurate." + ) + return True + return False diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index 174631ae73e8b..3572a79b9bd84 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -120,7 +120,7 @@ def toggle_model(self, sync_grad: bool = True): during the accumulation phase. Setting `sync_grad` to False will block this synchronization and improve performance. 
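+
+        For illustration, a manual-optimization step could use it as in the following sketch
+        (``opt_g`` and the generator loss computation are hypothetical)::
+
+            opt_g, opt_d = self.optimizers()
+            with opt_g.toggle_model(sync_grad=False):
+                # gradients accumulate locally; the DDP sync for this backward is skipped
+                loss_g = self.generator_loss(batch)
+                self.manual_backward(loss_g)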
""" - with self._trainer.train_loop.block_ddp_sync_behaviour(not sync_grad): + with self._trainer.fit_loop.epoch_loop.batch_loop.block_ddp_sync_behaviour(not sync_grad): self._toggle_model() yield self._untoggle_model() diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py index ffa9b0a1359ee..74862735aba61 100644 --- a/pytorch_lightning/core/saving.py +++ b/pytorch_lightning/core/saving.py @@ -202,7 +202,17 @@ def _load_model_state(cls, checkpoint: Dict[str, Any], strict: bool = True, **cl model.on_load_checkpoint(checkpoint) # load the state_dict on the model automatically - model.load_state_dict(checkpoint['state_dict'], strict=strict) + keys = model.load_state_dict(checkpoint['state_dict'], strict=strict) + + if not strict: + if keys.missing_keys: + rank_zero_warn( + f"Found keys that are in the model state dict but not in the checkpoint: {keys.missing_keys}" + ) + if keys.unexpected_keys: + rank_zero_warn( + f"Found keys that are not in the model state dict but in the checkpoint: {keys.unexpected_keys}" + ) return model diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py deleted file mode 100644 index c55fb14a7eed4..0000000000000 --- a/pytorch_lightning/core/step_result.py +++ /dev/null @@ -1,613 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Result class for easier logging and epoch-wise reduction.""" - -from copy import copy -from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Sequence, Tuple, Union - -import torch -from torch import Tensor -from torchmetrics import Metric - - -class Result(Dict): - - def __init__(self) -> None: - super().__init__() - self['meta'] = {'_internal': {'_reduce_on_epoch': False, 'batch_sizes': []}} - - def __getitem__(self, key: Union[str, Any]) -> Any: - try: - return super().__getitem__(key) - except KeyError: - return super().__getitem__(f'{key}_step') - - def __getattr__(self, key: str) -> Any: - try: - if key == 'batch_log_metrics': - return self.get_batch_log_metrics() - elif key == 'batch_pbar_metrics': - return self.get_batch_pbar_metrics() - elif key == 'epoch_log_metrics': - return self.get_epoch_log_metrics() - elif key == 'epoch_pbar_metrics': - return self.get_epoch_pbar_metrics() - else: - return self[key] - except KeyError: - return None - - def __setattr__(self, key: str, val: Union[Tensor, Any]): - # ensure tensors are detached - if isinstance(val, torch.Tensor) and key != 'minimize': - val = val.detach() - self[key] = val - - def __getstate__(self): - return self - - def __setstate__(self, d): - self.update(d) - - @property - def minimize(self) -> Optional[Tensor]: - return self.get('minimize', None) - - @minimize.setter - def minimize(self, val: Optional[torch.Tensor]) -> None: - if val is not None: - if not isinstance(val, Tensor): - raise ValueError(f"`Result.minimize` must be a `torch.Tensor`, found: {val}") - if val.grad_fn is None: - raise RuntimeError("`Result.minimize` must have a `grad_fn`") - self['minimize'] = val - - def log( - self, - name: str, - value: Any, - prog_bar: bool = False, - logger: bool = True, - on_step: bool = False, - on_epoch: bool = True, - reduce_fx: Callable = torch.mean, - enable_graph: bool = False, - dataloader_idx: Optional[int] = None, - ): - # no metrics should be logged with graphs - if not enable_graph and isinstance(value, torch.Tensor): - value = value.detach() - - if isinstance(value, torch.Tensor) and value.device.type == "xla": - value = value.cpu() - - if 'meta' not in self: - self.__setitem__('meta', {}) - - # if user requests both step and epoch, then we split the metric in two automatically - # one will be logged per step. 
the other per epoch - was_forked = False - if on_step and on_epoch: - was_forked = True - - # set step version - step_name = f'{name}_step' - - self.__set_meta( - step_name, - value, - prog_bar, - logger, - on_step=True, - on_epoch=False, - reduce_fx=reduce_fx, - forked=False, - dataloader_idx=dataloader_idx, - ) - - self.__setitem__(step_name, value) - - # set epoch version - epoch_name = f'{name}_epoch' - - self.__set_meta( - epoch_name, - value, - prog_bar, - logger, - on_step=False, - on_epoch=True, - reduce_fx=reduce_fx, - forked=False, - dataloader_idx=dataloader_idx, - ) - self.__setitem__(epoch_name, value) - - # always log the original metric - self.__set_meta( - name, - value, - prog_bar, - logger, - on_step, - on_epoch, - reduce_fx, - forked=was_forked, - dataloader_idx=dataloader_idx, - ) - - # set the value - self.__setitem__(name, value) - - def __set_meta( - self, - name: str, - value: Any, - prog_bar: bool, - logger: bool, - on_step: bool, - on_epoch: bool, - reduce_fx: Callable, - forked: bool, - dataloader_idx: Union[int, None], - ): - # set the meta for the item - meta_value = value - meta = dict( - prog_bar=prog_bar, - logger=logger, - on_step=on_step, - on_epoch=on_epoch, - reduce_fx=reduce_fx, - value=meta_value, - forked=forked, - dataloader_idx=dataloader_idx, - ) - - self['meta'][name] = meta - - # track whether any input requires reduction on epoch end - _internal = self['meta']['_internal'] - _internal['_reduce_on_epoch'] = max(_internal['_reduce_on_epoch'], on_epoch) - - def track_batch_size(self, batch): - batch_size = Result.extract_batch_size(batch) - Result.attach_batch_size(batch_size, self) - - @staticmethod - def extract_batch_size(batch): - try: - batch_size = Result.unpack_batch_size(batch) - except RecursionError: - batch_size = 1 - return batch_size - - @staticmethod - def attach_batch_size(batch_size: Union[int, None], result: 'Result') -> None: - if batch_size is not None: - meta = result['meta'] - meta['_internal']['batch_sizes'].append(batch_size) - - def get_batch_sizes(self): - meta = self['meta'] - return torch.tensor(meta['_internal']['batch_sizes']) - - def _add_dataloader_idx(self, k: str, dataloader_idx: Union[int, None], add_dataloader_idx: bool) -> str: - if dataloader_idx is not None and add_dataloader_idx: - return f"{k}/dataloader_idx_{dataloader_idx}" - return k - - def get_batch_log_metrics(self, include_forked_originals=True, add_dataloader_idx=False) -> dict: - """ - Gets the metrics to log at the end of the batch step - - """ - result = {} - - meta = self['meta'] - for k, options in meta.items(): - if k == '_internal': - continue - - if options['forked'] and not include_forked_originals: - continue - - dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) - - if options['logger'] and options['on_step']: - if isinstance(self[k], Metric) and self[k]._forward_cache is not None: - result[dl_key] = self[k]._forward_cache.detach() - else: - result[dl_key] = self[k] - - return result - - def get_epoch_log_metrics(self, add_dataloader_idx=False) -> dict: - """ - Gets the metrics to log at the end of epoch - """ - result = {} - meta = self['meta'] - for k, options in meta.items(): - if k == '_internal': - continue - - if options['forked']: - continue - - dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) - - if options['logger'] and options['on_epoch']: - if isinstance(self[k], Metric): - result[dl_key] = self[k].compute().detach() - else: - result[dl_key] = self[k] - - if k in 
self and not options['on_epoch'] and isinstance(self[k], Metric): - # compute for reuse later - self[k].compute() - - return result - - def get_epoch_pbar_metrics(self, add_dataloader_idx=False): - """ - Gets the metrics to log at the end of epoch - """ - result = {} - - meta = self['meta'] - for k, options in meta.items(): - if k == '_internal': - continue - - if options['forked']: - continue - - dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) - - if options['prog_bar'] and options['on_epoch']: - if isinstance(self[k], Metric): - result[dl_key] = self[k].compute().detach() - else: - result[dl_key] = self[k] - - if k in self and not options['on_epoch'] and isinstance(self[k], Metric): - # compute for reuse later - self[k].compute() - - return result - - def get_forked_metrics(self, add_dataloader_idx=False): - """ - Gets the metrics to log at the end of epoch - """ - result = {} - - meta = self['meta'] - for k, options in meta.items(): - if k == '_internal': - continue - - dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) - - if options['forked']: - if isinstance(self[k], Metric): - result[dl_key] = self[k].compute().detach() - else: - result[dl_key] = self[k] - - return result - - def get_batch_pbar_metrics(self, include_forked_originals=True, add_dataloader_idx=False): - """ - Gets the metrics to log at the end of the batch step - """ - result = {} - - meta = self['meta'] - for k, options in meta.items(): - if k == '_internal': - continue - - if options['forked'] and not include_forked_originals: - continue - - dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) - - if options['prog_bar'] and options['on_step']: - if isinstance(self[k], Metric) and self[k]._forward_cache is not None: - result[dl_key] = self[k]._forward_cache - else: - result[dl_key] = self[k] - - return result - - def detach(self) -> 'Result': - for k, v in self.items(): - if isinstance(v, torch.Tensor): - self.__setitem__(k, v.detach()) - return self - - def to(self, *args, **kwargs) -> 'Result': - """Move all self attributes to the given device.""" - for k, v in self.items(): - if isinstance(v, torch.Tensor): - self.__setitem__(k, v.to(*args, **kwargs)) - return self - - def cpu(self) -> 'Result': - """Move all self attributes to CPU.""" - return self.to(torch.device("cpu")) - - def __repr__(self): - self_copy = self.copy() - - if 'meta' in self_copy: - del self_copy['meta'] - - return str(self_copy) - - def __str__(self): - copy = self.copy() - del copy['meta'] - - return str(copy) - - def __copy__(self): - newone = type(self)() - for k, v in self.items(): - if isinstance(v, torch.Tensor): - v = v.detach() - newone[k] = copy(v) - return newone - - @staticmethod - def unpack_batch_size(sample): - """ - Recursively unpack sample to find a torch.Tensor. - returns len(tensor) when found, or 1 when it hits an empty or non iterable. 
- """ - if isinstance(sample, torch.Tensor): - size = sample.size(0) - elif isinstance(sample, str): - return len(sample) - elif isinstance(sample, dict): - sample = next(iter(sample.values()), 1) - size = Result.unpack_batch_size(sample) - elif isinstance(sample, Iterable): - sample = next(iter(sample), 1) - size = Result.unpack_batch_size(sample) - else: - size = 1 - return size - - @classmethod - def reduce_on_epoch_end(cls, outputs): - # get the batch sizes for all outputs - batch_sizes = [] - meta = {} - for x in outputs: - batch_sizes.append(x.get_batch_sizes()) - meta.update(x['meta']) - - batch_sizes = torch.stack(batch_sizes).view(-1) - - result = cls() - result = recursive_gather(outputs, result) - recursive_stack(result) - - for k, option in meta.items(): - if k == '_internal' or isinstance(result[k], Metric): - continue - - # for forked metrics don't reduce, just take the last val - if option['forked']: - result[k] = choose_last(result[k]) - continue - - if option['on_epoch']: - fx = option['reduce_fx'] - if fx == torch.mean: - if isinstance(result[k], list): - result[k] = torch.tensor(result[k]).float() - try: - reduced_val = weighted_mean(result[k], batch_sizes) - # todo: specify the expected Exceptions to come - except Exception: - reduced_val = torch.mean(result[k]) - else: - reduced_val = fx(result[k]) - - result[k] = reduced_val - else: - del result[k] - - result['meta'] = meta - return result - - @classmethod - def reduce_across_time(cls, time_outputs): - # auto-reduce across time for tbptt - meta = time_outputs[0]['meta'] - - result = cls() - result = recursive_gather(time_outputs, result) - recursive_stack(result) - - for k, value in result.items(): - if k in ['meta', 'extra'] or isinstance(value, Metric): - continue - - if isinstance(value, list): - value = torch.tensor(value) - - if isinstance(value, dict): - # TODO: recursive reduce: - _recursive_fx_apply(value, torch.mean) - else: - result[k] = torch.mean(value.float()) - - result['meta'] = meta - return result - - def dp_reduce(self): - for k, value in self.items(): - if k == 'meta' or isinstance(value, Metric): - continue - - if isinstance(value, list): - value = torch.tensor(value) - - self[k] = value.mean(dim=-1) - - @property - def should_reduce_on_epoch_end(self) -> bool: - return self['meta']['_internal']['_reduce_on_epoch'] - - def rename_keys(self, map_dict: dict): - """ - Maps key values to the target values. Useful when renaming variables in mass. 
- - Args: - map_dict: - """ - meta = self.meta - for source, dest in map_dict.items(): - # map the main keys - self[dest] = self[source] - del self[source] - - # map meta - meta[dest] = meta[source] - del meta[source] - - def reset(self) -> None: - """ - Call at the end of epoch to reset all metric objects - """ - for k, value in self.items(): - if isinstance(value, Metric): - value.reset() - - -def choose_last(x): - if isinstance(x, (torch.Tensor, list)): - return x[-1] - if isinstance(x, dict): - for k, v in x.items(): - x[k] = x[k][-1] - - -def recursive_gather(outputs: Sequence[dict], result: Optional[MutableMapping] = None) -> Optional[MutableMapping]: - for out in outputs: - if 'meta' in out: - del out['meta'] - - for k, v in out.items(): - # support manual opt where the user does not return a minimize key - if k == 'minimize' and v is None: - continue - - if isinstance(v, dict): - in_d = result.get(k, {}) - v = recursive_gather([v], in_d) - result[k] = v - else: - if isinstance(v, Metric): - # if v is a metric, just keep one of them, - # don't keep on adding a list of them - result[k] = v - else: - if k not in result: - result[k] = [] - result[k].append(v) - - return result - - -def recursive_stack(result: MutableMapping): - for k, v in result.items(): - if isinstance(v, dict): - recursive_stack(v) - - result[k] = collate_tensors(v) - - -def _recursive_fx_apply(input: dict, fx): - for k, v in input.items(): - if isinstance(v, list): - v = torch.tensor(v) - - if isinstance(v, torch.Tensor): - v = fx(v.float()) - input[k] = v - else: - _recursive_fx_apply(v, fx) - - -def collate_tensors(items: Union[List, Tuple]) -> Union[Tensor, List, Tuple]: - if not items or not isinstance(items, (list, tuple)) or any(not isinstance(item, Tensor) for item in items): - # items is not a sequence, empty, or contains non-tensors - return items - - if all(item.ndim == 0 for item in items): - # all tensors are scalars, we need to stack - return torch.stack(items) - - if all(item.ndim >= 1 and item.shape[1:] == items[0].shape[1:] for item in items): - # we can concatenate along the first dimension - return torch.cat(items) - - return items - - -def weighted_mean(result, weights): - - if isinstance(result, dict): - _process_dataloader_aggregated_steps(result, weights) - else: - if isinstance(result, list): - result = torch.tensor(result) - - weights = weights.to(result.device)[:result.size(0)] - numerator = torch.dot(result.float(), weights.transpose(-1, 0).float()) - result = numerator / weights.sum().float() - return result - - -def _process_dataloader_aggregated_steps(result, weights): - internal_keys = {'meta'} - - moved = False - - for k, v in result.items(): - if k in internal_keys: - continue - - # make sure v is a tensor - if not isinstance(v, torch.Tensor): - v = torch.tensor(v) - - # move to memory only once - if not moved: - weights = weights.to(v.device) - moved = True - - # move weights to same device as value to reduce - weights_t = weights[:v.size(0)] - - # weighted mean - numerator = torch.dot(v.float(), weights_t.transpose(-1, 0).float()) - v = numerator / weights.sum().float() - result[k] = v diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index 035a42338fe68..d6875d225790c 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -20,11 +20,13 @@ from argparse import Namespace from functools import wraps from typing import Any, Callable, Dict, Iterable, List, Mapping, MutableMapping, Optional, Sequence, Tuple, Union +from 
weakref import ReferenceType
 
 import numpy as np
 import torch
 
-from pytorch_lightning.core.lightning import LightningModule
+import pytorch_lightning as pl
+from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
 from pytorch_lightning.utilities import rank_zero_only
 
@@ -71,6 +73,15 @@ def __init__(
         self._agg_key_funcs = agg_key_funcs if agg_key_funcs else {}
         self._agg_default_func = agg_default_func
 
+    def after_save_checkpoint(self, checkpoint_callback: 'ReferenceType[ModelCheckpoint]') -> None:
+        """
+        Called after the model checkpoint callback saves a new checkpoint.
+
+        Args:
+            checkpoint_callback: the model checkpoint callback instance
+        """
+        pass
+
     def update_agg_funcs(
         self,
         agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None,
@@ -289,7 +300,7 @@ def log_hyperparams(self, params: argparse.Namespace, *args, **kwargs):
             kwargs: Optional keywoard arguments, depends on the specific logger being used
         """
 
-    def log_graph(self, model: LightningModule, input_array=None) -> None:
+    def log_graph(self, model: 'pl.LightningModule', input_array=None) -> None:
         """
         Record model graph
 
@@ -355,7 +366,11 @@ def __init__(self, logger_iterable: Iterable[LightningLoggerBase]):
         self._logger_iterable = logger_iterable
 
     def __getitem__(self, index: int) -> LightningLoggerBase:
-        return [logger for logger in self._logger_iterable][index]
+        return list(self._logger_iterable)[index]
+
+    def after_save_checkpoint(self, checkpoint_callback: 'ReferenceType[ModelCheckpoint]') -> None:
+        for logger in self._logger_iterable:
+            logger.after_save_checkpoint(checkpoint_callback)
 
     def update_agg_funcs(
         self,
@@ -381,7 +396,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None:
         for logger in self._logger_iterable:
             logger.log_hyperparams(params)
 
-    def log_graph(self, model: LightningModule, input_array=None) -> None:
+    def log_graph(self, model: 'pl.LightningModule', input_array=None) -> None:
         for logger in self._logger_iterable:
             logger.log_graph(model, input_array)
 
diff --git a/pytorch_lightning/loggers/comet.py b/pytorch_lightning/loggers/comet.py
index 148e512f5e439..498a16a9daa29 100644
--- a/pytorch_lightning/loggers/comet.py
+++ b/pytorch_lightning/loggers/comet.py
@@ -24,7 +24,7 @@
 import torch
 from torch import is_tensor
 
-from pytorch_lightning.core.lightning import LightningModule
+import pytorch_lightning as pl
 from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment
 from pytorch_lightning.utilities import _module_available, rank_zero_only
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -318,6 +318,6 @@ def __getstate__(self):
         state["_experiment"] = None
         return state
 
-    def log_graph(self, model: LightningModule, input_array=None) -> None:
+    def log_graph(self, model: 'pl.LightningModule', input_array=None) -> None:
         if self._experiment is not None:
             self._experiment.set_model_graph(model)
diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py
index 4df672fa6e3b5..754a7cf892060 100644
--- a/pytorch_lightning/loggers/csv_logs.py
+++ b/pytorch_lightning/loggers/csv_logs.py
@@ -29,7 +29,8 @@
 from pytorch_lightning.core.saving import save_hparams_to_yaml
 from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment
-from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn
+from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities.distributed import rank_zero_only
 
 log =
logging.getLogger(__name__) diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index aed09f11464f8..89b9628534e09 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -278,15 +278,13 @@ def save_dir(self) -> Optional[str]: def name(self) -> str: if self.offline_mode: return 'offline-name' - else: - return self.experiment.name + return self.experiment.name @property def version(self) -> str: if self.offline_mode: return 'offline-id-1234' - else: - return self.experiment.id + return self.experiment.id @rank_zero_only def log_metric( diff --git a/pytorch_lightning/loggers/tensorboard.py b/pytorch_lightning/loggers/tensorboard.py index 94268f6063f51..ea0937016550d 100644 --- a/pytorch_lightning/loggers/tensorboard.py +++ b/pytorch_lightning/loggers/tensorboard.py @@ -25,7 +25,7 @@ from torch.utils.tensorboard import SummaryWriter from torch.utils.tensorboard.summary import hparams -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.core.saving import save_hparams_to_yaml from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_only, rank_zero_warn @@ -112,8 +112,7 @@ def root_dir(self) -> str: """ if self.name is None or len(self.name) == 0: return self.save_dir - else: - return os.path.join(self.save_dir, self.name) + return os.path.join(self.save_dir, self.name) @property def log_dir(self) -> str: @@ -223,7 +222,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> raise ValueError(m) from ex @rank_zero_only - def log_graph(self, model: LightningModule, input_array=None): + def log_graph(self, model: 'pl.LightningModule', input_array=None): if self._log_graph: if input_array is None: input_array = model.example_input_array @@ -267,14 +266,16 @@ def version(self) -> int: return self._version def _get_next_version(self): - root_dir = os.path.join(self.save_dir, self.name) + root_dir = self.root_dir - if not self._fs.isdir(root_dir): + try: + listdir_info = self._fs.listdir(root_dir) + except OSError: log.warning('Missing logger folder: %s', root_dir) return 0 existing_versions = [] - for listing in self._fs.listdir(root_dir): + for listing in listdir_info: d = listing["name"] bn = os.path.basename(d) if self._fs.isdir(d) and bn.startswith("version_"): diff --git a/pytorch_lightning/loggers/test_tube.py b/pytorch_lightning/loggers/test_tube.py index 84f231b0f16d7..1650ab8f4ba49 100644 --- a/pytorch_lightning/loggers/test_tube.py +++ b/pytorch_lightning/loggers/test_tube.py @@ -18,10 +18,10 @@ from argparse import Namespace from typing import Any, Dict, Optional, Union -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from pytorch_lightning.utilities import _module_available -from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities import _module_available, rank_zero_warn +from pytorch_lightning.utilities.distributed import rank_zero_only _TESTTUBE_AVAILABLE = _module_available("test_tube") @@ -153,7 +153,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> self.experiment.log(metrics, global_step=step) @rank_zero_only - def log_graph(self, model: LightningModule, input_array=None): + def log_graph(self, 
model: 'pl.LightningModule', input_array=None):
         if self._log_graph:
             if input_array is None:
                 input_array = model.example_input_array
diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py
index 0f73153378ed4..5daf2176f3421 100644
--- a/pytorch_lightning/loggers/wandb.py
+++ b/pytorch_lightning/loggers/wandb.py
@@ -15,20 +15,26 @@
 Weights and Biases Logger
 -------------------------
 """
+import operator
 import os
 from argparse import Namespace
+from pathlib import Path
 from typing import Any, Dict, Optional, Union
+from weakref import ReferenceType
 
 import torch.nn as nn
 
+from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
 from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment
 from pytorch_lightning.utilities import _module_available, rank_zero_only
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _compare_version
 from pytorch_lightning.utilities.warnings import WarningCache
 
 warning_cache = WarningCache()
 
 _WANDB_AVAILABLE = _module_available("wandb")
+_WANDB_GREATER_EQUAL_0_10_22 = _compare_version("wandb", operator.ge, "0.10.22")
 
 try:
     import wandb
@@ -40,7 +46,7 @@ class WandbLogger(LightningLoggerBase):
     r"""
-    Log using `Weights and Biases `_.
+    Log using `Weights and Biases `_.
 
     Install it with pip:
 
@@ -56,7 +62,15 @@ class WandbLogger(LightningLoggerBase):
         version: Same as id.
         anonymous: Enables or explicitly disables anonymous logging.
         project: The name of the project to which this run will belong.
-        log_model: Save checkpoints in wandb dir to upload on W&B servers.
+        log_model: Log checkpoints created by :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint`
+            as W&B artifacts.
+
+            * if ``log_model == 'all'``, checkpoints are logged during training.
+            * if ``log_model == True``, checkpoints are logged at the end of training, except when
+              :paramref:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint.save_top_k` ``== -1``
+              which also logs every checkpoint during training.
+            * if ``log_model == False`` (default), no checkpoint is logged.
+
         prefix: A string to put at the beginning of metric keys.
         experiment: WandB experiment object. Automatically set when creating a run.
         \**kwargs: Arguments passed to :func:`wandb.init` like `entity`, `group`, `tags`, etc.
@@ -71,15 +85,16 @@ class WandbLogger(LightningLoggerBase):
         from pytorch_lightning.loggers import WandbLogger
         from pytorch_lightning import Trainer
 
-        wandb_logger = WandbLogger()
+
+        # instrument experiment with W&B
+        wandb_logger = WandbLogger(project='MNIST', log_model='all')
         trainer = Trainer(logger=wandb_logger)
 
-    Note: When logging manually through `wandb.log` or `trainer.logger.experiment.log`,
-        make sure to use `commit=False` so the logging step does not increase.
+        # log gradients and model topology
+        wandb_logger.watch(model)
 
     See Also:
-        - `Tutorial `__
-          on how to use W&B with PyTorch Lightning
+        - `Demo in Google Colab `__ with model logging
         - `W&B Documentation `__
 
     """
@@ -114,10 +129,17 @@ def __init__(
                 'Hint: Set `offline=False` to log your model.'
             )
 
-        if sync_step is not None:
+        if log_model and not _WANDB_GREATER_EQUAL_0_10_22:
             warning_cache.warn(
+                f'Providing log_model={log_model} requires wandb version >= 0.10.22'
+                ' for logging associated model metadata.\n'
+                'Hint: Upgrade with `pip install --upgrade wandb`.'
+            )
+
+        if sync_step is not None:
+            warning_cache.deprecation(
                 "`WandbLogger(sync_step=(True|False))` is deprecated in v1.2.1 and will be removed in v1.5."
-                " Metrics are now logged separately and automatically synchronized.", DeprecationWarning
+                " Metrics are now logged separately and automatically synchronized."
             )
 
         super().__init__()
@@ -125,6 +147,8 @@ def __init__(
         self._log_model = log_model
         self._prefix = prefix
         self._experiment = experiment
+        self._logged_model_time = {}
+        self._checkpoint_callback = None
         # set wandb init arguments
         anonymous_lut = {True: 'allow', False: None}
         self._wandb_init = dict(
@@ -168,10 +192,6 @@ def experiment(self) -> Run:
                 os.environ['WANDB_MODE'] = 'dryrun'
             self._experiment = wandb.init(**self._wandb_init) if wandb.run is None else wandb.run
 
-            # save checkpoints in wandb dir to upload on W&B servers
-            if self._save_dir is None:
-                self._save_dir = self._experiment.dir
-
             # define default x-axis (for latest wandb versions)
             if getattr(self._experiment, "define_metric", None):
                 self._experiment.define_metric("trainer/global_step")
@@ -213,8 +233,49 @@ def version(self) -> Optional[str]:
         # don't create an experiment if we don't have one
         return self._experiment.id if self._experiment else self._id
 
+    def after_save_checkpoint(self, checkpoint_callback: 'ReferenceType[ModelCheckpoint]') -> None:
+        # log checkpoints as artifacts
+        if self._log_model == 'all' or self._log_model is True and checkpoint_callback.save_top_k == -1:
+            self._scan_and_log_checkpoints(checkpoint_callback)
+        elif self._log_model is True:
+            self._checkpoint_callback = checkpoint_callback
+
     @rank_zero_only
     def finalize(self, status: str) -> None:
-        # upload all checkpoints from saving dir
-        if self._log_model:
-            wandb.save(os.path.join(self.save_dir, "*.ckpt"))
+        # log checkpoints as artifacts
+        if self._checkpoint_callback:
+            self._scan_and_log_checkpoints(self._checkpoint_callback)
+
+    def _scan_and_log_checkpoints(self, checkpoint_callback: 'ReferenceType[ModelCheckpoint]') -> None:
+        # get checkpoints to be saved with associated score
+        checkpoints = {
+            checkpoint_callback.last_model_path: checkpoint_callback.current_score,
+            checkpoint_callback.best_model_path: checkpoint_callback.best_model_score,
+            **checkpoint_callback.best_k_models
+        }
+        checkpoints = sorted([(Path(p).stat().st_mtime, p, s) for p, s in checkpoints.items() if Path(p).is_file()])
+        checkpoints = [
+            c for c in checkpoints if c[1] not in self._logged_model_time.keys() or self._logged_model_time[c[1]] < c[0]
+        ]
+
+        # log iteratively all new checkpoints
+        for t, p, s in checkpoints:
+            metadata = {
+                'score': s,
+                'original_filename': Path(p).name,
+                'ModelCheckpoint': {
+                    k: getattr(checkpoint_callback, k)
+                    for k in [
+                        'monitor', 'mode', 'save_last', 'save_top_k', 'save_weights_only', '_every_n_train_steps',
+                        '_every_n_val_epochs'
+                    ]
+                    # ensure it does not break if `ModelCheckpoint` args change
+                    if hasattr(checkpoint_callback, k)
+                }
+            } if _WANDB_GREATER_EQUAL_0_10_22 else None
+            artifact = wandb.Artifact(name=f"model-{self.experiment.id}", type="model", metadata=metadata)
+            artifact.add_file(p, name='model.ckpt')
+            aliases = ["latest", "best"] if p == checkpoint_callback.best_model_path else ["latest"]
+            self.experiment.log_artifact(artifact, aliases=aliases)
+            # remember logged models - timestamp needed in case filename didn't change (last.ckpt or custom name)
+            self._logged_model_time[p] = t
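[For orientation, a minimal usage sketch of the artifact-based checkpoint logging introduced above. Illustrative only, not part of the patch; `MyModel` and `train_loader` are hypothetical placeholders.]

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint
    from pytorch_lightning.loggers import WandbLogger

    # `log_model='all'` uploads every checkpoint as it is written;
    # `log_model=True` defers the upload to `finalize` at the end of training
    wandb_logger = WandbLogger(project='MNIST', log_model='all')
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', save_top_k=2)
    trainer = Trainer(logger=wandb_logger, callbacks=[checkpoint_callback])
    trainer.fit(MyModel(), train_loader)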
diff --git a/pytorch_lightning/loops/__init__.py b/pytorch_lightning/loops/__init__.py
new file mode 100644
index 0000000000000..b7eb47167d26f
--- /dev/null
+++ b/pytorch_lightning/loops/__init__.py
@@ -0,0 +1,19 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pytorch_lightning.loops.base import Loop  # noqa: F401
+from pytorch_lightning.loops.batch import TrainingBatchLoop  # noqa: F401
+from pytorch_lightning.loops.dataloader import DataLoaderLoop, EvaluationLoop, PredictionLoop  # noqa: F401
+from pytorch_lightning.loops.epoch import EvaluationEpochLoop, PredictionEpochLoop, TrainingEpochLoop  # noqa: F401
+from pytorch_lightning.loops.fit_loop import FitLoop  # noqa: F401
diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py
new file mode 100644
index 0000000000000..18657fe5dbaff
--- /dev/null
+++ b/pytorch_lightning/loops/base.py
@@ -0,0 +1,158 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+from deprecate import void
+
+import pytorch_lightning as pl
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+
+
+class Loop(ABC):
+    """
+    Basic Loops interface. All classes derived from this must implement the following properties and methods:
+
+        * :attr:`done` (property): Condition to break the loop
+        * :attr:`reset` (method): Resets the internal state between multiple calls of :attr:`run`
+        * :attr:`advance` (method): Implements one step of the loop
+
+    This class implements the following loop structure:
+
+    .. code-block:: python
+
+        on_run_start()
+
+        while not done:
+            on_advance_start()
+            advance()
+            on_advance_end()
+
+        on_run_end()
+    """
+
+    def __init__(self) -> None:
+        self.iteration_count: int = 0
+        self.trainer: Optional['pl.Trainer'] = None
+        self._restarting = False
+
+    @property
+    def restarting(self) -> bool:
+        return self._restarting
+
+    @restarting.setter
+    def restarting(self, restarting: bool) -> None:
+        self._restarting = restarting
+
+    @property
+    @abstractmethod
+    def done(self) -> bool:
+        """Property indicating when the loop is finished"""
+
+    @property
+    def skip(self) -> bool:
+        """Determine whether to return immediately from the call to :meth:`run`."""
+        return False
+
+    def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
+        """Connects Loop with all the necessary things like connectors and accelerators."""
+        # TODO(@justusschock): Make the trainer a weakref/proxy
+        if not isinstance(trainer, pl.Trainer):
+            raise MisconfigurationException(
+                f"Loop {self.__class__.__name__} should be connected to a `Trainer`, found: {trainer}."
+            )
+        self.trainer = trainer
+
+    def on_skip(self) -> Optional[Any]:
+        """
+        The function to run when :meth:`run` should be skipped, determined by the condition in :attr:`skip`.
+
+        Returns:
+            the default output value of :meth:`on_run_end`
+        """
+
+    def run(self, *args: Any, **kwargs: Any) -> Optional[Any]:
+        """
+        The main entry point to the loop.
+
+        Will frequently check the :attr:`done` condition and call :attr:`advance`
+        until :attr:`done` evaluates to ``True``.
+
+        Returns:
+            the output of :attr:`on_run_end` (often outputs collected from each step of the loop)
+        """
+        if self.skip:
+            return self.on_skip()
+
+        if self.restarting:
+            self.restore()
+            self.restarting = False
+        else:
+            self.reset()
+
+        self.on_run_start(*args, **kwargs)
+
+        while not self.done:
+            try:
+                self.on_advance_start(*args, **kwargs)
+                self.advance(*args, **kwargs)
+                self.on_advance_end()
+                self.iteration_count += 1
+            except StopIteration:
+                break
+
+        output = self.on_run_end()
+        return output
+
+    def restore(self) -> None:
+        """Restore the internal state of the loop at the beginning of run if restarting is ``True``."""
+
+    @abstractmethod
+    def reset(self) -> None:
+        """Resets the internal state of the loop at the beginning of each call to :attr:`run`."""
+
+    def on_run_start(self, *args: Any, **kwargs: Any) -> None:
+        """
+        Hook to be called as the first thing after entering :attr:`run` (except the state reset).
+
+        Accepts all arguments passed to :attr:`run`.
+        """
+        void(*args, **kwargs)
+
+    def on_advance_start(self, *args: Any, **kwargs: Any) -> None:
+        """
+        Hook to be called each time before :attr:`advance` is called. Accepts all arguments passed to :attr:`run`.
+        """
+        void(*args, **kwargs)
+
+    @abstractmethod
+    def advance(self, *args: Any, **kwargs: Any) -> None:
+        """Performs a single step. Accepts all arguments passed to :attr:`run`."""
+
+    def on_advance_end(self) -> None:
+        """Hook to be called each time after :attr:`advance` is called."""
+
+    def on_run_end(self) -> Any:
+        """Hook to be called at the end of the run. Its return value is returned from :attr:`run`."""
+
+    def teardown(self) -> None:
+        """Use to release memory etc."""
+
+    def load_state_dict(self, state_dict: Dict) -> None:
+        """Restore the loop state from the provided state_dict."""
+
+    def state_dict(self) -> Dict:
+        """Return the loop's current state."""
+        return {}
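[For readers new to the abstraction, a toy subclass — illustrative only, not part of this patch — showing the `done`/`reset`/`advance` contract defined above.]

    from pytorch_lightning.loops.base import Loop

    class CountLoop(Loop):
        """Collects iteration indices until a limit is reached."""

        def __init__(self, limit: int):
            super().__init__()
            self.limit = limit
            self.collected = []

        @property
        def done(self) -> bool:
            # condition to break the loop
            return self.iteration_count >= self.limit

        def reset(self) -> None:
            # fresh state for every call to `run`
            self.iteration_count = 0
            self.collected = []

        def advance(self) -> None:
            # one step of the loop; `run` increments `iteration_count`
            self.collected.append(self.iteration_count)

        def on_run_end(self):
            # the return value of `run`
            return self.collected

    assert CountLoop(3).run() == [0, 1, 2]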
""" + + def __init__(self) -> None: + super().__init__() + self.accumulated_loss: Optional[Tensor] = None + self.batch_outputs: Optional[List[List[STEP_OUTPUT]]] = None + self.running_loss: TensorRunningAccum = TensorRunningAccum(window_length=20) + self.batch_idx: int = 0 + self.split_idx: Optional[int] = None + self._warning_cache: WarningCache = WarningCache() + + self._hiddens: Optional[Tensor] = None + self._optimizer_freq_cumsum: Optional[int] = None + self._remaining_splits: Optional[List[Any]] = None + self._skip_backward: bool = False + + @property + def done(self) -> bool: + """Returns if all batch splits have been processed already""" + return len(self._remaining_splits) == 0 + + @property + def optimizer_freq_cumsum(self) -> int: + """Returns the cumulated sum of optimizer frequencies""" + if self._optimizer_freq_cumsum is None: + self._optimizer_freq_cumsum = np.cumsum(self.trainer.optimizer_frequencies) + return self._optimizer_freq_cumsum + + def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict: + """Runs all the data splits and the ``on_batch_start`` and ``on_train_batch_start`` hooks + + Args: + batch: the current batch to run the train step on + batch_idx: the index of the current batch + dataloader_idx: the index of the dataloader producing the current batch + """ + if batch is None: + self._warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...") + return AttributeDict(signal=0, training_step_output=[[]]) + + # hook + self.trainer.logger_connector.on_batch_start() + response = self.trainer.call_hook("on_batch_start") + if response == -1: + return AttributeDict(signal=-1) + + # hook + response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx) + if response == -1: + return AttributeDict(signal=-1) + + super().run(batch, batch_idx, dataloader_idx) + output = AttributeDict(signal=0, training_step_output=self.batch_outputs) + self.batch_outputs = None # free memory + return output + + def reset(self) -> None: + """Resets the loop state""" + self._hiddens = None + self.batch_idx = 0 + self.batch_outputs = [[] for _ in range(len(self.trainer.optimizers))] + + def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int): + """Splits the data into tbptt splits + + Args: + batch: the current batch to run the trainstep on + batch_idx: the index of the current batch + dataloader_idx: the index of the dataloader producing the current batch + """ + void(batch_idx, dataloader_idx) + self._remaining_splits = list(enumerate(self._tbptt_split_batch(batch))) + + def advance(self, batch, batch_idx, dataloader_idx): + """Runs the train step together with optimization (if necessary) on the current batch split + + Args: + batch: the current batch to run the training on (this is not the split!) 
+            batch_idx: the index of the current batch
+            dataloader_idx: the index of the dataloader producing the current batch
+        """
+        void(batch, dataloader_idx)
+        split_idx, split_batch = self._remaining_splits.pop(0)
+        self.batch_idx = batch_idx
+        self.split_idx = split_idx
+
+        # let logger connector extract current batch size
+        self.trainer.logger_connector.on_train_split_start(batch_idx, split_idx, split_batch)
+
+        if self.trainer.lightning_module.automatic_optimization:
+            for opt_idx, optimizer in self.get_active_optimizers(batch_idx):
+                result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
+                if result:
+                    self.batch_outputs[opt_idx].append(result.training_step_output)
+        else:
+            # in manual optimization, there is no looping over optimizers
+            result = self._run_optimization(batch_idx, split_batch)
+            if result:
+                self.batch_outputs[0].append(result.training_step_output)
+
+    def num_active_optimizers(self, batch_idx: Optional[int] = None) -> int:
+        """Gets the number of active optimizers based on their frequency"""
+        return len(self.get_active_optimizers(batch_idx))
+
+    def _run_optimization(
+        self, batch_idx: int, split_batch: Any, opt_idx: int = 0, optimizer: Optional[torch.optim.Optimizer] = None
+    ):
+        """Runs closure (train step + backward) together with optimization if necessary.
+
+        Args:
+            batch_idx: the index of the current batch
+            split_batch: the current tbptt split of the whole batch
+            opt_idx: the index of the current optimizer
+            optimizer: the current optimizer
+        """
+        # TODO(@awaelchli): In v1.5, when optimizer_idx gets removed from training_step in manual_optimization, change
+        #   opt_idx=0 to opt_idx=None in the signature here
+
+        # toggle model params
+        self._run_optimization_start(opt_idx, optimizer)
+
+        result = AttributeDict()
+        closure = self._make_closure(split_batch, batch_idx, opt_idx, optimizer, self._hiddens, result)
+
+        if self.should_accumulate():
+            # For gradient accumulation
+
+            # -------------------
+            # calculate loss (train step + train step end)
+            # -------------------
+            # automatic_optimization=True: perform ddp sync only when performing optimizer_step
+            # automatic_optimization=False: don't block synchronization here
+            with self.block_ddp_sync_behaviour():
+                closure()
+
+        # ------------------------------
+        # BACKWARD PASS
+        # ------------------------------
+        # gradient update with accumulated gradients
+        else:
+            if self.trainer.lightning_module.automatic_optimization:
+                self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
+                if len(self.trainer.optimizers) > 1:
+                    # revert back to previous state
+                    self.trainer.lightning_module.untoggle_optimizer(opt_idx)
+            else:
+                result = self._training_step(split_batch, batch_idx, opt_idx, self._hiddens)
+
+        if not result:
+            # user decided to skip optimization
+            return result
+
+        # update running loss + reset accumulated loss
+        self._update_running_loss(result.loss)
+
+        self._process_closure_result(result)
+        return result
+
+    def _training_step_and_backward_closure(
+        self,
+        split_batch: Any,
+        batch_idx: int,
+        opt_idx: int,
+        optimizer: Optimizer,
+        hiddens: Tensor,
+        return_result: AttributeDict,
+    ) -> Optional[Tensor]:
+        """Closure for training step and backward
+
+        Args:
+            split_batch: the current tbptt split of the batch
+            batch_idx: the index of the current batch
+            opt_idx: the index of the current optimizer
+            optimizer: the current optimizer
+            hiddens: the hidden state of the recurrent net
+            return_result: the storage of the train step results
+        """
+
+        result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
+        if result is not None:
+            return_result.update(result)
+            return return_result.loss
+
+    def _make_closure(self, *closure_args: Any, **closure_kwargs: Any) -> Callable:
+        """ Wraps the training step closure into a partial object which will be called within ``optimizer.step``. """
+        partial_func = partial(self._training_step_and_backward_closure, *closure_args, **closure_kwargs)
+        return update_wrapper(partial_func, self._training_step_and_backward_closure)
+
+    def _process_closure_result(self, opt_closure_result: Optional[AttributeDict]) -> None:
+        """Checks if the closure result is finite and optionally breaks if it is not
+
+        Args:
+            opt_closure_result: the result of the train step wrapped in an attribute dict
+        """
+        if not opt_closure_result:
+            return
+
+        # check if loss or model weights are nan
+        if self.trainer.terminate_on_nan:
+            self._check_finite(opt_closure_result.loss)
+
+    def _on_after_backward(self, batch_idx: int, untouched_loss: Tensor) -> None:
+        """Calls ``on_after_backward`` hook and tracks loss history
+
+        Args:
+            batch_idx: the index of the current batch
+            untouched_loss: the original loss value
+        """
+
+        # insert after step hook
+        self.trainer.call_hook("on_after_backward")
+
+        # when in dev debugging track the losses
+        self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach())
+
+    def _check_training_step_output(self, training_step_output: STEP_OUTPUT) -> None:
+        """Sanity checks that training produced a valid output and optimizer step has already been called in manual
+        optimization.
+
+        Args:
+            training_step_output: the output of the training step (before wrapping in an AttributeDict)
+
+        """
+        if isinstance(training_step_output, Tensor) and not self.trainer.lightning_module.automatic_optimization:
+            if training_step_output.grad_fn is None:
+                # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ...
+                raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor")
+        elif self.trainer.lightning_module.automatic_optimization:
+            if not any((
+                isinstance(training_step_output, Tensor),
+                (isinstance(training_step_output, Mapping)
+                 and 'loss' in training_step_output), training_step_output is None
+            )):
+                raise MisconfigurationException(
+                    "In automatic optimization, `training_step` must either return a Tensor, "
+                    "a dict with key 'loss' or None (where the step will be skipped)."
+                )
+
+    def _training_step(
+        self,
+        split_batch: Any,
+        batch_idx: int,
+        opt_idx: int,
+        hiddens: Tensor,
+    ) -> Optional[AttributeDict]:
+        """Performs the actual train step with the tied hooks.
+
+        Args:
+            split_batch: the current tbptt split of the current batch
+            batch_idx: the index of the current batch
+            opt_idx: the index of the current optimizer
+            hiddens: the model's hidden state of the previous iteration
+
+        Returns:
+            an AttributeDict containing the loss value and the training step output.
+ """ + # give the PL module a result for logging + model_ref = self.trainer.lightning_module + + with self.trainer.profiler.profile("model_forward"): + step_kwargs = self._build_kwargs(split_batch, batch_idx, opt_idx, hiddens) + + # manually capture logged metrics + model_ref._current_fx_name = 'training_step' + with self.trainer.profiler.profile("training_step"): + training_step_output = self.trainer.accelerator.training_step(step_kwargs) + self.trainer.accelerator.post_training_step() + + training_step_output = self.trainer.call_hook("training_step_end", training_step_output) + + self._check_training_step_output(training_step_output) + + training_step_output = self._process_training_step_output(training_step_output) + if training_step_output is None: + return + + closure_loss = None + loss = None + if self.trainer.lightning_module.automatic_optimization: + # accumulate loss. if accumulate_grad_batches==1, no effect + closure_loss = training_step_output.minimize / self.trainer.accumulate_grad_batches + # the loss will get scaled for amp. avoid any modifications to it + loss = closure_loss.detach().clone() + return AttributeDict(closure_loss=closure_loss, loss=loss, training_step_output=training_step_output) + + def _process_training_step_output(self, training_step_output: STEP_OUTPUT) -> Optional[ResultCollection]: + """Adds the :param:`training_step_output` to the trainer's results + + Args: + training_step_output: the output of the training step (before wrapping into an AttributeDict) + + Returns: + the updated results if the training_step's output was not None else None + """ + if training_step_output is None: + return None + + results = self.trainer._results + + loss = None + hiddens = None + results.extra = {} + + # handle dict return + if isinstance(training_step_output, dict): + loss = training_step_output.pop("loss", None) + hiddens = training_step_output.pop("hiddens", None) + # detach hiddens to avoid `RuntimeError: Trying to backward through the graph a second time` + hiddens = apply_to_collection(hiddens, Tensor, lambda t: t.detach()) + results.extra = training_step_output + + # handle scalar return + elif isinstance(training_step_output, Tensor): + loss = training_step_output + + # map to results under the hood + results.minimize = loss + self._hiddens = hiddens + + if self.trainer.move_metrics_to_cpu: + results.cpu() + return results + + def _optimizer_step( + self, optimizer: torch.optim.Optimizer, opt_idx: int, batch_idx: int, train_step_and_backward_closure: Callable + ) -> None: + """Performs the optimizer step and some sanity checking. + + Args: + optimizer: the optimizer to perform the step with + opt_idx: the index of the current :param:`optimizer` + batch_idx: the index of the current batch + train_step_and_backward_closure: the closure function performing the train step and computing the + gradients. By default called by the optimizer (if possible) + """ + model_ref = self.trainer.lightning_module + + is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) + using_native_amp = self.trainer.amp_backend == AMPType.NATIVE + + # native amp + lbfgs is a no go right now + if using_native_amp and is_lbfgs: + raise MisconfigurationException( + 'native PyTorch amp and lbfgs are not compatible.' 
+                ' To request, please file a Github issue in PyTorch and tag @mcarilli'
+            )
+
+        # wraps into LightningOptimizer only for running step
+        optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx)
+
+        # model hook
+        model_ref.optimizer_step(
+            self.trainer.current_epoch,
+            batch_idx,
+            optimizer,
+            opt_idx,
+            train_step_and_backward_closure,
+            on_tpu=(self.trainer._device_type == DeviceType.TPU and _TPU_AVAILABLE),
+            using_native_amp=using_native_amp,
+            using_lbfgs=is_lbfgs,
+        )
+
+    def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None:
+        """Calls the ``on_before_zero_grad`` hook.
+
+        Args:
+            optimizer: the current optimizer
+        """
+        self.trainer.call_hook('on_before_zero_grad', optimizer)
+
+    def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer, opt_idx: int) -> None:
+        """Zeroes out all gradients of parameters optimized by the current optimizer.
+
+        Args:
+            batch_idx: the index of the current batch
+            optimizer: the current optimizer
+            opt_idx: the index of the current optimizer
+        """
+        self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)
+
+    def _track_and_norm_grad(self, optimizer: torch.optim.Optimizer) -> Dict[str, Tensor]:
+        """Tracks gradient norms and clips the gradients of all parameters optimized by the current optimizer.
+
+        Args:
+            optimizer: the current optimizer
+        """
+        # track gradient norms
+        grad_norm_dict = {}
+        can_log = (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0
+        should_track = float(self.trainer.track_grad_norm) > 0
+        if should_track and can_log:
+            grad_norm_dict = grad_norm(self.trainer.lightning_module, self.trainer.track_grad_norm)
+
+        # clip gradients
+        self.trainer.accelerator.clip_gradients(
+            optimizer, self.trainer.gradient_clip_val, gradient_clip_algorithm=self.trainer.gradient_clip_algorithm
+        )
+        return grad_norm_dict
+
+    def _accumulated_batches_reached(self) -> bool:
+        """Determine if accumulation will be finished by the end of the current batch."""
+        # FIXME(@awaelchli): use progress tracking of batches instead of manual batch_idx
+        return (self.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0
+
+    def _num_training_batches_reached(self, is_last_batch: bool = False) -> bool:
+        """Checks whether sufficient training batches have been processed.
+
+        Args:
+            is_last_batch: Whether the current batch is the last one
+        """
+        # FIXME(@awaelchli): use progress tracking of batches instead of manual batch_idx
+        return (self.batch_idx + 1) == self.trainer.num_training_batches or is_last_batch
+
+    def should_accumulate(self) -> bool:
+        """Checks if the optimizer step should be performed or gradients should be accumulated for the current step."""
+        # checks if backward or backward + optimizer step (via closure)
+        accumulation_done = self._accumulated_batches_reached()
+        is_final_batch = self._num_training_batches_reached()
+        return not (accumulation_done or is_final_batch)
+
+    def _tbptt_split_batch(self, batch: Any) -> List[Any]:
+        """Splits a single batch into a list of sequence steps for tbptt.
+
+        Args:
+            batch: the current batch to split
+        """
+        splits = [batch]
+        if self.trainer.truncated_bptt_steps is not None:
+            model_ref = self.trainer.lightning_module
+            with self.trainer.profiler.profile("tbptt_split_batch"):
+                splits = model_ref.tbptt_split_batch(batch, self.trainer.truncated_bptt_steps)
+        return splits
+
+    def _run_optimization_start(self, opt_idx: int, optimizer: torch.optim.Optimizer) -> None:
+        """Toggles the optimizer to ensure the correct one is used and prevent dangling grads.
+
+        Args:
+            opt_idx: the index of the optimizer to use
+            optimizer: the optimizer to use
+
+        """
+        # make sure only the gradients of the current optimizer's parameters are calculated
+        # in the training step to prevent dangling gradients in multiple-optimizer setup.
+        if self.trainer.lightning_module.automatic_optimization and len(self.trainer.optimizers) > 1:
+            model = self.trainer.lightning_module
+            model.toggle_optimizer(optimizer, opt_idx)
+
+    @contextmanager
+    def block_ddp_sync_behaviour(self, should_block_sync: bool = False) -> Generator[None, None, None]:
+        """
+        automatic_optimization = True
+        Blocks ddp sync gradients behaviour on backwards pass.
+        This is useful for skipping sync when accumulating gradients, reducing communication overhead
+
+        automatic_optimization = False
+        do not block ddp gradient sync when using manual optimization
+        as gradients are needed within the training step
+
+        Returns:
+            context manager with sync behaviour off
+        """
+        if (
+            isinstance(self.trainer.training_type_plugin, ParallelPlugin)
+            and (self.trainer.lightning_module.automatic_optimization or should_block_sync)
+        ):
+            with self.trainer.training_type_plugin.block_backward_sync():
+                yield None
+        else:
+            yield None
+
+    def training_step_and_backward(
+        self,
+        split_batch: Any,
+        batch_idx: int,
+        opt_idx: int,
+        optimizer: torch.optim.Optimizer,
+        hiddens: Optional[Tensor],
+    ) -> STEP_OUTPUT:
+        """Wrap forward, zero_grad and backward in a closure so second order methods work"""
+        with self.trainer.profiler.profile("training_step_and_backward"):
+            # lightning module hook
+            result = self._training_step(split_batch, batch_idx, opt_idx, hiddens)
+
+            if not self._skip_backward and self.trainer.lightning_module.automatic_optimization:
+                is_first_batch_to_accumulate = batch_idx % self.trainer.accumulate_grad_batches == 0
+
+                if is_first_batch_to_accumulate:
+                    self._on_before_zero_grad(optimizer)
+                    self._optimizer_zero_grad(batch_idx, optimizer, opt_idx)
+
+                # backward pass
+                if result is not None:
+                    with self.trainer.profiler.profile("backward"):
+                        self.backward(result, optimizer, opt_idx)
+
+                    # hook - call this hook only
+                    # when gradients have finished to accumulate
+                    if not self.should_accumulate():
+                        self._on_after_backward(batch_idx, result.loss)
+
+                    # check if loss or model weights are nan
+                    if self.trainer.terminate_on_nan:
+                        self._check_finite(result.loss)
+
+                else:
+                    self._warning_cache.warn(
+                        "training_step returned None. If this was on purpose, ignore this warning..."
+                    )
+
+        return result
+
+    def _check_finite(self, loss: Tensor) -> None:
+        """Checks for finite parameters and loss values.
+
+        Args:
+            loss: the loss value to check to be finite
+        """
+        if not torch.isfinite(loss).all():
+            raise ValueError(f'The loss returned in `training_step` is {loss}.')
+        model = self.trainer.lightning_module
+        detect_nan_parameters(model)
+
+    def backward(
+        self, result: STEP_OUTPUT, optimizer: torch.optim.Optimizer, opt_idx: int, *args: Any, **kwargs: Any
+    ) -> None:
+        """Performs the backward step.
+
+        Args:
+            result: The output of the train step (including the loss value)
+            optimizer: The optimizer optimizing the gradients to call backward for
+            opt_idx: the index of the current optimizer
+        """
+        self.trainer.dev_debugger.track_event("backward_call")
+
+        should_accumulate = self.should_accumulate()
+
+        # backward can be called manually in the training loop
+        if isinstance(result, Tensor):
+            self.trainer.accelerator.backward(result, optimizer, opt_idx, should_accumulate, *args, **kwargs)
+        else:
+            result.closure_loss = self.trainer.accelerator.backward(
+                result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs
+            )
+
+        if not self.should_accumulate():
+            # track gradients
+            grad_norm_dict = self._track_and_norm_grad(optimizer=optimizer)
+            if grad_norm_dict:
+                self.trainer.lightning_module._current_fx_name = "on_after_backward"
+                self.trainer.lightning_module.log_grad_norm(grad_norm_dict)
+
+    def _update_running_loss(self, current_loss: Tensor) -> None:
+        """Updates the running loss value with the current value"""
+        if self.trainer.lightning_module.automatic_optimization:
+            # track total loss for logging (avoid mem leaks)
+            self.accumulated_loss.append(current_loss)
+
+        accumulated_loss = self.accumulated_loss.mean()
+
+        if accumulated_loss is not None:
+            # calculate running loss for display
+            self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches)
+
+        # reset for next set of accumulated grads
+        self.accumulated_loss.reset()
+
+    def get_active_optimizers(self, batch_idx: Optional[int] = None) -> List[Tuple[int, Optimizer]]:
+        """
+        Returns the currently active optimizers. When multiple optimizers are used with different frequencies,
+        only one of the optimizers is active at a time.
+
+        Returns:
+            A list of tuples (opt_idx, optimizer) of currently active optimizers.
+        """
+        if not self.trainer.optimizer_frequencies:
+            # call training_step once per optimizer
+            return list(enumerate(self.trainer.optimizers))
+
+        optimizers_loop_length = self.optimizer_freq_cumsum[-1]
+        current_place_in_loop = batch_idx % optimizers_loop_length
+
+        # find optimizer index by looking for the first {item > current_place} in the cumsum list
+        opt_idx = int(np.argmax(self.optimizer_freq_cumsum > current_place_in_loop))
+        return [(opt_idx, self.trainer.optimizers[opt_idx])]
+
+    def _build_kwargs(self, batch: Any, batch_idx: int, opt_idx: int, hiddens: Optional[Tensor]) -> Dict[str, Any]:
+        """Builds the keyword arguments for training_step
+
+        Args:
+            batch: the batch to train on
+            batch_idx: the index of the current batch
+            opt_idx: the index of the current optimizer
+            hiddens: the hidden state of the previous RNN iteration
+
+        Returns:
+            the keyword arguments for the training step
+        """
+        # enable not needing to add opt_idx to training_step
+        step_kwargs = OrderedDict([('batch', batch), ('batch_idx', batch_idx)])
+
+        lightning_module = self.trainer.lightning_module
+
+        if len(self.trainer.optimizers) > 1:
+            training_step_fx = getattr(lightning_module, "training_step")
+            has_opt_idx_in_train_step = is_param_in_hook_signature(training_step_fx, "optimizer_idx")
+            if has_opt_idx_in_train_step:
+                if not lightning_module.automatic_optimization:
+                    self._warning_cache.deprecation(
+                        "`training_step` hook signature has changed in v1.3."
Support for" + " the old signature will be removed in v1.5" + ) + step_kwargs['optimizer_idx'] = opt_idx + elif not has_opt_idx_in_train_step and lightning_module.automatic_optimization: + raise ValueError( + f"Your LightningModule defines {len(self.trainer.optimizers)} optimizers but" + ' `training_step` is missing the `optimizer_idx` argument.' + ) + + # pass hiddens if using tbptt + if self._truncated_bptt_enabled(): + step_kwargs['hiddens'] = hiddens + + return step_kwargs + + def _truncated_bptt_enabled(self) -> bool: + """ Temporary tbptt utilities until this flag is fully migrated to the lightning module. """ + return self._truncated_bptt_steps() > 0 + + def _truncated_bptt_steps(self) -> int: + """Returns the number of tbptt steps""" + lightning_module = self.trainer.lightning_module + # Give precedence to the LightningModule as the Trainer flag will be removed in v1.5 + if lightning_module.truncated_bptt_steps > 0: + return lightning_module.truncated_bptt_steps + return self.trainer.truncated_bptt_steps or 0 diff --git a/pytorch_lightning/loops/dataloader/__init__.py b/pytorch_lightning/loops/dataloader/__init__.py new file mode 100644 index 0000000000000..db2b2f7926d50 --- /dev/null +++ b/pytorch_lightning/loops/dataloader/__init__.py @@ -0,0 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop # noqa: F401 +from pytorch_lightning.loops.dataloader.evaluation_loop import EvaluationLoop # noqa: F401 +from pytorch_lightning.loops.dataloader.prediction_loop import PredictionLoop # noqa: F401 diff --git a/pytorch_lightning/loops/dataloader/dataloader_loop.py b/pytorch_lightning/loops/dataloader/dataloader_loop.py new file mode 100644 index 0000000000000..ce255b73d0bba --- /dev/null +++ b/pytorch_lightning/loops/dataloader/dataloader_loop.py @@ -0,0 +1,53 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/pytorch_lightning/loops/dataloader/__init__.py b/pytorch_lightning/loops/dataloader/__init__.py
new file mode 100644
index 0000000000000..db2b2f7926d50
--- /dev/null
+++ b/pytorch_lightning/loops/dataloader/__init__.py
@@ -0,0 +1,17 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop  # noqa: F401
+from pytorch_lightning.loops.dataloader.evaluation_loop import EvaluationLoop  # noqa: F401
+from pytorch_lightning.loops.dataloader.prediction_loop import PredictionLoop  # noqa: F401
diff --git a/pytorch_lightning/loops/dataloader/dataloader_loop.py b/pytorch_lightning/loops/dataloader/dataloader_loop.py
new file mode 100644
index 0000000000000..ce255b73d0bba
--- /dev/null
+++ b/pytorch_lightning/loops/dataloader/dataloader_loop.py
@@ -0,0 +1,53 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from typing import Sequence
+
+from torch.utils.data import DataLoader
+
+from pytorch_lightning.loops.base import Loop
+
+
+class DataLoaderLoop(Loop):
+    """Base class to loop over all dataloaders"""
+
+    @property
+    @abstractmethod
+    def dataloaders(self) -> Sequence[DataLoader]:
+        """Returns the dataloaders to loop over"""
+
+    @property
+    def current_dataloader_idx(self) -> int:
+        """Returns the index of the current dataloader"""
+        return self.iteration_count
+
+    @property
+    def current_dataloader(self) -> DataLoader:
+        """Returns the current dataloader"""
+        return self.dataloaders[self.current_dataloader_idx]
+
+    @property
+    def num_dataloaders(self) -> int:
+        """Returns the number of dataloaders present"""
+        return len(self.dataloaders) if self.dataloaders is not None else 0
+
+    @property
+    def done(self) -> bool:
+        """Returns whether all dataloaders have been processed"""
+        return self.current_dataloader_idx >= self.num_dataloaders
+
+    def reset(self) -> None:
+        """Resets the internal state"""
+        self.iteration_count = 0
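[A small illustration — not part of the patch — of how `iteration_count` from `Loop` drives `current_dataloader` in this base class.]

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop

    class SizesLoop(DataLoaderLoop):
        """Toy loop: visits each dataloader once and prints its length."""

        @property
        def dataloaders(self):
            return [
                DataLoader(TensorDataset(torch.zeros(4))),
                DataLoader(TensorDataset(torch.zeros(2))),
            ]

        def advance(self) -> None:
            # `current_dataloader` follows `current_dataloader_idx`, i.e. `iteration_count`
            print(len(self.current_dataloader.dataset))

    SizesLoop().run()  # prints 4, then 2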
diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py
new file mode 100644
index 0000000000000..02d802fb3fc15
--- /dev/null
+++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py
@@ -0,0 +1,269 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, List, Optional, Sequence, Union
+
+from deprecate.utils import void
+from torch.utils.data.dataloader import DataLoader
+
+import pytorch_lightning as pl
+from pytorch_lightning.loops.dataloader import DataLoaderLoop
+from pytorch_lightning.loops.epoch import EvaluationEpochLoop
+from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
+from pytorch_lightning.trainer.states import TrainerFn
+from pytorch_lightning.utilities.model_helpers import is_overridden
+from pytorch_lightning.utilities.types import EPOCH_OUTPUT
+
+
+class EvaluationLoop(DataLoaderLoop):
+    """Loops over all dataloaders for evaluation."""
+
+    def __init__(self):
+        super().__init__()
+        self._max_batches: Optional[Union[int, Sequence[int]]] = None
+        self.outputs = []
+
+        self.epoch_loop = EvaluationEpochLoop()
+
+        self._results = ResultCollection(training=False)
+        self._has_run: bool = False
+
+    @property
+    def num_dataloaders(self) -> int:
+        """Returns the total number of dataloaders"""
+        # case where user does:
+        # return dl1, dl2
+        dataloaders = self.dataloaders
+        if dataloaders is None:
+            return 0
+        length = len(dataloaders)
+        if length > 0 and isinstance(dataloaders[0], (list, tuple)):
+            length = len(dataloaders[0])
+        return length
+
+    @property
+    def dataloaders(self) -> Sequence[DataLoader]:
+        """Returns the validation or test dataloaders"""
+        if self.trainer.testing:
+            return self.trainer.test_dataloaders
+        return self.trainer.val_dataloaders
+
+    @property
+    def predictions(self):
+        """Returns the predictions from all dataloaders"""
+        return self.epoch_loop.predictions
+
+    def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None:
+        """Connects the loop to everything necessary (like trainer and accelerators)"""
+        super().connect(trainer, *args, **kwargs)
+        self.epoch_loop.connect(trainer)
+
+    @property
+    def done(self) -> bool:
+        """Returns whether all dataloaders are processed or evaluation should be skipped altogether"""
+        return (self.current_dataloader_idx >= len(self.dataloaders)) or self.skip
+
+    @property
+    def skip(self) -> bool:
+        """Returns whether the evaluation should be skipped."""
+        max_batches = self.get_max_batches()
+        return sum(max_batches) == 0
+
+    def reset(self) -> None:
+        """Resets the internal state of the loop"""
+        self.iteration_count = 0
+        self._max_batches = self.get_max_batches()
+        # bookkeeping
+        self.outputs = []
+
+        if isinstance(self._max_batches, int):
+            self._max_batches = [self._max_batches] * len(self.dataloaders)
+
+    def on_skip(self) -> List:
+        return []
+
+    def on_run_start(self, *args: Any, **kwargs: Any) -> None:
+        """Runs the ``on_evaluation_model_eval``, ``on_evaluation_start`` and ``on_evaluation_epoch_start`` hooks"""
+        void(*args, **kwargs)
+        # hook
+        self.on_evaluation_model_eval()
+        self.trainer.lightning_module.zero_grad()
+        self.on_evaluation_start()
+        self.on_evaluation_epoch_start()
+
+    def advance(self, *args: Any, **kwargs: Any) -> None:
+        """Performs evaluation on one single dataloader"""
+        void(*args, **kwargs)
+        dataloader = self.trainer.accelerator.process_dataloader(self.current_dataloader)
+        dataloader_iter = enumerate(dataloader)
+        dl_max_batches = self._max_batches[self.current_dataloader_idx]
+
+        dl_outputs = self.epoch_loop.run(
+            dataloader_iter,
+            self.current_dataloader_idx,
+            dl_max_batches,
+            self.num_dataloaders,
+        )
+
+        # store batch level output per dataloader
+        if self.should_track_batch_outputs_for_epoch_end:
+            self.outputs.append(dl_outputs)
+
+        if not self.trainer.sanity_checking:
+            # indicate the loop has run
+            self._has_run = True
+
+    def on_run_end(self) -> Any:
+        """Runs the ``on_evaluation_epoch_end`` hook"""
+        outputs = self.outputs
+
+        # free memory
+        self.outputs = []
+
+        # with a single dataloader don't pass a 2D list
+        if len(outputs) > 0 and self.num_dataloaders == 1:
+            outputs = outputs[0]
+
+        # lightning module method
+        self.evaluation_epoch_end(outputs)
+
+        # hook
+        self.on_evaluation_epoch_end()
+
+        # log epoch metrics
+        eval_loop_results = self.trainer.logger_connector.update_eval_epoch_metrics()
+
+        # hook
+        self.on_evaluation_end()
+
+        # save predictions to disk
+        self.epoch_loop.predictions.to_disk()
+
+        # enable train mode again
+        self.on_evaluation_model_train()
+
+        return eval_loop_results
+
+    def get_max_batches(self) -> List[Union[int, float]]:
+        """Returns the max number of batches for each dataloader"""
+        if self.trainer.testing:
+            max_batches = self.trainer.num_test_batches
+        else:
+            if self.trainer.sanity_checking:
+                self.trainer.num_sanity_val_batches = [
+                    min(self.trainer.num_sanity_val_steps, val_batches) for val_batches in self.trainer.num_val_batches
+                ]
+                max_batches = self.trainer.num_sanity_val_batches
+            else:
+                max_batches = self.trainer.num_val_batches
+        return max_batches
+
+    def reload_evaluation_dataloaders(self) -> None:
+        """Reloads dataloaders if necessary"""
+        model = self.trainer.lightning_module
+        if self.trainer.testing:
+            self.trainer.reset_test_dataloader(model)
+        elif self.trainer.val_dataloaders is None or self.trainer.reload_dataloaders_every_epoch:
+            self.trainer.reset_val_dataloader(model)
+
+    def on_evaluation_start(self, *args: Any, **kwargs: Any) -> None:
+        """Runs ``on_{validation/test}_start`` hooks"""
+        self.should_track_batch_outputs_for_epoch_end: bool = self._should_track_batch_outputs_for_epoch_end()
+
+        assert self._results is not None
+        self._results.to(device=self.trainer.lightning_module.device)
+
+        if self.trainer.testing:
+            self.trainer.call_hook("on_test_start", *args, **kwargs)
+        else:
+            self.trainer.call_hook("on_validation_start", *args, **kwargs)
+
+    def on_evaluation_model_eval(self) -> None:
+        """Sets model to eval mode"""
+        model_ref = self.trainer.lightning_module
+        if self.trainer.testing:
+            model_ref.on_test_model_eval()
+        else:
+            model_ref.on_validation_model_eval()
+
+    def on_evaluation_model_train(self) -> None:
+        """Sets model to train mode"""
+        model_ref = self.trainer.lightning_module
+        if self.trainer.testing:
+            model_ref.on_test_model_train()
+        else:
+            model_ref.on_validation_model_train()
+
+    def on_evaluation_end(self, *args: Any, **kwargs: Any) -> None:
+        """Runs ``on_{validation/test}_end`` hook"""
+        if self.trainer.testing:
+            self.trainer.call_hook("on_test_end", *args, **kwargs)
+        else:
+            self.trainer.call_hook("on_validation_end", *args, **kwargs)
+
+        if self.trainer.state.fn != TrainerFn.FITTING:
+            # summarize profile results
+            self.trainer.profiler.describe()
+
+        # reset any `torchmetrics.Metric` and the logger connector state
+        self.trainer.logger_connector.reset(metrics=True)
+
+    def on_evaluation_epoch_start(self, *args: Any, **kwargs: Any) -> None:
+        """Runs ``on_epoch_start`` and ``on_{validation/test}_epoch_start`` hooks"""
+        self.trainer.logger_connector.on_epoch_start()
+        self.trainer.call_hook("on_epoch_start", *args, **kwargs)
+
+        if self.trainer.testing:
+            self.trainer.call_hook("on_test_epoch_start", *args, **kwargs)
+        else:
+            self.trainer.call_hook("on_validation_epoch_start", *args, **kwargs)
+
+    def _should_track_batch_outputs_for_epoch_end(self) -> bool:
+        """Whether the batch outputs should be stored for later usage"""
+        model = self.trainer.lightning_module
+        if self.trainer.testing:
+            return is_overridden("test_epoch_end", model)
+        return is_overridden("validation_epoch_end", model)
+
+    def evaluation_epoch_end(self, outputs: EPOCH_OUTPUT) -> None:
+        """Runs ``{validation/test}_epoch_end``"""
+        # inform logger the batch loop has finished
+        self.trainer.logger_connector.epoch_end_reached()
+
+        # call the model epoch end
+        model = self.trainer.lightning_module
+
+        # unset dataloader_idx in model
+        model._current_dataloader_idx = None
+
+        if self.trainer.testing:
+            if is_overridden("test_epoch_end", model):
+                model._current_fx_name = "test_epoch_end"
+                model.test_epoch_end(outputs)
+
+        else:
+            if is_overridden("validation_epoch_end", model):
+                model._current_fx_name = "validation_epoch_end"
+                model.validation_epoch_end(outputs)
+
+    def on_evaluation_epoch_end(self) -> None:
+        """Runs ``on_{validation/test}_epoch_end`` hook"""
+        hook_name = ("on_test_epoch_end" if self.trainer.testing else "on_validation_epoch_end")
+        self.trainer.call_hook(hook_name)
+        self.trainer.call_hook("on_epoch_end")
+        self.trainer.logger_connector.on_epoch_end()
+
+    def teardown(self) -> None:
+        self._results.cpu()
+        self.epoch_loop.teardown()
diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py
new file mode 100644
index 0000000000000..37b4b83a25ebe
--- /dev/null
+++ b/pytorch_lightning/loops/dataloader/prediction_loop.py
@@ -0,0 +1,151 @@
+from typing import Any, List, Optional, Sequence, Union
+
+from deprecate.utils import void
+from torch.utils.data import DataLoader
+
+import pytorch_lightning as pl
+from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop
+from pytorch_lightning.loops.epoch.prediction_epoch_loop import PredictionEpochLoop
+from pytorch_lightning.plugins import DDPSpawnPlugin
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.types import _PREDICT_OUTPUT
+
+
+class PredictionLoop(DataLoaderLoop):
+    """Loop to run over dataloaders for prediction"""
+
+    def __init__(self):
+        super().__init__()
+        self.predictions: Optional[List[List[Any]]] = None
+        self.epoch_batch_indices: Optional[List[List[int]]] = None
+
+        self.epoch_loop: PredictionEpochLoop = PredictionEpochLoop()
+
+        self._results = None  # for `trainer._results` access
+        self._return_predictions: bool = False
+
+    @property
+    def return_predictions(self) -> bool:
+        """Whether to return the predictions or not"""
+        return self._return_predictions
+
+    @return_predictions.setter
+    def return_predictions(self, return_predictions: Optional[bool] = None) -> None:
+        # ``DDPSpawnPlugin`` plugins and derivatives don't support returning predictions.
+        is_ddp_spawn = isinstance(self.trainer.training_type_plugin, DDPSpawnPlugin)
+        if return_predictions and is_ddp_spawn:
+            raise MisconfigurationException(
+                "`return_predictions` should be set to `False` when using the `DDPSpawnPlugin` or its child classes. "
+                f"Found {return_predictions} with training_type_plugin {type(self.trainer.training_type_plugin)}."
+            )
+        # For plugins other than ``DDPSpawnPlugin``, `return_predictions` is True by default unless the user decides otherwise.
+        self._return_predictions = not is_ddp_spawn if return_predictions is None else return_predictions
+
+    @property
+    def num_dataloaders(self) -> int:
+        """Returns the number of prediction dataloaders"""
+        # case where user does:
+        # return dl1, dl2
+        dataloaders = self.dataloaders
+        length = len(dataloaders)
+        if len(dataloaders) > 0 and isinstance(dataloaders[0], (list, tuple)):
+            length = len(dataloaders[0])
+        return length
+
+    @property
+    def max_batches(self) -> List[int]:
+        """The max number of batches this loop will run for each dataloader."""
+        max_batches = self.trainer.num_predict_batches
+        if isinstance(max_batches, int):
+            max_batches = [max_batches] * len(self.dataloaders)
+        return max_batches
+
+    @property
+    def dataloaders(self) -> Sequence[DataLoader]:
+        """Returns all prediction dataloaders"""
+        return self.trainer.predict_dataloaders
+
+    @property
+    def done(self) -> bool:
+        """Whether prediction is finished: Max batches run or all dataloaders processed"""
+        return self.current_dataloader_idx >= len(self.dataloaders)
+
+    @property
+    def skip(self) -> bool:
+        return sum(self.max_batches) == 0
+
+    def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
+        """Connects the loop with all necessary things (like trainer)"""
+        super().connect(trainer, *args, **kwargs)
+        self.epoch_loop.connect(trainer, *args, **kwargs)
+
+    def reset(self) -> None:
+        """Resets the internal state of the loop for a new run"""
+        super().reset()
+        self.predictions = []
+        self.epoch_batch_indices = []
+
+    def on_run_start(self) -> None:
+        """Calls ``on_predict_start`` hook"""
+        self.on_predict_start()
+
+    def advance(self, *args: Any, **kwargs: Any) -> None:
+        """Predicts one entire dataloader"""
+        void(*args, **kwargs)
+        dataloader = self.trainer.accelerator.process_dataloader(self.current_dataloader)
+        dataloader_iter = enumerate(dataloader)
+        dl_max_batches = self.max_batches[self.current_dataloader_idx]
+
+        dl_predictions, dl_batch_indices = self.epoch_loop.run(
+            dataloader_iter, self.current_dataloader_idx, dl_max_batches, self.num_dataloaders, self.return_predictions
+        )
+        self.predictions.append(dl_predictions)
+        self.epoch_batch_indices.append(dl_batch_indices)
+
+    def on_run_end(self) -> Union[List[Any], List[List[Any]]]:
+        """Calls ``on_predict_epoch_end`` and ``on_predict_end`` hooks and returns results from all dataloaders"""
+        results = self.on_predict_epoch_end()
+        self.on_predict_end()
+        return results
+
+    def on_predict_start(self) -> None:
+        """
+        Sets model to eval mode and disables gradients. Also calls ``on_predict_start`` and
+        ``on_predict_epoch_start`` hooks.
+        """
+        # enable eval mode + no grads
+        self.on_predict_model_eval()
+        self.trainer.lightning_module.zero_grad()
+
+        # hook
+        self.trainer.call_hook("on_predict_start")
+        self.trainer.call_hook("on_predict_epoch_start")
+
+    def on_predict_epoch_end(self) -> Optional[_PREDICT_OUTPUT]:
+        """Calls ``on_predict_epoch_end`` hook.
+
+        Returns:
+            the results for all dataloaders
+        """
+        self.trainer.profiler.describe()
+
+        results = self.predictions
+
+        self.trainer.call_hook("on_predict_epoch_end", results)
+
+        if self.return_predictions:
+            return results[0] if self.num_dataloaders == 1 else results
+
+    def on_predict_end(self) -> None:
+        """Resets previous gradient status and calls ``on_predict_end`` hook"""
+        # clear memory. the predictions are extracted in `on_predict_epoch_end`.
+        self.predictions = []
+        self.epoch_batch_indices = []
+
+        # hook
+        self.trainer.call_hook("on_predict_end")
+
+    def on_predict_model_eval(self):
+        """Calls ``on_predict_model_eval`` hook"""
+        model_ref = self.trainer.lightning_module
+        model_ref.on_predict_model_eval()
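[From the user's perspective, this loop backs `Trainer.predict`. A hedged usage sketch; `model` and `predict_loader` are hypothetical placeholders.]

    from pytorch_lightning import Trainer

    trainer = Trainer()
    # a flat list with one dataloader, a list per dataloader with several;
    # under `DDPSpawnPlugin` predictions cannot be returned, matching the guard above
    predictions = trainer.predict(model, dataloaders=predict_loader, return_predictions=True)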
+ """ + + def __init__(self) -> None: + super().__init__() + self.predictions: Optional[PredictionCollection] = None + self.dataloader: Optional[Iterator] = None + self.dl_max_batches: Optional[int] = None + self.dataloader_idx: Optional[int] = None + self.num_dataloaders: Optional[int] = None + self.outputs: List[STEP_OUTPUT] = [] + + @property + def done(self) -> bool: + """Returns ``True`` if the current iteration count reaches the number of dataloader batches.""" + return self.iteration_count >= self.dl_max_batches + + def reset(self) -> None: + """Resets the loop's internal state.""" + self.iteration_count = 0 + self.predictions = PredictionCollection(self.trainer.global_rank, self.trainer.world_size) + self.dl_max_batches = None + self.dataloader_idx = None + self.num_dataloaders = None + self.outputs = [] + + def on_run_start( + self, + dataloader_iter: Iterator, + dataloader_idx: int, + dl_max_batches: int, + num_dataloaders: int, + ) -> None: + """Adds the passed arguments to the loop's state if necessary + + Args: + dataloader_iter: iterator over the dataloader + dataloader_idx: index of the current dataloader + dl_max_batches: maximum number of batches the dataloader can produce + num_dataloaders: the total number of dataloaders + """ + void(dataloader_iter) + + self.dl_max_batches = dl_max_batches + self.dataloader_idx = dataloader_idx + self.num_dataloaders = num_dataloaders + + def advance( + self, + dataloader_iter: Iterator, + dataloader_idx: int, + dl_max_batches: int, + num_dataloaders: int, + ) -> None: + """Calls the evaluation step with the corresponding hooks and updates the logger connector. + + Args: + dataloader_iter: iterator over the dataloader + dataloader_idx: index of the current dataloader + dl_max_batches: maximum number of batches the dataloader can produce + num_dataloaders: the total number of dataloaders + + Raises: + StopIteration: If the current batch is None + """ + void(dl_max_batches, num_dataloaders) + + batch_idx, batch = next(dataloader_iter) + + if batch is None: + raise StopIteration + + # hook + self.on_evaluation_batch_start(batch, batch_idx, dataloader_idx) + + # lightning module methods + with self.trainer.profiler.profile("evaluation_step_and_end"): + output = self.evaluation_step(batch, batch_idx, dataloader_idx) + output = self.evaluation_step_end(output) + + # hook + store predictions + self.on_evaluation_batch_end(output, batch, batch_idx, dataloader_idx) + + # log batch metrics + self.trainer.logger_connector.update_eval_step_metrics() + + # track epoch level outputs + self.outputs = self._track_output_for_epoch_end(self.outputs, output) + + def on_run_end(self) -> List[STEP_OUTPUT]: + """Returns the outputs of the whole run""" + outputs = self.outputs + # free memory + self.outputs = [] + return outputs + + def evaluation_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Optional[STEP_OUTPUT]: + """The evaluation step (validation_step or test_step depending on the trainer's state). + + Args: + batch: The current batch to run through the step. 
+ batch_idx: The index of the current batch + dataloader_idx: the index of the dataloader producing the current batch + + Returns: + the outputs of the step + """ + # configure step_kwargs + step_kwargs = self._build_kwargs(batch, batch_idx, dataloader_idx) + + if self.trainer.testing: + self.trainer.lightning_module._current_fx_name = "test_step" + with self.trainer.profiler.profile("test_step"): + output = self.trainer.accelerator.test_step(step_kwargs) + else: + self.trainer.lightning_module._current_fx_name = "validation_step" + with self.trainer.profiler.profile("validation_step"): + output = self.trainer.accelerator.validation_step(step_kwargs) + + return output + + def evaluation_step_end(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + """Calls the `{validation/test}_step_end` hook""" + hook_name = "test_step_end" if self.trainer.testing else "validation_step_end" + output = self.trainer.call_hook(hook_name, *args, **kwargs) + return output + + def on_evaluation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + """Calls the ``on_{validation/test}_batch_start`` hook. + + Args: + batch: The current batch to run through the step + batch_idx: The index of the current batch + dataloader_idx: The index of the dataloader producing the current batch + + Raises: + AssertionError: If the number of dataloaders is None (has not yet been set). + """ + self.trainer.logger_connector.on_batch_start() + + assert self.num_dataloaders is not None + self.trainer.logger_connector.on_evaluation_batch_start(batch, batch_idx, dataloader_idx, self.num_dataloaders) + + if self.trainer.testing: + self.trainer.call_hook("on_test_batch_start", batch, batch_idx, dataloader_idx) + else: + self.trainer.call_hook("on_validation_batch_start", batch, batch_idx, dataloader_idx) + + def on_evaluation_batch_end( + self, + output: Optional[STEP_OUTPUT], + batch: Any, + batch_idx: int, + dataloader_idx: int, + ) -> None: + """The ``on_{validation/test}_batch_end`` hook. 
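+
+        Calls the matching trainer hook, signals the end of the batch to the logger
+        connector and stores the step predictions.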
+
+        Args:
+            output: The output of the performed step
+            batch: The input batch for the step
+            batch_idx: The index of the current batch
+            dataloader_idx: Index of the dataloader producing the current batch
+        """
+        hook_name = "on_test_batch_end" if self.trainer.testing else "on_validation_batch_end"
+        self.trainer.call_hook(hook_name, output, batch, batch_idx, dataloader_idx)
+
+        self.trainer.logger_connector.on_batch_end()
+
+        # store predictions if do_write_predictions and track eval loss history
+        self.store_predictions(output, batch_idx, dataloader_idx)
+
+    def store_predictions(self, output: Optional[STEP_OUTPUT], batch_idx: int, dataloader_idx: int) -> None:
+        """Stores the predictions in the prediction collection (only if running in test mode)
+
+        Args:
+            output: the outputs of the current step
+            batch_idx: the index of the current batch
+            dataloader_idx: the index of the dataloader producing the current batch
+        """
+        # Add step predictions to prediction collection to write later
+        if output is not None and self.predictions is not None:
+            if isinstance(output, ResultCollection) and self.trainer.testing:
+                self.predictions.add(output.pop("predictions", None))
+
+        # track debug metrics
+        self.trainer.dev_debugger.track_eval_loss_history(batch_idx, dataloader_idx, output)
+
+    def _build_kwargs(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Dict[str, Union[Any, int]]:
+        """Helper function to build the arguments for the current step
+
+        Args:
+            batch: The current batch to run through the step
+            batch_idx: the index of the current batch
+            dataloader_idx: the index of the dataloader producing the current batch
+
+        Returns:
+            the keyword arguments to pass to the step function
+        """
+        # make dataloader_idx arg in validation_step optional
+        step_kwargs = OrderedDict([("batch", batch), ("batch_idx", batch_idx)])
+
+        multiple_val_loaders = not self.trainer.testing and self.num_dataloaders > 1
+        multiple_test_loaders = self.trainer.testing and self.num_dataloaders > 1
+
+        if multiple_test_loaders or multiple_val_loaders:
+            step_kwargs["dataloader_idx"] = dataloader_idx
+
+        return step_kwargs
+
+    def _track_output_for_epoch_end(
+        self,
+        outputs: List[Union[ResultCollection, Dict, Tensor]],
+        output: Optional[Union[ResultCollection, Dict, Tensor]],
+    ) -> List[Union[ResultCollection, Dict, Tensor]]:
+        if output is not None:
+            if isinstance(output, ResultCollection):
+                output = output.detach()
+                if self.trainer.move_metrics_to_cpu:
+                    output = output.cpu()
+            elif isinstance(output, dict):
+                output = recursive_detach(output, to_cpu=self.trainer.move_metrics_to_cpu)
+            elif isinstance(output, Tensor) and output.is_cuda and self.trainer.move_metrics_to_cpu:
+                output = output.cpu()
+            outputs.append(output)
+        return outputs
diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py
new file mode 100644
index 0000000000000..29a76793b4648
--- /dev/null
+++ b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py
@@ -0,0 +1,151 @@
+from collections import OrderedDict
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+from deprecate import void
+
+from pytorch_lightning.loops.base import Loop
+from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper
+from pytorch_lightning.utilities.warnings import WarningCache
+
+
+class PredictionEpochLoop(Loop):
+    """Loop performing prediction on arbitrary sequentially used dataloaders."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.return_predictions: bool = False
+        self.predictions: List[Any] = []
+        self.current_batch_indices: List[int] = []
+        self._dl_max_batches: Optional[int] = None
+        self._num_dataloaders: Optional[int] = None
+        self._warning_cache = WarningCache()
+        self._all_batch_indices: List[List[int]] = []
+
+    @property
+    def done(self) -> bool:
+        """Ends prediction when the iteration count reaches the total number of available batches"""
+        return self.iteration_count >= self._dl_max_batches
+
+    @property
+    def should_store_predictions(self) -> bool:
+        """Whether the predictions should be stored for later usage (e.g. aggregation or returning)"""
+        any_pred = any(cb.interval.on_epoch for cb in self.trainer.prediction_writer_callbacks)
+        return self.return_predictions or any_pred
+
+    def reset(self) -> None:
+        """Resets the loop's internal state"""
+        self.iteration_count = 0
+        self._all_batch_indices: List[List[int]] = []
+        self.predictions: List[Any] = []
+
+    def on_run_start(
+        self,
+        dataloader_iter: Iterator,
+        dataloader_idx: int,
+        dl_max_batches: int,
+        num_dataloaders: int,
+        return_predictions: bool = False
+    ) -> None:
+        """
+        Prepares the loop's internal state
+
+        Args:
+            dataloader_iter: the iterator over the current dataloader
+            dataloader_idx: the index of the current dataloader
+            dl_max_batches: the maximum number of batches the current loader can produce
+            num_dataloaders: the total number of dataloaders
+            return_predictions: whether to return the obtained predictions
+        """
+        void(dataloader_iter, dataloader_idx)
+        self._dl_max_batches = dl_max_batches
+        self._num_dataloaders = num_dataloaders
+        self.return_predictions = return_predictions
+
+    def advance(
+        self,
+        dataloader_iter: Iterator,
+        dataloader_idx: int,
+        dl_max_batches: int,
+        num_dataloaders: int,
+        return_predictions: bool = False
+    ) -> None:
+        """
+        Runs one prediction step.
+
+        Args:
+            dataloader_iter: the iterator over the current dataloader
+            dataloader_idx: the index of the current dataloader
+            dl_max_batches: the maximum number of batches the current loader can produce
+            num_dataloaders: the total number of dataloaders
+            return_predictions: whether to return the obtained predictions
+        """
+        batch_idx, batch = next(dataloader_iter)
+        if batch is None:
+            raise StopIteration
+
+        with self.trainer.profiler.profile("predict_step"):
+            self._predict_step(batch, batch_idx, dataloader_idx)
+
+    def on_run_end(self) -> Tuple[Any, Any]:
+        """Returns the predictions and the corresponding batch indices"""
+        predictions = self.predictions
+        all_batch_indices = self._all_batch_indices
+        # free memory
+        self.predictions = []
+        self._all_batch_indices = []
+        return predictions, all_batch_indices
+
+    def _predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
+        """Runs the actual predict step together with all the
+        necessary bookkeeping and the hooks tied to the predict step.
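+        The flow is: store the current batch indices, call ``on_predict_batch_start``,
+        run ``predict_step``, call ``on_predict_batch_end`` and optionally keep the output.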
+
+        Args:
+            batch: the current batch to run the prediction on
+            batch_idx: the index of the current batch
+            dataloader_idx: the index of the dataloader producing the current batch
+        """
+        # configure step_kwargs
+        step_kwargs = self._build_kwargs(batch, batch_idx, dataloader_idx)
+
+        # extract batch_indices and store them
+        self._store_batch_indices(dataloader_idx)
+
+        model_ref = self.trainer.lightning_module
+
+        self.trainer.call_hook("on_predict_batch_start", batch, batch_idx, dataloader_idx)
+
+        model_ref._current_fx_name = "predict_step"
+        predictions = self.trainer.accelerator.predict_step(step_kwargs)
+
+        if predictions is None:
+            self._warning_cache.warn("predict returned None. If this was on purpose, ignore this warning...")
+
+        self.trainer.call_hook("on_predict_batch_end", predictions, batch, batch_idx, dataloader_idx)
+
+        if self.should_store_predictions:
+            self.predictions.append(predictions)
+
+    def _build_kwargs(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Dict[str, Any]:
+        """
+        Assembles the keyword arguments for the ``predict_step``
+
+        Args:
+            batch: the current batch to run the prediction on
+            batch_idx: the index of the current batch
+            dataloader_idx: the index of the dataloader producing the current batch
+
+        Returns:
+            the dictionary containing all the keyword arguments for the predict step
+        """
+        step_kwargs = OrderedDict([('batch', batch), ('batch_idx', batch_idx)])
+        if self._num_dataloaders > 1:
+            step_kwargs['dataloader_idx'] = dataloader_idx
+        return step_kwargs
+
+    def _store_batch_indices(self, dataloader_idx: int) -> None:
+        """Stores the batch indices if the predictions should be stored"""
+        batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler
+        if isinstance(batch_sampler, IndexBatchSamplerWrapper):
+            self.current_batch_indices = batch_sampler.batch_indices
+            if self.should_store_predictions:
+                self._all_batch_indices.append(batch_sampler.batch_indices)
diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py
new file mode 100644
index 0000000000000..f1eb3c942b8a0
--- /dev/null
+++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -0,0 +1,426 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Iterator, List, Optional, Union
+
+import torch
+
+import pytorch_lightning as pl
+from pytorch_lightning import loops  # import as loops to avoid circular imports
+from pytorch_lightning.loops.batch import TrainingBatchLoop
+from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.model_helpers import is_overridden
+from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature
+from pytorch_lightning.utilities.types import STEP_OUTPUT
+from pytorch_lightning.utilities.warnings import WarningCache
+
+
+class TrainingEpochLoop(loops.Loop):
+    """ Runs over all batches in a dataloader (one epoch). """
+
+    def __init__(self, min_steps: int, max_steps: int):
+        super().__init__()
+        self.min_steps: int = min_steps
+        self.max_steps: int = max_steps
+        self.global_step: int = 0
+        # the total batch index across all epochs
+        self.total_batch_idx: int = 0
+        # the current batch index in the loop that runs over the dataloader(s)
+        self.iteration_count: int = 0
+        # the current split index when the batch gets split into chunks in truncated backprop through time
+        self.split_idx: Optional[int] = None
+        # the number of batches seen this run, updates immediately after batch_loop.run()
+        self.batches_seen: int = 0
+        self.is_last_batch: Optional[bool] = None
+
+        self.batch_loop = TrainingBatchLoop()
+        self.val_loop = loops.EvaluationLoop()
+
+        self._results = ResultCollection(training=True)
+        self._dataloader_idx: Optional[int] = None
+        self._warning_cache: WarningCache = WarningCache()
+        self._epoch_output: Optional[List[List[STEP_OUTPUT]]] = None
+
+    @property
+    def batch_idx(self) -> int:
+        """Returns the current batch index (within this epoch)"""
+        return self.iteration_count
+
+    @property
+    def done(self) -> bool:
+        """Returns whether the training should be stopped.
+        This is the case when the maximum number of steps is reached, the last batch
+        was processed, or the trainer signals to stop (e.g. by early stopping).
+        """
+        max_steps_reached = self.max_steps is not None and self.global_step >= self.max_steps
+        return max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(self.is_last_batch)
+
+    def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
+        """Connects the loop with all necessary parts like trainer and accelerators"""
+        super().connect(trainer, *args, **kwargs)
+        self.batch_loop.connect(trainer)
+        self.val_loop.connect(trainer)
+
+    def reset(self) -> None:
+        """Resets the internal state of the loop for a new run"""
+        self.iteration_count = 0
+        self.batches_seen = 0
+        self.is_last_batch = False
+        self._dataloader_idx = 0
+
+        # track epoch output
+        self._epoch_output = [[] for _ in range(self.batch_loop.num_active_optimizers(self.total_batch_idx))]
+
+    def on_run_start(self, *args: Any, **kwargs: Any) -> None:
+        # hook
+        self.trainer.logger_connector.on_epoch_start()
+        self.trainer.call_hook("on_epoch_start")
+        self.trainer.call_hook("on_train_epoch_start")
+
+    def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None:
+        """Runs a single training batch.
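+
+        This runs the batch loop for one batch, updates the step-interval (non-plateau)
+        LR schedulers, fires the batch-end hooks and logs the step metrics.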
+
+        Args:
+            dataloader_iter: the iterator over the dataloader producing the new batch
+
+        Raises:
+            StopIteration: When the epoch is canceled by the user returning -1
+        """
+        _, (batch, is_last) = next(dataloader_iter)
+        self.is_last_batch = is_last
+
+        # ------------------------------------
+        # TRAINING_STEP + TRAINING_STEP_END
+        # ------------------------------------
+        with self.trainer.profiler.profile("run_training_batch"):
+            batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx)
+            self.batches_seen += 1
+
+        # when returning -1 from training_step, we end the epoch early
+        if batch_output.signal == -1:
+            raise StopIteration
+
+        # update non-plateau LR schedulers
+        # update epoch-interval ones only when we are at the end of the training epoch
+        self.update_lr_schedulers('step', update_plateau_schedulers=False)
+        if self._num_training_batches_reached(is_last):
+            self.update_lr_schedulers('epoch', update_plateau_schedulers=False)
+
+        batch_end_outputs = [opt_idx_out for opt_idx_out in batch_output.training_step_output if len(opt_idx_out)]
+        processed_batch_end_outputs = self._prepare_outputs(batch_end_outputs, batch_mode=True)
+
+        # hook
+        self.trainer.call_hook(
+            'on_train_batch_end', processed_batch_end_outputs, batch, self.iteration_count, self._dataloader_idx
+        )
+        self.trainer.call_hook('on_batch_end')
+        self.trainer.logger_connector.on_batch_end()
+
+        # figure out what to track for epoch end
+        self._track_epoch_end_reduce_metrics(self._epoch_output, batch_end_outputs)
+
+        # -----------------------------------------
+        # SAVE METRICS TO LOGGERS AND PROGRESS_BAR
+        # -----------------------------------------
+        self.trainer.logger_connector.update_train_step_metrics()
+
+    def on_advance_end(self):
+        """Runs validation and checkpointing if necessary.
+
+        Raises:
+            StopIteration: when :attr:`done` evaluates to ``True``, finishing this epoch
+        """
+        # -----------------------------------------
+        # VALIDATE IF NEEDED + CHECKPOINT CALLBACK
+        # -----------------------------------------
+        should_check_val = self._should_check_val_fx(self.iteration_count, self.is_last_batch)
+        if should_check_val:
+            self.trainer.validating = True
+            self._run_validation()
+            self.trainer.training = True
+
+        # -----------------------------------------
+        # SAVE LOGGERS (ie: Tensorboard, etc...)
+        # -----------------------------------------
+        self._save_loggers_on_train_batch_end()
+
+        # update plateau LR scheduler after metrics are logged
+        self.update_lr_schedulers('step', update_plateau_schedulers=True)
+
+        self.total_batch_idx += 1
+
+        # progress global step according to grads progress
+        self._increment_accumulated_grad_global_step()
+
+        if self.done:
+            raise StopIteration
+
+    def on_run_end(self) -> List[List[STEP_OUTPUT]]:
+        """Calls the ``on_epoch_end`` hook.
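+
+        Also runs ``training_epoch_end`` on the LightningModule (when overridden) with
+        the processed epoch outputs.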
+
+        Returns:
+            The output of each training step for each optimizer
+
+        Raises:
+            MisconfigurationException: if ``training_epoch_end`` does not return ``None``
+        """
+        if self.batches_seen == 0:
+            # dataloader/iterator did not produce a batch
+            return
+
+        # inform logger the batch loop has finished
+        self.trainer.logger_connector.epoch_end_reached()
+
+        # prepare epoch output
+        processed_outputs = self._prepare_outputs(self._epoch_output, batch_mode=False)
+
+        # get the model and call model.training_epoch_end
+        model = self.trainer.lightning_module
+
+        if is_overridden('training_epoch_end', model):
+            # run training_epoch_end
+            # refresh the result for custom logging at the epoch level
+            model._current_fx_name = 'training_epoch_end'
+
+            # lightningmodule hook
+            training_epoch_end_output = model.training_epoch_end(processed_outputs)
+
+            if training_epoch_end_output is not None:
+                raise MisconfigurationException(
+                    'training_epoch_end expects a return of None. '
+                    'HINT: remove the return statement in training_epoch_end'
+                )
+
+        # call train epoch end hooks
+        self._on_train_epoch_end_hook(processed_outputs)
+        self.trainer.call_hook('on_epoch_end')
+        self.trainer.logger_connector.on_epoch_end()
+
+        epoch_output = self._epoch_output
+        # free memory
+        self._epoch_output = None
+        return epoch_output
+
+    def teardown(self) -> None:
+        self._results.cpu()
+        self.batch_loop.teardown()
+        self.val_loop.teardown()
+
+    def _run_validation(self):
+        # reload dataloaders
+        self.val_loop.reload_evaluation_dataloaders()
+
+        with torch.no_grad():
+            self.val_loop.run()
+
+    def _on_train_epoch_end_hook(self, processed_epoch_output: List[List[STEP_OUTPUT]]) -> None:
+        """Runs the ``on_train_epoch_end`` hook."""
+        # We cannot rely on Trainer.call_hook because the signatures might be different across
+        # lightning module and callback
+        # As a result, we need to inspect if the module accepts `outputs` in `on_train_epoch_end`
+
+        # This implementation is copied from Trainer.call_hook
+        hook_name = "on_train_epoch_end"
+        prev_fx_name = self.trainer.lightning_module._current_fx_name
+        self.trainer.lightning_module._current_fx_name = hook_name
+
+        # always profile hooks
+        with self.trainer.profiler.profile(hook_name):
+
+            # first call trainer hook
+            if hasattr(self.trainer, hook_name):
+                trainer_hook = getattr(self.trainer, hook_name)
+                trainer_hook(processed_epoch_output)
+
+            # next, call the hook in the LightningModule
+            model_ref = self.trainer.lightning_module
+            if is_overridden(hook_name, model_ref):
+                hook_fx = getattr(model_ref, hook_name)
+                if is_param_in_hook_signature(hook_fx, "outputs"):
+                    self._warning_cache.deprecation(
+                        "The signature of `ModelHooks.on_train_epoch_end` has changed in v1.3."
+                        " `outputs` parameter has been deprecated."
+                        " Support for the old signature will be removed in v1.5",
+                    )
+                    model_ref.on_train_epoch_end(processed_epoch_output)
+                else:
+                    model_ref.on_train_epoch_end()
+
+            # call the accelerator hook
+            if hasattr(self.trainer.accelerator, hook_name):
+                accelerator_hook = getattr(self.trainer.accelerator, hook_name)
+                accelerator_hook()
+
+        # restore current_fx in case of a nested context
+        self.trainer.lightning_module._current_fx_name = prev_fx_name
+
+    def _num_training_batches_reached(self, is_last_batch: bool = False) -> bool:
+        """Checks if we are in the last batch or if there are more batches to follow."""
+
+        # TODO: Can we combine this with training_batch_loop's arg that does a similar check?
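+        # note: `num_training_batches` is `float('inf')` for iterable datasets without a
+        # length, in which case only `is_last_batch` can end the epoch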
+ return self.batches_seen == self.trainer.num_training_batches or is_last_batch + + def _track_epoch_end_reduce_metrics( + self, epoch_output: List[List[STEP_OUTPUT]], batch_end_outputs: STEP_OUTPUT + ) -> None: + """Adds the batch outputs to the epoch outputs and prepares reduction""" + hook_overridden = self._should_add_batch_output_to_epoch_output() + if not hook_overridden: + return + + # track the outputs to reduce at the end of the epoch + for opt_idx, opt_outputs in enumerate(batch_end_outputs): + # with 1 step (no tbptt) don't use a sequence at epoch end + if ( + isinstance(opt_outputs, list) and len(opt_outputs) == 1 + and not isinstance(opt_outputs[0], ResultCollection) + ): + opt_outputs = opt_outputs[0] + + epoch_output[opt_idx].append(opt_outputs) + + def _should_add_batch_output_to_epoch_output(self) -> bool: + """ + We add to the epoch outputs if + 1. The model defines training_epoch_end OR + 2. The model overrides on_train_epoch_end which has `outputs` in the signature + """ + # TODO: in v1.5 this only needs to check if training_epoch_end is overridden + lightning_module = self.trainer.lightning_module + if is_overridden("training_epoch_end", lightning_module): + return True + + if is_overridden("on_train_epoch_end", lightning_module): + model_hook_fx = getattr(lightning_module, "on_train_epoch_end") + if is_param_in_hook_signature(model_hook_fx, "outputs"): + return True + + return False + + @staticmethod + def _prepare_outputs( + outputs: List[List[List['ResultCollection']]], + batch_mode: bool, + ) -> Union[List[List[List[Dict]]], List[List[Dict]], List[Dict], Dict]: + """ + Extract required information from batch or epoch end results. + + Args: + outputs: A 3-dimensional list of ``ResultCollection`` objects with dimensions: + ``[optimizer outs][batch outs][tbptt steps]``. + + batch_mode: If True, ignore the batch output dimension. + + Returns: + The cleaned outputs with ``ResultCollection`` objects converted to dictionaries. + All list dimensions of size one will be collapsed. 
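+
+        Example (illustrative only; assumes a single optimizer and a single tbptt step,
+        where ``result_batch_0`` and ``result_batch_1`` are hypothetical
+        ``ResultCollection`` objects)::
+
+            # [optimizer outs][batch outs][tbptt steps]
+            outputs = [[[result_batch_0], [result_batch_1]]]
+            # epoch mode collapses the size-one tbptt and optimizer dimensions:
+            # -> [{'loss': ..., **extras_0}, {'loss': ..., **extras_1}]
+            TrainingEpochLoop._prepare_outputs(outputs, batch_mode=False)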
+ """ + processed_outputs = [] + for opt_outputs in outputs: + # handle an edge case where an optimizer output is the empty list + if len(opt_outputs) == 0: + continue + + processed_batch_outputs = [] + + if batch_mode: + opt_outputs = [opt_outputs] + + for batch_outputs in opt_outputs: + processed_tbptt_outputs = [] + + if isinstance(batch_outputs, ResultCollection): + batch_outputs = [batch_outputs] + + for tbptt_output in batch_outputs: + out = tbptt_output.extra + if tbptt_output.minimize is not None: + out['loss'] = tbptt_output.minimize.detach() + processed_tbptt_outputs.append(out) + + # if there was only one tbptt step then we can collapse that dimension + if len(processed_tbptt_outputs) == 1: + processed_tbptt_outputs = processed_tbptt_outputs[0] + processed_batch_outputs.append(processed_tbptt_outputs) + + # batch_outputs should be just one dict (or a list of dicts if using tbptt) per optimizer + if batch_mode: + processed_batch_outputs = processed_batch_outputs[0] + processed_outputs.append(processed_batch_outputs) + + # if there is only one optimiser then we collapse that dimension + if len(processed_outputs) == 1: + processed_outputs = processed_outputs[0] + return processed_outputs + + def update_lr_schedulers(self, interval: str, update_plateau_schedulers: bool) -> None: + """updates the lr schedulers based on the given interval""" + if interval == "step" and self.batch_loop.should_accumulate(): + return + self.trainer.optimizer_connector.update_learning_rates( + interval=interval, + update_plateau_schedulers=update_plateau_schedulers, + opt_indices=[opt_idx for opt_idx, _ in self.batch_loop.get_active_optimizers(self.total_batch_idx)], + ) + + def _increment_accumulated_grad_global_step(self) -> None: + """increments global step""" + num_accumulated_batches_reached = self.batch_loop._accumulated_batches_reached() + num_training_batches_reached = self._num_training_batches_reached() + + # progress global step according to grads progress + if num_accumulated_batches_reached or num_training_batches_reached: + self.global_step = self.trainer.accelerator.update_global_step( + self.total_batch_idx, self.trainer.global_step + ) + + def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: + """ Decide if we should run validation. 
""" + if not self.trainer.enable_validation: + return False + + is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch == 0 + if not is_val_check_epoch: + return False + + # val_check_batch is inf for iterable datasets with no length defined + is_infinite_dataset = self.trainer.val_check_batch == float('inf') + if is_last_batch and is_infinite_dataset: + return True + + if self.trainer.should_stop: + return True + + # TODO(@awaelchli): let training/eval loop handle logic around limit_*_batches and val_check_batch + is_val_check_batch = is_last_batch + if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset: + is_val_check_batch = (batch_idx + 1) % self.trainer.limit_train_batches == 0 + elif self.trainer.val_check_batch != float('inf'): + is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0 + return is_val_check_batch + + def _save_loggers_on_train_batch_end(self) -> None: + """Flushes loggers to disk""" + # when loggers should save to disk + should_flush_logs = self.trainer.logger_connector.should_flush_logs + if should_flush_logs and self.trainer.is_global_zero and self.trainer.logger is not None: + self.trainer.logger.save() + + def state_dict(self) -> Dict: + return {"batch_loop": self.batch_loop.state_dict(), "val_loop": self.val_loop.state_dict()} + + def load_state_dict(self, state_dict: Dict) -> None: + self.batch_loop.load_state_dict(state_dict["batch_loop"]) + self.val_loop.load_state_dict(state_dict["val_loop"]) diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py new file mode 100644 index 0000000000000..c7207f2cf833f --- /dev/null +++ b/pytorch_lightning/loops/fit_loop.py @@ -0,0 +1,265 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from contextlib import suppress +from typing import Any, Dict, Optional + +import pytorch_lightning as pl +from pytorch_lightning.loops import Loop +from pytorch_lightning.loops.epoch import TrainingEpochLoop +from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection +from pytorch_lightning.trainer.supporters import TensorRunningAccum + +log = logging.getLogger(__name__) + + +class FitLoop(Loop): + """This Loop iterates over the epochs to run the training + + Args: + min_epochs: The minimum number of epochs + max_epochs: The maximum number of epochs + min_steps: The minimum number of steps + max_steps: The maximum number of epoch + + .. note:: + If neither the minimum epochs nor steps are specified the minimum number of epochs is set to 1 + and if neither the maximum steps nor epochs are specified, the maximum epochs are set to 1000. 
+    """
+
+    def __init__(
+        self,
+        min_epochs: Optional[int] = None,
+        max_epochs: Optional[int] = None,
+        min_steps: Optional[int] = None,
+        max_steps: Optional[int] = None
+    ):
+        super().__init__()
+        self.max_epochs = 1000 if (max_epochs is None and max_steps is None) else max_epochs
+        self.min_epochs = 1 if (min_epochs is None and min_steps is None) else min_epochs
+
+        self.epoch_loop = TrainingEpochLoop(min_steps, max_steps)
+
+    @property
+    def current_epoch(self) -> int:
+        """Returns the current epoch"""
+        return self.iteration_count
+
+    @current_epoch.setter
+    def current_epoch(self, value: int) -> None:
+        """Setter for the current epoch"""
+        self.iteration_count = value
+
+    @property
+    def global_step(self) -> int:
+        """Returns the global step"""
+        return self.epoch_loop.global_step
+
+    @global_step.setter
+    def global_step(self, value: int) -> None:
+        """Sets the global step (forwards to epoch_loop)"""
+        self.epoch_loop.global_step = value
+
+    @property
+    def total_batch_idx(self) -> int:
+        """Returns the total number of batches already run (across all epochs)"""
+        return self.epoch_loop.total_batch_idx
+
+    @property
+    def batch_idx(self) -> int:
+        """Returns the number of batches already run within this epoch"""
+        return self.epoch_loop.iteration_count
+
+    @property
+    def split_idx(self) -> int:
+        """Returns the index of the current batch split (within the current batch) for truncated backprop through time"""
+        return self.epoch_loop.split_idx
+
+    @property
+    def min_steps(self) -> int:
+        # TODO(@justusschock): Why aren't we using the attribute in this class?
+        """Returns the minimum number of steps to run"""
+        return self.epoch_loop.min_steps
+
+    @min_steps.setter
+    def min_steps(self, value: int) -> None:
+        """Sets the minimum number of steps (forwards to epoch_loop)"""
+        # TODO(@awaelchli): This setter is required by debugging connector (fast dev run), should be avoided
+        self.epoch_loop.min_steps = value
+
+    @property
+    def max_steps(self) -> int:
+        """Returns the maximum number of steps to run"""
+        return self.epoch_loop.max_steps
+
+    @max_steps.setter
+    def max_steps(self, value: int) -> None:
+        """Sets the maximum number of steps (forwards to epoch_loop)"""
+        # TODO(@awaelchli): This setter is required by debugging connector (fast dev run), should be avoided
+        self.epoch_loop.max_steps = value
+
+    @property
+    def running_loss(self) -> TensorRunningAccum:
+        """Returns the running loss"""
+        return self.epoch_loop.batch_loop.running_loss
+
+    @property
+    def _skip_backward(self) -> bool:
+        """ Determines whether the loop will skip backward during automatic optimization. """
+        return self.epoch_loop.batch_loop._skip_backward
+
+    @_skip_backward.setter
+    def _skip_backward(self, value: bool) -> None:
+        """ Determines whether the loop will skip backward during automatic optimization. """
+        self.epoch_loop.batch_loop._skip_backward = value
+
+    @property
+    def _results(self) -> ResultCollection:
+        if self.trainer.training:
+            return self.epoch_loop._results
+        if self.trainer.validating:
+            return self.epoch_loop.val_loop._results
+        raise RuntimeError("`FitLoop._results` property isn't defined. Accessed outside of scope")
+
+    @property
+    def done(self) -> bool:
+        """Evaluates when to leave the loop.
+
+        Returns True if trainer.should_stop was set (e.g. by early stopping)
+        or if the maximum number of steps or epochs is reached.
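+
+        Note that ``trainer.should_stop`` alone does not end the loop: early stopping is
+        only honored once ``min_steps`` / ``min_epochs`` have also been reached.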
+        """
+        # TODO(@awaelchli): move tracked steps inside the training loop and move part of these conditions inside the training loop
+        stop_steps = self.max_steps is not None and self.global_step >= self.max_steps
+        stop_epochs = self.max_epochs is not None and self.current_epoch >= self.max_epochs
+
+        should_stop = False
+        if self.trainer.should_stop:
+            # early stopping
+            met_min_epochs = self.current_epoch >= self.min_epochs if self.min_epochs else True
+            met_min_steps = self.global_step >= self.min_steps if self.min_steps else True
+            if met_min_epochs and met_min_steps:
+                should_stop = True
+            else:
+                log.info(
+                    'Trainer was signaled to stop but required minimum epochs'
+                    f' ({self.min_epochs}) or minimum steps ({self.min_steps}) have'
+                    ' not been met. Training will continue...'
+                )
+        self.trainer.should_stop = should_stop
+
+        return stop_steps or should_stop or stop_epochs
+
+    @property
+    def skip(self) -> bool:
+        """Whether we should skip the training and immediately return from the call to :meth:`run`."""
+        return self.done or self.trainer.num_training_batches == 0
+
+    def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
+        """Connects the loop with necessary arguments like the trainer"""
+        super().connect(trainer, *args, **kwargs)
+        self.epoch_loop.connect(trainer)
+
+    def reset(self) -> None:
+        """Resets the internal state of this loop"""
+
+    def on_run_start(self) -> None:
+        """Calls the ``on_train_start`` hook."""
+        self._results.to(device=self.trainer.lightning_module.device)
+        self.trainer.call_hook("on_train_start")
+
+    def on_advance_start(self) -> None:
+        """Prepares the dataloader for training and runs the per-epoch setup (sampler seed, accumulation scheduler)"""
+        model = self.trainer.lightning_module
+
+        # reset train dataloader
+        if self.current_epoch != 0 and self.trainer.reload_dataloaders_every_epoch:
+            self.trainer.reset_train_dataloader(model)
+
+        # TODO: specify the possible exception
+        with suppress(Exception):
+            # set seed for distributed sampler (enables shuffling for each epoch)
+            self.trainer.train_dataloader.sampler.set_epoch(self.current_epoch)
+
+        # change gradient accumulation according to the accumulation_scheduler
+        self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module)
+
+        # stores accumulated grad fractions per batch
+        self.epoch_loop.batch_loop.accumulated_loss = TensorRunningAccum(
+            window_length=self.trainer.accumulate_grad_batches
+        )
+
+    def advance(self) -> None:
+        """Runs one whole epoch."""
+        train_dataloader = self.trainer.accelerator.process_dataloader(self.trainer.train_dataloader)
+        train_dataloader = self.trainer.data_connector.get_profiled_train_dataloader(train_dataloader)
+
+        with self.trainer.profiler.profile("run_training_epoch"):
+            # run train epoch
+            epoch_output = self.epoch_loop.run(train_dataloader)
+
+            if epoch_output is None:
+                return
+
+            # the global step is manually decreased here due to backwards compatibility with existing loggers
+            # as they expect that the same step is used when logging epoch end metrics even when the batch loop has
+            # finished. this means the attribute does not exactly track the number of optimizer steps applied.
+            # TODO(@carmocca): deprecate and rename so users don't get confused
+            self.global_step -= 1
+            # log epoch metrics
+            self.trainer.logger_connector.update_train_epoch_metrics()
+            self.global_step += 1
+
+    def on_advance_end(self) -> None:
+        """Updates the LR schedulers and does some internal bookkeeping"""
+        if self.epoch_loop.batches_seen == 0:
+            return
+
+        self.epoch_loop.update_lr_schedulers('epoch', update_plateau_schedulers=True)
+
+    def on_run_end(self) -> None:
+        """Calls the ``on_train_end`` hook"""
+        # NOTE: the iteration_count/current_epoch is already incremented
+        # Lightning today does not increment the current epoch at the last epoch run in Trainer.fit
+        # To simulate that current behavior, we decrement here.
+        # TODO: must be fixed by https://github.com/PyTorchLightning/pytorch-lightning/issues/5007
+        self.current_epoch -= 1
+
+        # hook
+        self.trainer.call_hook("on_train_end")
+
+        # TODO: flushing TensorBoard hangs on TPU with 8 cores. This might apply to all loggers.
+        # It might be related to XLA tensors blocking when moved to the CPU.
+        # kill loggers
+        if self.trainer.logger is not None:
+            self.trainer.logger.finalize("success")
+
+        # summarize profile results
+        self.trainer.profiler.describe()
+
+        # give accelerators a chance to finish
+        self.trainer.accelerator.on_train_end()
+
+    def should_accumulate(self) -> bool:
+        """Whether the gradients should be accumulated"""
+        return self.epoch_loop.batch_loop.should_accumulate()
+
+    def state_dict(self) -> Dict:
+        return {"epoch_loop": self.epoch_loop.state_dict()}
+
+    def load_state_dict(self, state_dict: Dict) -> None:
+        self.epoch_loop.load_state_dict(state_dict["epoch_loop"])
+
+    def teardown(self) -> None:
+        self.epoch_loop.teardown()
diff --git a/pytorch_lightning/metrics/__init__.py b/pytorch_lightning/metrics/__init__.py
index 9b27fdf0cb253..da682e4840489 100644
--- a/pytorch_lightning/metrics/__init__.py
+++ b/pytorch_lightning/metrics/__init__.py
@@ -38,9 +38,3 @@
     R2Score,
     SSIM,
 )
-from pytorch_lightning.utilities import rank_zero_deprecation
-
-rank_zero_deprecation(
-    "`pytorch_lightning.metrics.*` module has been renamed to `torchmetrics.*` and split off to its own package"
-    " (https://github.com/PyTorchLightning/metrics) since v1.3 and will be removed in v1.5"
-)
diff --git a/pytorch_lightning/metrics/classification/accuracy.py b/pytorch_lightning/metrics/classification/accuracy.py
index 53a16a2a270d7..cf99bc5940a8f 100644
--- a/pytorch_lightning/metrics/classification/accuracy.py
+++ b/pytorch_lightning/metrics/classification/accuracy.py
@@ -15,7 +15,7 @@
 
 from torchmetrics import Accuracy as _Accuracy
 
-from pytorch_lightning.metrics.utils import deprecated_metrics
+from pytorch_lightning.metrics.utils import deprecated_metrics, void
 
 
 class Accuracy(_Accuracy):
@@ -37,4 +37,4 @@ def __init__(
 
         .. deprecated::
             Use :class:`~torchmetrics.Accuracy`. Will be removed in v1.5.0.
""" - _ = threshold, top_k, subset_accuracy, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(threshold, top_k, subset_accuracy, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/classification/auc.py b/pytorch_lightning/metrics/classification/auc.py index 917810d57b5dd..42813620758a5 100644 --- a/pytorch_lightning/metrics/classification/auc.py +++ b/pytorch_lightning/metrics/classification/auc.py @@ -15,7 +15,7 @@ from torchmetrics import AUC as _AUC -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class AUC(_AUC): @@ -35,4 +35,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.AUC`. Will be removed in v1.5.0. """ - _ = reorder, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(reorder, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/classification/auroc.py b/pytorch_lightning/metrics/classification/auroc.py index 78fb722ddf610..d1e797d956483 100644 --- a/pytorch_lightning/metrics/classification/auroc.py +++ b/pytorch_lightning/metrics/classification/auroc.py @@ -15,7 +15,7 @@ from torchmetrics import AUROC as _AUROC -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class AUROC(_AUROC): @@ -38,4 +38,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.AUROC`. Will be removed in v1.5.0. """ - _ = num_classes, pos_label, average, max_fpr, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(num_classes, pos_label, average, max_fpr, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/classification/average_precision.py b/pytorch_lightning/metrics/classification/average_precision.py index d7e0d3d387d39..fdb1b26178304 100644 --- a/pytorch_lightning/metrics/classification/average_precision.py +++ b/pytorch_lightning/metrics/classification/average_precision.py @@ -15,7 +15,7 @@ from torchmetrics import AveragePrecision as _AveragePrecision -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class AveragePrecision(_AveragePrecision): @@ -35,4 +35,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.AveragePrecision`. Will be removed in v1.5.0. """ - _ = num_classes, pos_label, compute_on_step, dist_sync_on_step, process_group + void(num_classes, pos_label, compute_on_step, dist_sync_on_step, process_group) diff --git a/pytorch_lightning/metrics/classification/confusion_matrix.py b/pytorch_lightning/metrics/classification/confusion_matrix.py index 7a4673b9a8495..e77df43e63524 100644 --- a/pytorch_lightning/metrics/classification/confusion_matrix.py +++ b/pytorch_lightning/metrics/classification/confusion_matrix.py @@ -15,7 +15,7 @@ from torchmetrics import ConfusionMatrix as _ConfusionMatrix -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class ConfusionMatrix(_ConfusionMatrix): @@ -36,4 +36,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.ConfusionMatrix`. Will be removed in v1.5.0. 
""" - _ = num_classes, normalize, threshold, compute_on_step, dist_sync_on_step, process_group + void(num_classes, normalize, threshold, compute_on_step, dist_sync_on_step, process_group) diff --git a/pytorch_lightning/metrics/classification/f_beta.py b/pytorch_lightning/metrics/classification/f_beta.py index 57c5d67c6a5f1..58a50f163b08a 100644 --- a/pytorch_lightning/metrics/classification/f_beta.py +++ b/pytorch_lightning/metrics/classification/f_beta.py @@ -16,12 +16,12 @@ from torchmetrics import F1 as _F1 from torchmetrics import FBeta as _FBeta -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class FBeta(_FBeta): - @deprecated_metrics(target=_FBeta) + @deprecated_metrics(target=_FBeta, args_mapping={"multilabel": None}) def __init__( self, num_classes: int, @@ -44,7 +44,7 @@ def __init__( class F1(_F1): - @deprecated_metrics(target=_F1) + @deprecated_metrics(target=_F1, args_mapping={"multilabel": None}) def __init__( self, num_classes: int, @@ -61,4 +61,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.F1`. Will be removed in v1.5.0. """ - _ = num_classes, threshold, average, multilabel, compute_on_step, dist_sync_on_step, process_group + void(num_classes, threshold, average, multilabel, compute_on_step, dist_sync_on_step, process_group) diff --git a/pytorch_lightning/metrics/classification/hamming_distance.py b/pytorch_lightning/metrics/classification/hamming_distance.py index c06755d6c6c39..134bc33cf1267 100644 --- a/pytorch_lightning/metrics/classification/hamming_distance.py +++ b/pytorch_lightning/metrics/classification/hamming_distance.py @@ -15,7 +15,7 @@ from torchmetrics import HammingDistance as _HammingDistance -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class HammingDistance(_HammingDistance): @@ -35,4 +35,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.HammingDistance`. Will be removed in v1.5.0. """ - _ = threshold, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(threshold, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/classification/iou.py b/pytorch_lightning/metrics/classification/iou.py index 5fe8e4f11401d..00168b1924821 100644 --- a/pytorch_lightning/metrics/classification/iou.py +++ b/pytorch_lightning/metrics/classification/iou.py @@ -15,7 +15,7 @@ from torchmetrics import IoU as _IoU -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class IoU(_IoU): @@ -38,5 +38,7 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.IoU`. Will be removed in v1.5.0. 
""" - _ = num_classes, ignore_index, absent_score, threshold, reduction, \ - compute_on_step, dist_sync_on_step, process_group + void( + num_classes, ignore_index, absent_score, threshold, reduction, compute_on_step, dist_sync_on_step, + process_group + ) diff --git a/pytorch_lightning/metrics/classification/precision_recall.py b/pytorch_lightning/metrics/classification/precision_recall.py index b40c5a0c627e0..6507f6d071000 100644 --- a/pytorch_lightning/metrics/classification/precision_recall.py +++ b/pytorch_lightning/metrics/classification/precision_recall.py @@ -16,12 +16,12 @@ from torchmetrics import Precision as _Precision from torchmetrics import Recall as _Recall -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class Precision(_Precision): - @deprecated_metrics(target=_Precision) + @deprecated_metrics(target=_Precision, args_mapping={"multilabel": None, "is_multiclass": None}) def __init__( self, num_classes: Optional[int] = None, @@ -49,7 +49,7 @@ def __init__( class Recall(_Recall): - @deprecated_metrics(target=_Recall) + @deprecated_metrics(target=_Recall, args_mapping={"multilabel": None, "is_multiclass": None}) def __init__( self, num_classes: Optional[int] = None, @@ -71,3 +71,7 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.Recall`. Will be removed in v1.5.0. """ + void( + num_classes, threshold, average, multilabel, mdmc_average, ignore_index, top_k, is_multiclass, + compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + ) diff --git a/pytorch_lightning/metrics/classification/precision_recall_curve.py b/pytorch_lightning/metrics/classification/precision_recall_curve.py index a1a7f0dc665cd..c51650663073c 100644 --- a/pytorch_lightning/metrics/classification/precision_recall_curve.py +++ b/pytorch_lightning/metrics/classification/precision_recall_curve.py @@ -15,7 +15,7 @@ from torchmetrics import PrecisionRecallCurve as _PrecisionRecallCurve -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class PrecisionRecallCurve(_PrecisionRecallCurve): @@ -35,4 +35,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.PrecisionRecallCurve`. Will be removed in v1.5.0. """ - _ = num_classes, pos_label, compute_on_step, dist_sync_on_step, process_group + void(num_classes, pos_label, compute_on_step, dist_sync_on_step, process_group) diff --git a/pytorch_lightning/metrics/classification/roc.py b/pytorch_lightning/metrics/classification/roc.py index 55a2782e0408c..824d2a22c3951 100644 --- a/pytorch_lightning/metrics/classification/roc.py +++ b/pytorch_lightning/metrics/classification/roc.py @@ -15,7 +15,7 @@ from torchmetrics import ROC as _ROC -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class ROC(_ROC): @@ -35,4 +35,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.ROC`. Will be removed in v1.5.0. 
""" - _ = num_classes, pos_label, compute_on_step, dist_sync_on_step, process_group + void(num_classes, pos_label, compute_on_step, dist_sync_on_step, process_group) diff --git a/pytorch_lightning/metrics/classification/stat_scores.py b/pytorch_lightning/metrics/classification/stat_scores.py index 94fde40a392ba..806ee73e176dc 100644 --- a/pytorch_lightning/metrics/classification/stat_scores.py +++ b/pytorch_lightning/metrics/classification/stat_scores.py @@ -15,12 +15,12 @@ from torchmetrics import StatScores as _StatScores -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class StatScores(_StatScores): - @deprecated_metrics(target=_StatScores) + @deprecated_metrics(target=_StatScores, args_mapping={"is_multiclass": None}) def __init__( self, threshold: float = 0.5, @@ -41,5 +41,7 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.StatScores`. Will be removed in v1.5.0. """ - _ = threshold, top_k, reduce, num_classes, ignore_index, mdmc_reduce, is_multiclass, compute_on_step, \ + void( + threshold, top_k, reduce, num_classes, ignore_index, mdmc_reduce, is_multiclass, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + ) diff --git a/pytorch_lightning/metrics/compositional.py b/pytorch_lightning/metrics/compositional.py index 56bb1912e48e6..01189e129d69d 100644 --- a/pytorch_lightning/metrics/compositional.py +++ b/pytorch_lightning/metrics/compositional.py @@ -17,7 +17,7 @@ from torchmetrics import Metric from torchmetrics.metric import CompositionalMetric as _CompositionalMetric -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class CompositionalMetric(_CompositionalMetric): @@ -33,3 +33,4 @@ def __init__( .. deprecated:: Use :class:`torchmetrics.metric.CompositionalMetric`. Will be removed in v1.5.0. """ + void(operator, metric_a, metric_b) diff --git a/pytorch_lightning/metrics/functional/accuracy.py b/pytorch_lightning/metrics/functional/accuracy.py index 69fa9d75590e0..0dddcb37676e0 100644 --- a/pytorch_lightning/metrics/functional/accuracy.py +++ b/pytorch_lightning/metrics/functional/accuracy.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import accuracy as _accuracy -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_accuracy) @@ -31,3 +31,4 @@ def accuracy( .. deprecated:: Use :func:`torchmetrics.functional.accuracy`. Will be removed in v1.5.0. """ + return void(preds, target, threshold, top_k, subset_accuracy) diff --git a/pytorch_lightning/metrics/functional/auc.py b/pytorch_lightning/metrics/functional/auc.py index 7cc6aa458d397..f8b43e47d6eca 100644 --- a/pytorch_lightning/metrics/functional/auc.py +++ b/pytorch_lightning/metrics/functional/auc.py @@ -14,7 +14,7 @@ import torch from torchmetrics.functional import auc as _auc -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_auc) @@ -23,3 +23,4 @@ def auc(x: torch.Tensor, y: torch.Tensor, reorder: bool = False) -> torch.Tensor .. deprecated:: Use :func:`torchmetrics.functional.auc`. Will be removed in v1.5.0. 
""" + return void(x, y, reorder) diff --git a/pytorch_lightning/metrics/functional/auroc.py b/pytorch_lightning/metrics/functional/auroc.py index c49aa1a8fdc48..4815a2e88b410 100644 --- a/pytorch_lightning/metrics/functional/auroc.py +++ b/pytorch_lightning/metrics/functional/auroc.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import auroc as _auroc -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_auroc) @@ -33,3 +33,4 @@ def auroc( .. deprecated:: Use :func:`torchmetrics.functional.auroc`. Will be removed in v1.5.0. """ + return void(preds, target, num_classes, pos_label, average, max_fpr, sample_weights) diff --git a/pytorch_lightning/metrics/functional/average_precision.py b/pytorch_lightning/metrics/functional/average_precision.py index 017b34739a0f4..79712935ab70d 100644 --- a/pytorch_lightning/metrics/functional/average_precision.py +++ b/pytorch_lightning/metrics/functional/average_precision.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import average_precision as _average_precision -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_average_precision) @@ -31,3 +31,4 @@ def average_precision( .. deprecated:: Use :func:`torchmetrics.functional.average_precision`. Will be removed in v1.5.0. """ + return void(preds, target, num_classes, pos_label, sample_weights) diff --git a/pytorch_lightning/metrics/functional/confusion_matrix.py b/pytorch_lightning/metrics/functional/confusion_matrix.py index 038bd8b49b730..2607abc49ec07 100644 --- a/pytorch_lightning/metrics/functional/confusion_matrix.py +++ b/pytorch_lightning/metrics/functional/confusion_matrix.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import confusion_matrix as _confusion_matrix -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_confusion_matrix) @@ -31,3 +31,4 @@ def confusion_matrix( .. deprecated:: Use :func:`torchmetrics.functional.confusion_matrix`. Will be removed in v1.5.0. """ + return void(preds, target, num_classes, normalize, threshold) diff --git a/pytorch_lightning/metrics/functional/explained_variance.py b/pytorch_lightning/metrics/functional/explained_variance.py index 233a0851b8d56..7885c8e8b04a9 100644 --- a/pytorch_lightning/metrics/functional/explained_variance.py +++ b/pytorch_lightning/metrics/functional/explained_variance.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import explained_variance as _explained_variance -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_explained_variance) @@ -29,3 +29,4 @@ def explained_variance( .. deprecated:: Use :func:`torchmetrics.functional.explained_variance`. Will be removed in v1.5.0. 
""" + return void(preds, target, multioutput) diff --git a/pytorch_lightning/metrics/functional/f_beta.py b/pytorch_lightning/metrics/functional/f_beta.py index 1130b700c6b8c..ed3d92e69ff23 100644 --- a/pytorch_lightning/metrics/functional/f_beta.py +++ b/pytorch_lightning/metrics/functional/f_beta.py @@ -17,10 +17,10 @@ from torchmetrics.functional import f1 as _f1 from torchmetrics.functional import fbeta as _fbeta -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void -@deprecated_metrics(target=_fbeta) +@deprecated_metrics(target=_fbeta, args_mapping={"multilabel": None}) def fbeta( preds: torch.Tensor, target: torch.Tensor, @@ -34,9 +34,10 @@ def fbeta( .. deprecated:: Use :func:`torchmetrics.functional.accuracy`. Will be removed in v1.5.0. """ + return void(preds, target, num_classes, beta, threshold, average, multilabel) -@deprecated_metrics(target=_f1) +@deprecated_metrics(target=_f1, args_mapping={"multilabel": None}) def f1( preds: torch.Tensor, target: torch.Tensor, @@ -49,3 +50,4 @@ def f1( .. deprecated:: Use :func:`torchmetrics.functional.f1`. Will be removed in v1.5.0. """ + return void(preds, target, num_classes, threshold, average, multilabel) diff --git a/pytorch_lightning/metrics/functional/hamming_distance.py b/pytorch_lightning/metrics/functional/hamming_distance.py index 6a390e776f111..a501184dc3bbf 100644 --- a/pytorch_lightning/metrics/functional/hamming_distance.py +++ b/pytorch_lightning/metrics/functional/hamming_distance.py @@ -14,7 +14,7 @@ import torch from torchmetrics.functional import hamming_distance as _hamming_distance -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_hamming_distance) @@ -23,3 +23,4 @@ def hamming_distance(preds: torch.Tensor, target: torch.Tensor, threshold: float .. deprecated:: Use :func:`torchmetrics.functional.hamming_distance`. Will be removed in v1.5.0. """ + return void(preds, target, threshold) diff --git a/pytorch_lightning/metrics/functional/image_gradients.py b/pytorch_lightning/metrics/functional/image_gradients.py index e2151c5fc1d93..539dc5ee4c55e 100644 --- a/pytorch_lightning/metrics/functional/image_gradients.py +++ b/pytorch_lightning/metrics/functional/image_gradients.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import image_gradients as _image_gradients -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_image_gradients) @@ -25,3 +25,4 @@ def image_gradients(img: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: .. deprecated:: Use :func:`torchmetrics.functional.image_gradients`. Will be removed in v1.5.0. """ + return void(img) diff --git a/pytorch_lightning/metrics/functional/iou.py b/pytorch_lightning/metrics/functional/iou.py index 5554a5d77a355..d73310e4e8ea9 100644 --- a/pytorch_lightning/metrics/functional/iou.py +++ b/pytorch_lightning/metrics/functional/iou.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import iou as _iou -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void from pytorch_lightning.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_3, _TORCHMETRICS_LOWER_THAN_0_3 @@ -35,3 +35,4 @@ def iou( .. deprecated:: Use :func:`torchmetrics.functional.iou`. Will be removed in v1.5.0. 
""" + return void(pred, target, ignore_index, absent_score, threshold, num_classes, reduction) diff --git a/pytorch_lightning/metrics/functional/mean_absolute_error.py b/pytorch_lightning/metrics/functional/mean_absolute_error.py index 219284d79d623..2ffd340196e81 100644 --- a/pytorch_lightning/metrics/functional/mean_absolute_error.py +++ b/pytorch_lightning/metrics/functional/mean_absolute_error.py @@ -15,7 +15,7 @@ import torch from torchmetrics.functional import mean_absolute_error as _mean_absolute_error -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_mean_absolute_error) @@ -24,3 +24,4 @@ def mean_absolute_error(preds: torch.Tensor, target: torch.Tensor) -> torch.Tens .. deprecated:: Use :func:`torchmetrics.functional.mean_absolute_error`. Will be removed in v1.5.0. """ + return void(preds, target) diff --git a/pytorch_lightning/metrics/functional/mean_relative_error.py b/pytorch_lightning/metrics/functional/mean_relative_error.py index 329fe040ebc7d..4d45b0d3e6141 100644 --- a/pytorch_lightning/metrics/functional/mean_relative_error.py +++ b/pytorch_lightning/metrics/functional/mean_relative_error.py @@ -15,7 +15,7 @@ import torch from torchmetrics.functional.regression.mean_relative_error import mean_relative_error as _mean_relative_error -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_mean_relative_error) @@ -24,3 +24,4 @@ def mean_relative_error(preds: torch.Tensor, target: torch.Tensor) -> torch.Tens .. deprecated:: Use :func:`torchmetrics.functional.regression.mean_relative_error`. Will be removed in v1.5.0. """ + return void(preds, target) diff --git a/pytorch_lightning/metrics/functional/mean_squared_error.py b/pytorch_lightning/metrics/functional/mean_squared_error.py index 5bbc0bb1c6a83..3ff06569f856d 100644 --- a/pytorch_lightning/metrics/functional/mean_squared_error.py +++ b/pytorch_lightning/metrics/functional/mean_squared_error.py @@ -15,7 +15,7 @@ import torch from torchmetrics.functional import mean_squared_error as _mean_squared_error -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_mean_squared_error) @@ -24,3 +24,4 @@ def mean_squared_error(preds: torch.Tensor, target: torch.Tensor) -> torch.Tenso .. deprecated:: Use :func:`torchmetrics.functional.mean_squared_error`. Will be removed in v1.5.0. """ + return void(preds, target) diff --git a/pytorch_lightning/metrics/functional/mean_squared_log_error.py b/pytorch_lightning/metrics/functional/mean_squared_log_error.py index 29786529381d5..b799ce4847f6e 100644 --- a/pytorch_lightning/metrics/functional/mean_squared_log_error.py +++ b/pytorch_lightning/metrics/functional/mean_squared_log_error.py @@ -15,7 +15,7 @@ import torch from torchmetrics.functional import mean_squared_log_error as _mean_squared_log_error -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_mean_squared_log_error) @@ -24,3 +24,4 @@ def mean_squared_log_error(preds: torch.Tensor, target: torch.Tensor) -> torch.T .. deprecated:: Use :func:`torchmetrics.functional.mean_squared_log_error`. Will be removed in v1.5.0. 
""" + return void(preds, target) diff --git a/pytorch_lightning/metrics/functional/nlp.py b/pytorch_lightning/metrics/functional/nlp.py index c59d7cf2b8976..3eaa5eff1cc5c 100644 --- a/pytorch_lightning/metrics/functional/nlp.py +++ b/pytorch_lightning/metrics/functional/nlp.py @@ -21,7 +21,7 @@ import torch from torchmetrics.functional import bleu_score as _bleu_score -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_bleu_score) @@ -35,3 +35,4 @@ def bleu_score( .. deprecated:: Use :func:`torchmetrics.functional.bleu_score`. Will be removed in v1.5.0. """ + return void(translate_corpus, reference_corpus, n_gram, smooth) diff --git a/pytorch_lightning/metrics/functional/precision_recall.py b/pytorch_lightning/metrics/functional/precision_recall.py index 7b6c8641b5829..367c9c9111f07 100644 --- a/pytorch_lightning/metrics/functional/precision_recall.py +++ b/pytorch_lightning/metrics/functional/precision_recall.py @@ -18,10 +18,10 @@ from torchmetrics.functional import precision_recall as _precision_recall from torchmetrics.functional import recall as _recall -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void -@deprecated_metrics(target=_precision) +@deprecated_metrics(target=_precision, args_mapping={"is_multiclass": None}) def precision( preds: torch.Tensor, target: torch.Tensor, @@ -37,9 +37,10 @@ def precision( .. deprecated:: Use :func:`torchmetrics.functional.precision`. Will be removed in v1.5.0. """ + return void(preds, target, average, mdmc_average, ignore_index, num_classes, threshold, top_k, is_multiclass) -@deprecated_metrics(target=_recall) +@deprecated_metrics(target=_recall, args_mapping={"is_multiclass": None}) def recall( preds: torch.Tensor, target: torch.Tensor, @@ -55,9 +56,10 @@ def recall( .. deprecated:: Use :func:`torchmetrics.functional.accuracy`. Will be removed in v1.5.0. """ + return void(preds, target, average, mdmc_average, ignore_index, num_classes, threshold, top_k, is_multiclass) -@deprecated_metrics(target=_precision_recall) +@deprecated_metrics(target=_precision_recall, args_mapping={"is_multiclass": None}) def precision_recall( preds: torch.Tensor, target: torch.Tensor, @@ -73,3 +75,4 @@ def precision_recall( .. deprecated:: Use :func:`torchmetrics.functional.precision_recall`. Will be removed in v1.5.0. """ + return void(preds, target, average, mdmc_average, ignore_index, num_classes, threshold, top_k, is_multiclass) diff --git a/pytorch_lightning/metrics/functional/precision_recall_curve.py b/pytorch_lightning/metrics/functional/precision_recall_curve.py index dc9863cbb47c4..58d35557cce11 100644 --- a/pytorch_lightning/metrics/functional/precision_recall_curve.py +++ b/pytorch_lightning/metrics/functional/precision_recall_curve.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import precision_recall_curve as _precision_recall_curve -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_precision_recall_curve) @@ -32,3 +32,4 @@ def precision_recall_curve( .. deprecated:: Use :func:`torchmetrics.functional.accuracy`. Will be removed in v1.5.0. 
""" + return void(preds, target, num_classes, pos_label, sample_weights) diff --git a/pytorch_lightning/metrics/functional/psnr.py b/pytorch_lightning/metrics/functional/psnr.py index 51be9d47b91f9..df54ae17adb32 100644 --- a/pytorch_lightning/metrics/functional/psnr.py +++ b/pytorch_lightning/metrics/functional/psnr.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import psnr as _psnr -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_psnr) @@ -32,3 +32,4 @@ def psnr( .. deprecated:: Use :func:`torchmetrics.functional.psnr`. Will be removed in v1.5.0. """ + return void(preds, target, data_range, base, reduction, dim) diff --git a/pytorch_lightning/metrics/functional/r2score.py b/pytorch_lightning/metrics/functional/r2score.py index fe4b541989358..d5df9c2bfb4a3 100644 --- a/pytorch_lightning/metrics/functional/r2score.py +++ b/pytorch_lightning/metrics/functional/r2score.py @@ -15,7 +15,7 @@ import torch from torchmetrics.functional import r2score as _r2score -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_r2score) @@ -29,3 +29,4 @@ def r2score( .. deprecated:: Use :func:`torchmetrics.functional.r2score`. Will be removed in v1.5.0. """ + return void(preds, target, adjusted, multioutput) diff --git a/pytorch_lightning/metrics/functional/roc.py b/pytorch_lightning/metrics/functional/roc.py index 928a0b40fca54..f7d58af15e557 100644 --- a/pytorch_lightning/metrics/functional/roc.py +++ b/pytorch_lightning/metrics/functional/roc.py @@ -16,7 +16,7 @@ from torch import Tensor from torchmetrics.functional import roc as _roc -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_roc) @@ -31,3 +31,4 @@ def roc( .. deprecated:: Use :func:`torchmetrics.functional.roc`. Will be removed in v1.5.0. """ + return void(preds, target, num_classes, pos_label, sample_weights) diff --git a/pytorch_lightning/metrics/functional/self_supervised.py b/pytorch_lightning/metrics/functional/self_supervised.py index 65dec211e938a..5de4383683844 100644 --- a/pytorch_lightning/metrics/functional/self_supervised.py +++ b/pytorch_lightning/metrics/functional/self_supervised.py @@ -14,7 +14,7 @@ import torch from torchmetrics.functional import embedding_similarity as _embedding_similarity -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_embedding_similarity) @@ -28,3 +28,4 @@ def embedding_similarity( .. deprecated:: Use :func:`torchmetrics.functional.embedding_similarity`. Will be removed in v1.5.0. """ + return void(batch, similarity, reduction, zero_diagonal) diff --git a/pytorch_lightning/metrics/functional/ssim.py b/pytorch_lightning/metrics/functional/ssim.py index 31cff7fcfb9b4..2033520f011b0 100644 --- a/pytorch_lightning/metrics/functional/ssim.py +++ b/pytorch_lightning/metrics/functional/ssim.py @@ -16,7 +16,7 @@ import torch from torchmetrics.functional import ssim as _ssim -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void @deprecated_metrics(target=_ssim) @@ -34,3 +34,4 @@ def ssim( .. deprecated:: Use :func:`torchmetrics.functional.ssim`. Will be removed in v1.5.0. 
""" + return void(preds, target, kernel_size, sigma, reduction, data_range, k1, k2) diff --git a/pytorch_lightning/metrics/functional/stat_scores.py b/pytorch_lightning/metrics/functional/stat_scores.py index 30c03da237fe6..da654a54e3bf6 100644 --- a/pytorch_lightning/metrics/functional/stat_scores.py +++ b/pytorch_lightning/metrics/functional/stat_scores.py @@ -16,10 +16,10 @@ import torch from torchmetrics.functional import stat_scores as _stat_scores -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void -@deprecated_metrics(target=_stat_scores) +@deprecated_metrics(target=_stat_scores, args_mapping={"is_multiclass": None}) def stat_scores( preds: torch.Tensor, target: torch.Tensor, @@ -35,3 +35,4 @@ def stat_scores( .. deprecated:: Use :func:`torchmetrics.functional.stat_scores`. Will be removed in v1.5.0. """ + return void(preds, target, reduce, mdmc_reduce, num_classes, top_k, threshold, is_multiclass, ignore_index) diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index ee0fcdb8a92e1..e5fc0866d7e8d 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -16,7 +16,7 @@ from torchmetrics import Metric as _Metric from torchmetrics.collections import MetricCollection as _MetricCollection -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class Metric(_Metric): @@ -33,6 +33,7 @@ def __init__( .. deprecated:: Use :class:`torchmetrics.Metric`. Will be removed in v1.5.0. """ + void(compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) class MetricCollection(_MetricCollection): @@ -43,3 +44,4 @@ def __init__(self, metrics: Union[List[Metric], Tuple[Metric], Dict[str, Metric] .. deprecated:: Use :class:`torchmetrics.MetricCollection`. Will be removed in v1.5.0. """ + void(metrics) diff --git a/pytorch_lightning/metrics/regression/explained_variance.py b/pytorch_lightning/metrics/regression/explained_variance.py index 50c620b82f87f..64a0eaa1a171b 100644 --- a/pytorch_lightning/metrics/regression/explained_variance.py +++ b/pytorch_lightning/metrics/regression/explained_variance.py @@ -15,7 +15,7 @@ from torchmetrics import ExplainedVariance as _ExplainedVariance -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class ExplainedVariance(_ExplainedVariance): @@ -35,4 +35,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.ExplainedVariance`. Will be removed in v1.5.0. """ - _ = multioutput, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(multioutput, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/regression/mean_absolute_error.py b/pytorch_lightning/metrics/regression/mean_absolute_error.py index 493294a2811b9..c0744bd1c2fa5 100644 --- a/pytorch_lightning/metrics/regression/mean_absolute_error.py +++ b/pytorch_lightning/metrics/regression/mean_absolute_error.py @@ -15,7 +15,7 @@ from torchmetrics import MeanAbsoluteError as _MeanAbsoluteError -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class MeanAbsoluteError(_MeanAbsoluteError): @@ -34,4 +34,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.MeanAbsoluteError`. Will be removed in v1.5.0. 
""" - _ = compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/regression/mean_squared_error.py b/pytorch_lightning/metrics/regression/mean_squared_error.py index ec2b34c4fd86e..bececd5633ea5 100644 --- a/pytorch_lightning/metrics/regression/mean_squared_error.py +++ b/pytorch_lightning/metrics/regression/mean_squared_error.py @@ -15,7 +15,7 @@ from torchmetrics import MeanSquaredError as _MeanSquaredError -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class MeanSquaredError(_MeanSquaredError): @@ -34,4 +34,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.MeanSquaredError`. Will be removed in v1.5.0. """ - _ = compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/regression/mean_squared_log_error.py b/pytorch_lightning/metrics/regression/mean_squared_log_error.py index b95ee46ea3fca..be010de4483d9 100644 --- a/pytorch_lightning/metrics/regression/mean_squared_log_error.py +++ b/pytorch_lightning/metrics/regression/mean_squared_log_error.py @@ -15,7 +15,7 @@ from torchmetrics import MeanSquaredLogError as _MeanSquaredLogError -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class MeanSquaredLogError(_MeanSquaredLogError): @@ -34,4 +34,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.MeanSquaredLogError`. Will be removed in v1.5.0. """ - _ = compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/regression/psnr.py b/pytorch_lightning/metrics/regression/psnr.py index d81cfb3035f3b..d647fae2622fe 100644 --- a/pytorch_lightning/metrics/regression/psnr.py +++ b/pytorch_lightning/metrics/regression/psnr.py @@ -15,7 +15,7 @@ from torchmetrics import PSNR as _PSNR -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class PSNR(_PSNR): @@ -37,4 +37,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.PSNR`. Will be removed in v1.5.0. """ - _ = data_range, base, reduction, dim, compute_on_step, dist_sync_on_step, process_group + void(data_range, base, reduction, dim, compute_on_step, dist_sync_on_step, process_group) diff --git a/pytorch_lightning/metrics/regression/r2score.py b/pytorch_lightning/metrics/regression/r2score.py index 7ec2f9a586c69..93b986b8620a5 100644 --- a/pytorch_lightning/metrics/regression/r2score.py +++ b/pytorch_lightning/metrics/regression/r2score.py @@ -15,7 +15,7 @@ from torchmetrics import R2Score as _R2Score -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class R2Score(_R2Score): @@ -37,4 +37,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.R2Score`. Will be removed in v1.5.0. 
""" - _ = num_outputs, adjusted, multioutput, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn + void(num_outputs, adjusted, multioutput, compute_on_step, dist_sync_on_step, process_group, dist_sync_fn) diff --git a/pytorch_lightning/metrics/regression/ssim.py b/pytorch_lightning/metrics/regression/ssim.py index 2ea8872e6ad53..e1d2e575cd882 100644 --- a/pytorch_lightning/metrics/regression/ssim.py +++ b/pytorch_lightning/metrics/regression/ssim.py @@ -15,7 +15,7 @@ from torchmetrics import SSIM as _SSIM -from pytorch_lightning.metrics.utils import deprecated_metrics +from pytorch_lightning.metrics.utils import deprecated_metrics, void class SSIM(_SSIM): @@ -39,4 +39,4 @@ def __init__( .. deprecated:: Use :class:`~torchmetrics.SSIM`. Will be removed in v1.5.0. """ - _ = kernel_size, sigma, reduction, data_range, k1, k2, compute_on_step, dist_sync_on_step, process_group + void(kernel_size, sigma, reduction, data_range, k1, k2, compute_on_step, dist_sync_on_step, process_group) diff --git a/pytorch_lightning/metrics/utils.py b/pytorch_lightning/metrics/utils.py index 30c2975c924d1..dd58e59751eb3 100644 --- a/pytorch_lightning/metrics/utils.py +++ b/pytorch_lightning/metrics/utils.py @@ -15,7 +15,7 @@ from typing import Optional import torch -from deprecate import deprecated +from deprecate import deprecated, void from torchmetrics.utilities.data import dim_zero_cat as _dim_zero_cat from torchmetrics.utilities.data import dim_zero_mean as _dim_zero_mean from torchmetrics.utilities.data import dim_zero_sum as _dim_zero_sum @@ -34,17 +34,17 @@ @deprecated_metrics(target=_dim_zero_cat) def dim_zero_cat(x): - pass + return void(x) @deprecated_metrics(target=_dim_zero_sum) def dim_zero_sum(x): - pass + return void(x) @deprecated_metrics(target=_dim_zero_mean) def dim_zero_mean(x): - pass + return void(x) @deprecated_metrics(target=_to_onehot) @@ -53,6 +53,7 @@ def to_onehot(label_tensor: torch.Tensor, num_classes: Optional[int] = None) -> .. deprecated:: Use :func:`torchmetrics.utilities.data.to_onehot`. Will be removed in v1.5.0. """ + return void(label_tensor, num_classes) @deprecated_metrics(target=_select_topk) @@ -61,14 +62,16 @@ def select_topk(prob_tensor: torch.Tensor, topk: int = 1, dim: int = 1) -> torch .. deprecated:: Use :func:`torchmetrics.utilities.data.select_topk`. Will be removed in v1.5.0. """ + return void(prob_tensor, topk, dim) -@deprecated_metrics(target=_to_categorical) +@deprecated_metrics(target=_to_categorical, args_mapping={"tensor": "x"}) def to_categorical(tensor: torch.Tensor, argmax_dim: int = 1) -> torch.Tensor: """ .. deprecated:: Use :func:`torchmetrics.utilities.data.to_categorical`. Will be removed in v1.5.0. """ + return void(tensor, argmax_dim) @deprecated_metrics(target=_get_num_classes, skip_if=_TORCHMETRICS_GREATER_EQUAL_0_3) @@ -78,6 +81,7 @@ def get_num_classes(pred: torch.Tensor, target: torch.Tensor, num_classes: Optio .. deprecated:: Use :func:`torchmetrics.utilities.data.get_num_classes`. Will be removed in v1.5.0. """ + return void(pred, target, num_classes) @deprecated_metrics(target=_reduce) @@ -86,6 +90,7 @@ def reduce(to_reduce: torch.Tensor, reduction: str) -> torch.Tensor: .. deprecated:: Use :func:`torchmetrics.utilities.reduce`. Will be removed in v1.5.0. """ + return void(to_reduce, reduction) @deprecated_metrics(target=_class_reduce) @@ -96,3 +101,4 @@ def class_reduce( .. deprecated:: Use :func:`torchmetrics.utilities.class_reduce`. Will be removed in v1.5.0. 
""" + return void(num, denom, weights, class_reduction) diff --git a/pytorch_lightning/overrides/base.py b/pytorch_lightning/overrides/base.py index 88e8ed6375e1b..3f396c3d602f8 100644 --- a/pytorch_lightning/overrides/base.py +++ b/pytorch_lightning/overrides/base.py @@ -11,17 +11,54 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any, Union + import torch from torch.nn import DataParallel from torch.nn.parallel import DistributedDataParallel -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +class _LightningPrecisionModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module): + + def __init__(self, pl_module: 'pl.LightningModule') -> None: + """ + Wraps the user's LightningModule. Requires overriding all ``*_step`` methods and ``forward`` so that it can + safely be wrapped by a ``_LightningModuleWrapperBase`` and a ``*DataParallel``. + + Args: + pl_module: the model to wrap + """ + super().__init__() + self.module = pl_module + + # set the parameters_to_ignore from LightningModule. + self._ddp_params_and_buffers_to_ignore = getattr(pl_module, "_ddp_params_and_buffers_to_ignore", []) + + def training_step(self, *args: Any, **kwargs: Any) -> Any: + raise NotImplementedError + + def validation_step(self, *args: Any, **kwargs: Any) -> Any: + raise NotImplementedError + + def test_step(self, *args: Any, **kwargs: Any) -> Any: + raise NotImplementedError + + def predict_step(self, *args: Any, **kwargs: Any) -> Any: + raise NotImplementedError + + def forward(self, *args: Any, **kwargs: Any) -> Any: + raise NotImplementedError + + def on_post_move_to_device(self) -> None: + pass + + class _LightningModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module): - def __init__(self, pl_module: LightningModule): + def __init__(self, pl_module: Union['pl.LightningModule', _LightningPrecisionModuleWrapperBase]): """ Wraps the user's LightningModule and redirects the forward call to the appropriate method, either ``training_step``, ``validation_step`` or ``test_step``. @@ -39,8 +76,9 @@ def __init__(self, pl_module: LightningModule): # set the parameters_to_ignore from LightningModule. 
self._ddp_params_and_buffers_to_ignore = getattr(pl_module, "_ddp_params_and_buffers_to_ignore", []) - def forward(self, *inputs, **kwargs): - trainer = self.module.trainer + def forward(self, *inputs: Any, **kwargs: Any) -> Any: + lightning_module = unwrap_lightning_module(self.module) + trainer = lightning_module.trainer if trainer and trainer.training: output = self.module.training_step(*inputs, **kwargs) @@ -49,7 +87,7 @@ def forward(self, *inputs, **kwargs): # it is done manually in ``LightningModule.manual_backward`` # `require_backward_grad_sync` will be reset in the # ddp_plugin ``post_training_step`` hook - if not self.module.automatic_optimization: + if not lightning_module.automatic_optimization: trainer.model.require_backward_grad_sync = False elif trainer and trainer.testing: output = self.module.test_step(*inputs, **kwargs) @@ -62,14 +100,14 @@ def forward(self, *inputs, **kwargs): return output - def on_post_move_to_device(self): + def on_post_move_to_device(self) -> None: pass -def unwrap_lightning_module(wrapped_model) -> LightningModule: +def unwrap_lightning_module(wrapped_model) -> 'pl.LightningModule': model = wrapped_model if isinstance(model, (DistributedDataParallel, DataParallel)): - model = model.module - if isinstance(model, _LightningModuleWrapperBase): - model = model.module + model = unwrap_lightning_module(model.module) + if isinstance(model, (_LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase)): + model = unwrap_lightning_module(model.module) return model diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 3d6e527ef95a9..57919db6ab221 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -17,7 +17,7 @@ import torch -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -53,7 +53,7 @@ class LightningParallelModule(_LightningModuleWrapperBase): """ - def __init__(self, pl_module: LightningModule): + def __init__(self, pl_module: 'pl.LightningModule') -> None: super().__init__(pl_module) _ignore_scalar_return_in_dp() diff --git a/pytorch_lightning/overrides/distributed.py b/pytorch_lightning/overrides/distributed.py index d064040d8e019..71ed9c8018ec3 100644 --- a/pytorch_lightning/overrides/distributed.py +++ b/pytorch_lightning/overrides/distributed.py @@ -18,13 +18,13 @@ from torch.nn.parallel import DistributedDataParallel from torch.utils.data import BatchSampler, DistributedSampler, Sampler -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase class LightningDistributedModule(_LightningModuleWrapperBase): - def __init__(self, pl_module: LightningModule): + def __init__(self, pl_module: 'pl.LightningModule') -> None: """ Wraps the user's LightningModule and redirects the forward call to the appropriate method, either ``training_step``, ``validation_step``, ``test_step`` or ``predict``. 
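The recursion added to `unwrap_lightning_module` above is what makes nested wrappers work: a `DistributedDataParallel` instance can now hold a `_LightningModuleWrapperBase`, which in turn may hold a `_LightningPrecisionModuleWrapperBase` around the actual `LightningModule`, and every layer must be peeled off. A minimal sketch of the pattern, using hypothetical stand-in classes rather than Lightning's real wrappers:

    class _Wrapper:
        # stand-in for DistributedDataParallel / _LightningModuleWrapperBase /
        # _LightningPrecisionModuleWrapperBase, all of which expose `.module`
        def __init__(self, module):
            self.module = module

    def unwrap(model):
        # recurse instead of unwrapping a fixed number of layers,
        # so any nesting depth resolves to the innermost module
        if isinstance(model, _Wrapper):
            return unwrap(model.module)
        return model

    core = object()
    assert unwrap(_Wrapper(_Wrapper(core))) is core  # two layers peeled off
    assert unwrap(core) is core                      # unwrapped input passes through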
@@ -63,6 +63,9 @@ def _find_tensors(obj): # pragma: no-cover # Note: Keep track of Pytorch DDP and update if there is a change # https://github.com/pytorch/pytorch/blob/v1.7.1/torch/nn/parallel/distributed.py#L626-L638 def prepare_for_backward(model: DistributedDataParallel, output: Any): + # `prepare_for_backward` is `DistributedDataParallel` specific. + if not isinstance(model, DistributedDataParallel): + return if torch.is_grad_enabled() and model.require_backward_grad_sync: model.require_forward_param_sync = True # We'll return the output object verbatim since it is a freeform @@ -132,6 +135,9 @@ def __iter__(self) -> Iterator[List[int]]: self.batch_indices = batch yield batch + def __len__(self) -> int: + return len(self._sampler) + @property def drop_last(self) -> bool: return self._sampler.drop_last diff --git a/pytorch_lightning/overrides/fairscale.py b/pytorch_lightning/overrides/fairscale.py index f7c3b8d5fd575..e531db6de77f3 100644 --- a/pytorch_lightning/overrides/fairscale.py +++ b/pytorch_lightning/overrides/fairscale.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE @@ -23,7 +23,7 @@ class LightningShardedDataParallel(_LightningModuleWrapperBase): # Just do this for later docstrings pass - def unwrap_lightning_module_sharded(wrapped_model) -> LightningModule: + def unwrap_lightning_module_sharded(wrapped_model) -> 'pl.LightningModule': model = wrapped_model if isinstance(model, ShardedDataParallel): model = model.module diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 58d43dc54cb7f..f620ee28afe9a 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -9,6 +9,7 @@ from pytorch_lightning.plugins.precision.fully_sharded_native_amp import ( # noqa: F401 FullyShardedNativeMixedPrecisionPlugin, ) +from pytorch_lightning.plugins.precision.ipu_precision import IPUPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401 @@ -20,9 +21,8 @@ from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.ipu import IPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.rpc import RPCPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.rpc_sequential import RPCSequentialPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # 
noqa: F401 @@ -41,6 +41,8 @@ "DeepSpeedPrecisionPlugin", "DoublePrecisionPlugin", "HorovodPlugin", + "IPUPlugin", + "IPUPrecisionPlugin", "NativeMixedPrecisionPlugin", "PrecisionPlugin", "ShardedNativeMixedPrecisionPlugin", @@ -49,8 +51,6 @@ "SingleTPUPlugin", "TPUHalfPrecisionPlugin", "TPUSpawnPlugin", - "RPCPlugin", - "RPCSequentialPlugin", "TrainingTypePlugin", "ParallelPlugin", "Plugin", diff --git a/pytorch_lightning/plugins/precision/apex_amp.py b/pytorch_lightning/plugins/precision/apex_amp.py index 71c2119e734fd..b2565e7dd34b4 100644 --- a/pytorch_lightning/plugins/precision/apex_amp.py +++ b/pytorch_lightning/plugins/precision/apex_amp.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, ContextManager, Sequence +from typing import Any, Callable, ContextManager, Dict, Sequence import torch from torch import Tensor from torch.optim import Optimizer import pytorch_lightning as pl -from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType from pytorch_lightning.utilities.types import _PARAMETERS @@ -39,7 +38,7 @@ def __init__(self, amp_level: str = "O2") -> None: def master_params(self, optimizer: Optimizer) -> _PARAMETERS: return amp.master_params(optimizer) - def dispatch(self, trainer: "pl.Trainer") -> None: + def dispatch(self, trainer: 'pl.Trainer') -> None: if not self._connected: accelerator = trainer.accelerator _, accelerator.optimizers = amp.initialize( @@ -50,7 +49,7 @@ def dispatch(self, trainer: "pl.Trainer") -> None: def backward( self, - model: LightningModule, + model: 'pl.LightningModule', closure_loss: Tensor, optimizer: Optimizer, opt_idx: int, @@ -76,7 +75,7 @@ def backward( # do backward pass # TODO: not entirely sure, why we need this - if model is not None and isinstance(model, LightningModule): + if model is not None and isinstance(model, pl.LightningModule): model.backward(closure_loss, optimizer, opt_idx, **kwargs) # TODO: avoid dev_debugger and track these calls with mock @@ -118,7 +117,7 @@ def reinit_scheduler_properties(optimizers: Sequence[Optimizer], schedulers: Seq def pre_optimizer_step( self, - pl_module: LightningModule, + pl_module: 'pl.LightningModule', optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, @@ -135,3 +134,10 @@ def pre_optimizer_step( optimizer.step(**kwargs) return False + + def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + if "amp_scaling_state" in checkpoint: + amp.load_state_dict(checkpoint["amp_scaling_state"]) + + def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + checkpoint["amp_scaling_state"] = amp.state_dict() diff --git a/pytorch_lightning/plugins/precision/double.py b/pytorch_lightning/plugins/precision/double.py index 6d985a0f4eb9d..387fac81c8614 100644 --- a/pytorch_lightning/plugins/precision/double.py +++ b/pytorch_lightning/plugins/precision/double.py @@ -12,28 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from contextlib import contextmanager -from functools import wraps -from typing import Any, Generator, List, Tuple +from typing import Any, cast, Generator, List, Tuple import torch import torch.nn as nn from torch.optim import Optimizer -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl +from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities.apply_func import apply_to_collection -class _DoublePrecisionPatch: - """Class to handle patching of methods in the ``LightningModule`` and subsequent teardown.""" +class LightningDoublePrecisionModule(_LightningPrecisionModuleWrapperBase): + """ + LightningModule wrapper which converts incoming floating point data in ``*_step`` and ``forward`` to double + (``torch.float64``) precision. - def __init__(self, model: nn.Module, method_name: str, old_method: Any) -> None: - self.model = model - self.method_name = method_name - self.old_method = old_method - - def teardown(self) -> None: - setattr(self.model, self.method_name, self.old_method) + Args: + pl_module: the model to wrap + """ @staticmethod def _to_double_precision(data: torch.Tensor) -> torch.Tensor: @@ -43,55 +41,63 @@ def _to_double_precision(data: torch.Tensor) -> torch.Tensor: @staticmethod def _move_float_tensors_to_double(collection: Any) -> Any: - return apply_to_collection(collection, torch.Tensor, function=_DoublePrecisionPatch._to_double_precision) - - @classmethod - def patch(cls, model: nn.Module, method_name: str) -> '_DoublePrecisionPatch': - old_method = getattr(model, method_name) - - @wraps(old_method) - def new_method(*args: Any, **kwargs: Any) -> Any: - return old_method( - *_DoublePrecisionPatch._move_float_tensors_to_double(args), - **_DoublePrecisionPatch._move_float_tensors_to_double(kwargs) - ) - - setattr(model, method_name, new_method if callable(old_method) else old_method) - return cls(model, method_name, old_method) + return apply_to_collection( + collection, + torch.Tensor, + LightningDoublePrecisionModule._to_double_precision, + ) + + def training_step(self, *args: Any, **kwargs: Any) -> Any: + return self.module.training_step( + *LightningDoublePrecisionModule._move_float_tensors_to_double(args), + **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs), + ) + + def validation_step(self, *args: Any, **kwargs: Any) -> Any: + return self.module.validation_step( + *LightningDoublePrecisionModule._move_float_tensors_to_double(args), + **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs), + ) + + def test_step(self, *args: Any, **kwargs: Any) -> Any: + return self.module.test_step( + *LightningDoublePrecisionModule._move_float_tensors_to_double(args), + **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs), + ) + + def predict_step(self, *args: Any, **kwargs: Any) -> Any: + return self.module.predict_step( + *LightningDoublePrecisionModule._move_float_tensors_to_double(args), + **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs), + ) + + def forward(self, *args: Any, **kwargs: Any) -> Any: + return self.module( + *LightningDoublePrecisionModule._move_float_tensors_to_double(args), + **LightningDoublePrecisionModule._move_float_tensors_to_double(kwargs), + ) class DoublePrecisionPlugin(PrecisionPlugin): - """Plugin for training with double (``torch.float64``) precision.""" + """ Plugin for training with double 
(``torch.float64``) precision. """ precision: int = 64 - def __init__(self) -> None: - super().__init__() - self.patches: List[_DoublePrecisionPatch] = [] - def connect( self, model: nn.Module, optimizers: List[Optimizer], lr_schedulers: List[Any], - ) -> Tuple[nn.Module, List[Optimizer], List[Any]]: - """Converts the model to double precision and wraps the `training_step`, `validation_step`, `test_step`, - `predict_step`, and `forward` methods to convert incoming floating point data to double. Does not alter - `optimizers` or `lr_schedulers`.""" - model = model.to(dtype=torch.float64) - if isinstance(model, LightningModule): - self.patches.append(_DoublePrecisionPatch.patch(model, 'training_step')) - self.patches.append(_DoublePrecisionPatch.patch(model, 'validation_step')) - self.patches.append(_DoublePrecisionPatch.patch(model, 'test_step')) - self.patches.append(_DoublePrecisionPatch.patch(model, 'predict_step')) - self.patches.append(_DoublePrecisionPatch.patch(model, 'forward')) + ) -> Tuple[nn.Module, List['Optimizer'], List[Any]]: + """Converts the model to double precision and wraps it in a ``LightningDoublePrecisionModule`` to convert + incoming floating point data to double (``torch.float64``) precision. Does not alter `optimizers` or + `lr_schedulers`. + """ + model = cast(pl.LightningModule, model.double()) + model = LightningDoublePrecisionModule(model) return super().connect(model, optimizers, lr_schedulers) - def post_dispatch(self) -> None: - while len(self.patches) > 0: - self.patches.pop().teardown() - @contextmanager def train_step_context(self) -> Generator[None, None, None]: """ diff --git a/pytorch_lightning/plugins/precision/ipu_precision.py b/pytorch_lightning/plugins/precision/ipu_precision.py new file mode 100644 index 0000000000000..e6983966e166b --- /dev/null +++ b/pytorch_lightning/plugins/precision/ipu_precision.py @@ -0,0 +1,60 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Optional, Union + +from torch import Tensor +from torch.nn import Module +from torch.optim import Optimizer + +import pytorch_lightning as pl +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities import GradClipAlgorithmType +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class IPUPrecisionPlugin(PrecisionPlugin): + + def __init__(self, precision: int) -> None: + super().__init__() + self.precision = precision + + def backward( + self, + model: 'pl.LightningModule', + closure_loss: Tensor, + optimizer: Optimizer, + opt_idx: int, + should_accumulate: bool, + *args: Any, + **kwargs: Any, + ) -> Tensor: + # IPU internally manages bwd step. 
+ return closure_loss + + def clip_gradients( + self, + optimizer: Optimizer, + clip_val: Union[int, float], + gradient_clip_algorithm: GradClipAlgorithmType = GradClipAlgorithmType.NORM, + model: Optional[Module] = None + ) -> None: + """Clips the gradients""" + if clip_val is None: + return + + clip_val = float(clip_val) + if clip_val <= 0: + return + + raise MisconfigurationException("IPUs currently do not support clipping gradients.") diff --git a/pytorch_lightning/plugins/precision/native_amp.py b/pytorch_lightning/plugins/precision/native_amp.py index 994b7f26135ff..e25f46d9ec239 100644 --- a/pytorch_lightning/plugins/precision/native_amp.py +++ b/pytorch_lightning/plugins/precision/native_amp.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Any, Callable, Generator +from typing import Any, Callable, Dict, Generator import torch from torch.optim import LBFGS, Optimizer @@ -83,19 +83,21 @@ def pre_optimizer_step( f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." " To request, please file a Github issue in PyTorch and tag @mcarilli" ) - lambda_closure() if not pl_module.automatic_optimization: self.scaler.unscale_(optimizer) pl_module.trainer.call_hook("on_after_backward") + self.scaler.step(optimizer) + self.scaler.update() + else: + result = lambda_closure() + # lambda_closure returning None indicates that backward has been skipped + if result is not None: + self.scaler.step(optimizer) + self.scaler.update() return False - def post_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int) -> None: - """Updates the GradScaler""" - self.scaler.step(optimizer) - self.scaler.update() - @contextmanager def train_step_context(self) -> Generator[None, None, None]: """Enable autocast context""" @@ -119,3 +121,10 @@ def predict_step_context(self) -> Generator[None, None, None]: """Enable autocast context""" with torch.cuda.amp.autocast(): yield + + def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + if "native_amp_scaling_state" in checkpoint: + self.scaler.load_state_dict(checkpoint["native_amp_scaling_state"]) + + def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + checkpoint["native_amp_scaling_state"] = self.scaler.state_dict() diff --git a/pytorch_lightning/plugins/precision/precision_plugin.py b/pytorch_lightning/plugins/precision/precision_plugin.py index a5488623dc592..e8dccbed741fa 100644 --- a/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/plugins/precision/precision_plugin.py @@ -19,12 +19,13 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from pytorch_lightning.core.hooks import CheckpointHooks from pytorch_lightning.plugins.base_plugin import Plugin from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.types import _PARAMETERS -class PrecisionPlugin(Plugin): +class PrecisionPlugin(Plugin, CheckpointHooks): """ Base class for all plugins handling the precision-specific parts of the training. The class attribute precision must be overwritten in child classes. 
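Both AMP plugins above gain `on_save_checkpoint`/`on_load_checkpoint` hooks, which `PrecisionPlugin` can now receive because it mixes in `CheckpointHooks`. The effect is a simple round-trip of the scaler state through the checkpoint dictionary; a rough sketch, assuming a CUDA-style `GradScaler` is in use:

    import torch

    scaler = torch.cuda.amp.GradScaler()
    checkpoint = {}

    # on_save_checkpoint: stash the scaler state under the key used in the diff
    checkpoint["native_amp_scaling_state"] = scaler.state_dict()

    # on_load_checkpoint: restore only if the key is present, so checkpoints
    # written without native AMP still load cleanly
    restored = torch.cuda.amp.GradScaler()
    if "native_amp_scaling_state" in checkpoint:
        restored.load_state_dict(checkpoint["native_amp_scaling_state"])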
diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py index 3cb43e44f5565..6a56d68e17db9 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -6,8 +6,6 @@ from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.rpc import RPCPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.rpc_sequential import RPCSequentialPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index e65a6512d3846..a882390b78b0d 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -13,14 +13,19 @@ # limitations under the License. import logging import os +import shutil +import signal import subprocess import sys +import tempfile +import time from time import sleep from typing import Any, Dict, List, Optional, Union +import __main__ import numpy as np import torch -import torch.distributed as torch_distrib +import torch.distributed from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import Optimizer @@ -36,8 +41,14 @@ rank_zero_deprecation, rank_zero_warn, ) -from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.distributed import ( + distributed_available, + rank_zero_info, + rank_zero_only, + ReduceOp, + sync_ddp_if_available, +) +from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.seed import reset_seed if _HYDRA_AVAILABLE: @@ -89,12 +100,18 @@ def __init__( self.num_processes = len(self.parallel_devices) if self.parallel_devices is not None else 0 self._ddp_kwargs = kwargs self._has_spawned_children = False - self.task_idx = None + self._task_idx = None self._ddp_comm_state = ddp_comm_state self._ddp_comm_hook = ddp_comm_hook self._ddp_comm_wrapper = ddp_comm_wrapper + self._pids: Optional[List[int]] = None + self._sync_dir: Optional[str] = None self.set_world_ranks() + @property + def is_distributed(self) -> bool: + return True + @property def root_device(self) -> torch.device: return self.parallel_devices[self.local_rank] @@ -117,6 +134,18 @@ def sync_batchnorm(self) -> bool: def sync_batchnorm(self, sync_batchnorm: bool) -> None: self._sync_batchnorm = sync_batchnorm + @property + def task_idx(self) -> Optional[int]: + rank_zero_deprecation( + f'`{self.__class__.__name__}.task_idx` is deprecated in v1.4 and will be removed in v1.6. Use ' + f'`{self.__class__.__name__}.local_rank` instead.' 
+ ) + return self._task_idx + + @task_idx.setter + def task_idx(self, task_idx: int) -> None: + self._task_idx = task_idx + @property def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) @@ -137,7 +166,6 @@ def setup_environment(self) -> None: self.setup_distributed() def _call_children_scripts(self): - # bookkeeping of spawned processes assert self.local_rank == 0 self._check_can_spawn_children() @@ -151,19 +179,28 @@ def _call_children_scripts(self): os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank()) - # when user is using hydra find the absolute path - path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path - - # pull out the commands used to run the script and resolve the abs file path - command = sys.argv - try: - full_path = path_lib(command[0]) - except Exception: - full_path = os.path.abspath(command[0]) - - command[0] = full_path - # use the same python interpreter and actually running - command = [sys.executable] + command + # create a temporary directory used to synchronize processes on deadlock. + os.environ["PL_DDP_SYNC_TMPDIR"] = self._sync_dir = tempfile.mkdtemp() + + # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c` + # See https://docs.python.org/3/reference/import.html#main-spec + if __main__.__spec__ is None: # pragma: no-cover + # Script called as `python a/b/c.py` + # when user is using hydra find the absolute path + path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path + + # pull out the commands used to run the script and resolve the abs file path + command = sys.argv + try: + full_path = path_lib(command[0]) + except Exception: + full_path = os.path.abspath(command[0]) + + command[0] = full_path + # use the same python interpreter and actually running + command = [sys.executable] + command + else: # Script called as `python -m a.b.c` + command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] # the visible devices tell us how many GPUs we want to use. # when the trainer script was called the device has already been scoped by the time @@ -172,11 +209,9 @@ def _call_children_scripts(self): if self.parallel_devices is None: raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") - os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - num_gpus = len(self.parallel_devices) - os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" + os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}" self.interactive_ddp_procs = [] @@ -222,13 +257,6 @@ def setup_distributed(self): # where to store ip_table self.init_ddp_connection() - # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.distributed_backend}") - log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes")
-            log.info("-" * 100)
-
         # set the ranks and devices
         self.dist.rank = self.global_rank
         self.dist.device = self.root_device
@@ -295,7 +323,17 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
             os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
         if not torch.distributed.is_initialized():
             log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
-            torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
+            torch.distributed.init_process_group(
+                self.torch_distributed_backend, rank=global_rank, world_size=world_size
+            )
+
+            # on rank=0 let everyone know training is starting
+            rank_zero_info(
+                f"{'-' * 100}\n"
+                f"distributed_backend={self.torch_distributed_backend}\n"
+                f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
+                f"{'-' * 100}\n"
+            )
 
     def pre_dispatch(self):
         # move the model to the correct device
@@ -306,21 +344,26 @@
         self.configure_ddp()
 
-        self.barrier()
+        # share ddp pids to all processes
+        self._share_information_to_prevent_deadlock()
 
     def post_dispatch(self) -> None:
         self.cluster_environment.teardown()
 
-    def barrier(self, *args, **kwargs):
-        if torch_distrib.is_available() and torch_distrib.is_initialized():
-            torch_distrib.barrier()
+    def barrier(self, *args, **kwargs) -> None:
+        if not distributed_available():
+            return
+        if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl":
+            torch.distributed.barrier(device_ids=self.determine_ddp_device_ids())
+        else:
+            torch.distributed.barrier()
 
     def broadcast(self, obj: object, src: int = 0) -> object:
         return self.dist.broadcast(obj)
 
     def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int):
         """Run before precision plugin executes backward"""
-        if not self.lightning_module.automatic_optimization and self.model.require_backward_grad_sync:
+        if not self.lightning_module.automatic_optimization:
             prepare_for_backward(self.model, closure_loss)
 
     def model_to_device(self):
@@ -328,7 +371,7 @@
         torch.cuda.set_device(self.root_device)
         self.model.to(self.root_device)
 
-    def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"):
+    def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> torch.Tensor:
         """
         Reduces a tensor from several distributed processes to one aggregated tensor.
 
@@ -342,7 +385,7 @@
             the reduced value, except when the input was not a tensor, in which case the output remains unchanged
         """
         if isinstance(tensor, torch.Tensor):
-            tensor = sync_ddp_if_available(tensor, group, reduce_op=(reduce_op or "mean"))
+            tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op)
         return tensor
 
     def training_step(self, *args, **kwargs):
@@ -369,3 +412,41 @@ def register_plugins(cls, plugin_registry: Dict) -> None:
             description="DDP Plugin with `find_unused_parameters` as False",
             find_unused_parameters=False
         )
+
+    def _share_information_to_prevent_deadlock(self):
+        self._share_pids()
+
+        # remove `PL_DDP_SYNC_TMPDIR` from os.environ
+        self._sync_dir = os.environ.pop("PL_DDP_SYNC_TMPDIR", None)
+
+    def _share_pids(self):
+        """
+        Make all DDP processes aware of all processes' pids.
+ """ + self.barrier() + pids = self.all_gather(torch.tensor(os.getpid(), device=self.root_device)) + pids = pids.cpu().numpy().tolist() + self._pids = pids if isinstance(pids, list) else [pids] + + def reconciliate_processes(self, trace: str): + if self.world_size < 2: + return + + sync_dir = self._sync_dir + + # save a file locally. + torch.save(True, os.path.join(sync_dir, f"{self.global_rank}.pl")) + + # sleep for a short time + time.sleep(3) + + # return if all processes wrote a file in the `sync_dir`. + # todo (tchaton) Add support for non-shared file-system which will fail. + if len(os.listdir(sync_dir)) == self.world_size: + return + + for pid in self._pids: + if pid != os.getpid(): + os.kill(pid, signal.SIGKILL) + shutil.rmtree(sync_dir) + raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") diff --git a/pytorch_lightning/plugins/training_type/ddp2.py b/pytorch_lightning/plugins/training_type/ddp2.py index b6d21904d1933..185e955135141 100644 --- a/pytorch_lightning/plugins/training_type/ddp2.py +++ b/pytorch_lightning/plugins/training_type/ddp2.py @@ -13,8 +13,9 @@ # limitations under the License. import torch -from pytorch_lightning.core.step_result import Result from pytorch_lightning.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.types import _METRIC_COLLECTION class DDP2Plugin(DDPPlugin): @@ -34,26 +35,25 @@ def setup(self, model): self.task_idx = self.cluster_environment.local_rank() # the difference to DDP is that we don't call children processes here - def reduce(self, tensor, *args, **kwargs): + def reduce(self, collection: _METRIC_COLLECTION, *args, **kwargs) -> _METRIC_COLLECTION: """ - Reduces a tensor from all processes to one aggregated tensor. + Reduces a collection of tensors from all processes. It can be applied to just a single tensor. In DDP2, the reduction here is only across local devices within the node. Args: - tensor: the tensor to sync and reduce + collection: The collection of tensors to sync and reduce. *args: ignored for DDP2 **kwargs: ignored for DDP2 Return: - reduced value, except when the input was not a tensor the output remains is unchanged + Reduced tensor values or the same value if it was not or did not contain a tensor. 
""" - if isinstance(tensor, Result): - tensor.dp_reduce() - elif isinstance(tensor, torch.Tensor): - tensor = tensor.mean() + def mean(t: torch.Tensor) -> torch.Tensor: + original_dtype = t.dtype + return t.float().mean().to(original_dtype) - return tensor + return apply_to_collection(collection, torch.Tensor, mean) @property def root_device(self): diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index df9f0ee158ba3..e5084adb1a63e 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -17,7 +17,7 @@ from typing import Any, List, Optional, Union import torch -import torch.distributed as torch_distrib +import torch.distributed import torch.multiprocessing as mp from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import Optimizer @@ -28,13 +28,18 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8 +from pytorch_lightning.utilities import ( + _TORCH_GREATER_EQUAL_1_7, + _TORCH_GREATER_EQUAL_1_8, + rank_zero_deprecation, + rank_zero_warn, +) from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.distributed import ( - rank_zero_deprecation, + distributed_available, + rank_zero_info, rank_zero_only, - rank_zero_warn, ReduceOp, sync_ddp_if_available, ) @@ -68,13 +73,13 @@ def __init__( super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) if num_nodes is not None: rank_zero_deprecation( - "Argument `num_nodes` in `DDPPlugin` is deprecated in v1.4, and will be removed in v1.6. " + "Argument `num_nodes` in `DDPSpawnPlugin` is deprecated in v1.4, and will be removed in v1.6. " "Notice that it will be overriden by the trainer setting." ) self._num_nodes = num_nodes or 1 if sync_batchnorm is not None: rank_zero_deprecation( - "Argument `sync_batchnorm` in `DDPPlugin` is deprecated in v1.4, and will be removed in v1.6. " + "Argument `sync_batchnorm` in `DDPSpawnPlugin` is deprecated in v1.4, and will be removed in v1.6. " "Notice that it will be overriden by the trainer setting." ) self._sync_batchnorm = sync_batchnorm or False @@ -183,13 +188,6 @@ def new_process(self, process_idx, trainer, mp_queue): # ... need to double check that it is the correct place # self.trainer.call_setup_hook(self.model) - # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.distributed_backend}") - log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") - log.info("-" * 100) - # set the ranks and devices self.dist.rank = self.global_rank self.dist.device = self.root_device @@ -214,6 +212,9 @@ def post_dispatch(self): best_path = self.mp_queue.get() last_path = self.mp_queue.get() self._results = self.mp_queue.get() + # get the `callback_metrics` and set it to the trainer + # only in case the user does not override it. 
+        self.lightning_module.get_from_queue(self.mp_queue)

         # recover the weights of the processes trained in the children
         self.__recover_child_process_weights(best_path, last_path)
@@ -263,7 +264,7 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
         if not torch.distributed.is_initialized():
             log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
-            torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
+            torch.distributed.init_process_group(
+                self.torch_distributed_backend, rank=global_rank, world_size=world_size
+            )
+
+        # on rank=0 let everyone know training is starting
+        rank_zero_info(
+            f"{'-' * 100}\n"
+            f"distributed_backend={self.torch_distributed_backend}\n"
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
+            f"{'-' * 100}\n"
+        )

     def determine_ddp_device_ids(self):
         if self.root_device.type == "cpu":
@@ -274,6 +285,9 @@ def transfer_distrib_spawn_state_on_fit_end(self, results):
         checkpoint_callback = self.lightning_module.trainer.checkpoint_callback
         best_model_path = checkpoint_callback.best_model_path if checkpoint_callback else None

+        # the state_dict must be computed on all processes in case Metrics are present
+        state_dict = self.lightning_module.state_dict()
+
         if self.global_rank == 0 and self.mp_queue is not None:
             rank_zero_warn("cleaning up ddp environment...")
@@ -284,12 +298,13 @@ def transfer_distrib_spawn_state_on_fit_end(self, results):
                 and len(best_model_path) > 0
             ):
                 last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path)
-                atomic_save(self.on_save(self.lightning_module.state_dict()), last_path)
+                atomic_save(self.on_save(state_dict), last_path)

             # todo, pass complete checkpoint as state dictionary
             self.mp_queue.put(best_model_path)
             self.mp_queue.put(last_path)
             self.mp_queue.put(results)
+            self.lightning_module.add_to_queue(self.mp_queue)  # adds the `callback_metrics` to the queue

     def __recover_child_process_weights(self, best_path, last_path):
         # transfer back the best path to the trainer
@@ -302,9 +317,13 @@ def __recover_child_process_weights(self, best_path, last_path):
             ckpt = pl_load(last_path, map_location=lambda storage, loc: storage)
             self.lightning_module.load_state_dict(ckpt)

-    def barrier(self, *args, **kwargs):
-        if torch_distrib.is_initialized():
-            torch_distrib.barrier()
+    def barrier(self, *args, **kwargs) -> None:
+        if not distributed_available():
+            return
+        if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl":
+            torch.distributed.barrier(device_ids=self.determine_ddp_device_ids())
+        else:
+            torch.distributed.barrier()

     def broadcast(self, obj: object, src: int = 0) -> object:
         return self.dist.broadcast(obj)
@@ -319,7 +338,7 @@ def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, opti
         if not self.lightning_module.automatic_optimization and self.model.require_backward_grad_sync:
             prepare_for_backward(self.model, closure_loss)

-    def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"):
+    def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> torch.Tensor:
         """
         Reduces a tensor from several distributed processes to one aggregated tensor.
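For orientation, the spawn handoff changed in the hunks above is an ordering contract between the child process (`transfer_distrib_spawn_state_on_fit_end`) and the parent (`post_dispatch`): puts and gets on the `mp_queue` must happen in exactly the same sequence, with the new `add_to_queue`/`get_from_queue` pair appended at the end. A minimal standalone sketch of that contract (the `child` function and its payloads are illustrative, not code from this patch):

    import torch.multiprocessing as mp

    def child(queue):
        # puts mirror the parent's gets, in the same order
        queue.put("best.ckpt")              # best_model_path
        queue.put("last.tmp_end.ckpt")      # last_path
        queue.put({"status": "finished"})   # results
        queue.put({"val_loss": 0.123})      # extra state, e.g. the callback metrics

    if __name__ == "__main__":
        queue = mp.SimpleQueue()
        process = mp.Process(target=child, args=(queue,))
        process.start()
        process.join()
        best_path = queue.get()
        last_path = queue.get()
        results = queue.get()
        callback_metrics = queue.get()      # what `get_from_queue` consumes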
@@ -333,7 +352,7 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ reduced value, except when the input was not a tensor the output remains is unchanged """ if isinstance(tensor, torch.Tensor): - tensor = sync_ddp_if_available(tensor, group, reduce_op=(reduce_op or "mean")) + tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op) return tensor def training_step(self, *args, **kwargs): diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 8dd04aafa6b86..4d229e4bff43a 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -17,13 +17,12 @@ import os from collections import OrderedDict from pathlib import Path -from types import SimpleNamespace -from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Tuple, Union import torch +import pytorch_lightning as pl from pytorch_lightning.callbacks import GradientAccumulationScheduler -from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.ddp import DDPPlugin @@ -33,6 +32,7 @@ from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE +from pytorch_lightning.utilities.warnings import _warn, LightningDeprecationWarning if _DEEPSPEED_AVAILABLE: import deepspeed @@ -51,7 +51,7 @@ def remove_module_hooks(model: torch.nn.Module) -> None: class LightningDeepSpeedModule(_LightningModuleWrapperBase): - def __init__(self, pl_module: LightningModule, precision: int): + def __init__(self, pl_module: 'pl.LightningModule', precision: int) -> None: super().__init__(pl_module) self.precision = precision @@ -78,9 +78,23 @@ def __init__( self, zero_optimization: bool = True, stage: int = 2, - cpu_offload: bool = False, - cpu_offload_params: bool = False, - cpu_offload_use_pin_memory: bool = False, + remote_device: str = 'cpu', + offload_optimizer: bool = False, + offload_parameters: bool = False, + offload_params_device: str = 'cpu', + nvme_path: str = '/local_nvme', + params_buffer_count: int = 5, + params_buffer_size: int = 1e8, + max_in_cpu: int = 1e9, + offload_optimizer_device: str = 'cpu', + optimizer_buffer_count: int = 4, + block_size: int = 1048576, + queue_depth: int = 8, + single_submit: bool = False, + overlap_events: bool = True, + thread_count: int = 1, + pin_memory: bool = False, + sub_group_size: int = 1e12, contiguous_gradients: bool = True, overlap_comm: bool = True, allgather_partitions: bool = True, @@ -104,11 +118,14 @@ def __init__( contiguous_memory_optimization: bool = False, synchronize_checkpoint_boundary: bool = False, save_full_weights: bool = True, + cpu_offload: bool = False, + cpu_offload_params: bool = False, + cpu_offload_use_pin_memory: bool = False, ) -> None: """ Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. - `For more information: https://www.deepspeed.ai/`. + `For more information: https://pytorch-lightning.readthedocs.io/en/latest/advanced/multi_gpu.html#deepspeed`. .. 
warning::
    ``DeepSpeedPlugin`` is in beta and subject to change.

@@ -118,36 +135,81 @@ def __init__(

     Arguments:

-        zero_optimization: Enable ZeRO optimization. This is only compatible with precision=16. (default: True)
+        zero_optimization: Enable ZeRO optimization. This is only compatible with precision=16.

         stage: Different stages of the ZeRO Optimizer. 0 is disabled,
-            1 is optimizer state partitioning, 2 is optimizer+gradient state partitioning (default: 2)
+            1 is optimizer state partitioning, 2 is optimizer+gradient state partitioning,
+            3 is optimizer+gradient_parameter partitioning using the infinity engine.
+
+        remote_device: Device to instantiate the model on initially (``cpu`` or ``nvme``).
+
+        offload_optimizer: Enable offloading optimizer memory and computation to CPU or NVMe
+            based on ``offload_optimizer_device``.
+
+        offload_parameters: When using ZeRO Stage 3, enable offloading parameter memory and computation
+            to CPU or NVMe based on ``offload_params_device``.
+
+        offload_params_device: When offloading parameters choose the device to offload to, ``cpu`` or ``nvme``.
+
+        offload_optimizer_device: When offloading optimizer state choose the device to offload to,
+            ``cpu`` or ``nvme``.
+
+        params_buffer_count: Number of buffers in buffer pool for
+            parameter offloading when ``offload_params_device`` is ``nvme``.
+
+        params_buffer_size: Size of buffers in buffer pool for parameter offloading
+            when ``offload_params_device`` is ``nvme``.
+
+        max_in_cpu: Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled.

-        cpu_offload: Enable offloading optimizer memory and computation to CPU
+        nvme_path: Filesystem path for NVMe device for optimizer/parameter state offloading.

-        cpu_offload_params: When using ZeRO stage 3, offload parameters to CPU
+        optimizer_buffer_count: Number of buffers in buffer pool for optimizer state offloading
+            when ``offload_optimizer_device`` is set to ``nvme``.
+            This should be at least the number of states maintained per parameter by the optimizer.
+            For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance).

-        cpu_offload_use_pin_memory: When using ZeRO stage 3, pin memory on CPU
+        block_size: When using NVMe Offloading, the I/O block size in bytes.
+
+        queue_depth: When using NVMe Offloading, the I/O queue depth.
+
+        single_submit: When using NVMe Offloading,
+            submit requests to storage device as multiple individual requests,
+            as opposed to one block of requests.
+
+        overlap_events: When using NVMe Offloading,
+            submit requests to storage device in an overlapped fashion
+            without waiting for completion of earlier requests.
+
+        thread_count: When using NVMe Offloading,
+            intra-request parallelism for each read/write submitted by a user thread.
+
+        pin_memory: When using ZeRO stage 3, pin optimizer state memory on CPU.
+            This could boost throughput at the cost of extra memory overhead.
+
+        sub_group_size: When using ZeRO stage 3, defines the number of parameters
+            within a sub group to offload at a time.
+            Smaller numbers require more communication, but improve memory efficiency.

         contiguous_gradients: Copies gradients to a continuous buffer as they are produced.
-            Avoids memory fragmentation during backwards. Useful when training large models. (default: True)
+            Avoids memory fragmentation during backwards. Useful when training large models.

         overlap_comm: Overlap the reduction (synchronization) of gradients with the backwards computation.
- This is a speed optimization when training across multiple GPUs/machines. (default: True) + This is a speed optimization when training across multiple GPUs/machines. allgather_partitions: All gather updated parameters at the end of training step, - instead of using a series of broadcast collectives (default: True) + instead of using a series of broadcast collectives. - reduce_scatter: Use reduce/scatter instead of allreduce to average gradients (default:True) + reduce_scatter: Use reduce/scatter instead of allreduce to average gradients. allgather_bucket_size: Number of elements to allgather at once. - Used to limit the memory required for larger model sizes, with a tradeoff with speed. (default: 2e8) + Used to limit the memory required for larger model sizes, with a tradeoff with speed. reduce_bucket_size: Number of elements to reduce at once. - Used to limit the memory required for larger model sizes, with a tradeoff with speed (default: 2e8) + Used to limit the memory required for larger model sizes, with a tradeoff with speed. zero_allow_untested_optimizer: Allow untested optimizers to be used with ZeRO. Currently only Adam is a - DeepSpeed supported optimizer when using ZeRO (default: True) + DeepSpeed supported optimizer when using ZeRO. logging_batch_size_per_gpu: Config used in DeepSpeed to calculate verbose timing for logging on a per sample per second basis (only displayed if logging=logging.INFO). @@ -158,45 +220,56 @@ def __init__( config: Pass in a deepspeed formatted config dict, or path to a deepspeed config: https://www.deepspeed.ai/docs/config-json. - All defaults will be ignored if a config is passed in. (Default: ``None``) + All defaults will be ignored if a config is passed in. - logging_level: Set logging level for deepspeed. (Default: ``logging.WARN``) + logging_level: Set logging level for deepspeed. loss_scale: Loss scaling value for FP16 training. - 0.0 results in dynamic loss scaling, otherwise static (Default: 0) + 0.0 results in dynamic loss scaling, otherwise static. initial_scale_power: Power of the initial dynamic loss scale value. Loss scale is computed - by ``2^initial_scale_power`` (Default: 32) + by ``2^initial_scale_power``. - loss_scale_window: Window in which to raise/lower the dynamic FP16 loss scaling value (Default: 1000) + loss_scale_window: Window in which to raise/lower the dynamic FP16 loss scaling value. - hysteresis: FP16 Delay shift in Dynamic Loss scaling (Default: 2) + hysteresis: FP16 Delay shift in Dynamic Loss scaling. - min_loss_scale: The minimum FP16 dynamic loss scaling value (Default: 1000) + min_loss_scale: The minimum FP16 dynamic loss scaling value. - partition_activations: Enables partition activation when used with ZeRO stage 3. + partition_activations: Enables partition activation when used with ZeRO stage 3 and model parallelism. Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint. See `deepspeed tutorial - `_ + `_. - cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled + cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled. contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory. - Not supported by all models + Not supported by all models. synchronize_checkpoint_boundary: Insert :func:`torch.cuda.synchronize` at each checkpoint boundary. save_full_weights: Gathers weights across all processes before saving to disk when using ZeRO Stage 3. 
This allows a single weight file to contain the entire model, rather than individual sharded weight files. - Disable to save sharded states individually. (Default: True) - + Disable to save sharded states individually. """ if not _DEEPSPEED_AVAILABLE: raise MisconfigurationException( "To use the DeepSpeed plugin, you must have DeepSpeed installed." " pip install deepspeed" ) + + if cpu_offload or cpu_offload_params or cpu_offload_use_pin_memory: + _warn( + "The usage of `cpu_offload`, `cpu_offload_params`, and `cpu_offload_use_pin_memory` " + "is deprecated since v1.4 and will be removed in v1.5." + " From now on use `offload_optimizer`, `offload_parameters` and `pin_memory`.", + category=LightningDeprecationWarning + ) + offload_optimizer = cpu_offload + offload_parameters = cpu_offload_params + pin_memory = cpu_offload_use_pin_memory + super().__init__( parallel_devices=parallel_devices, num_nodes=num_nodes, cluster_environment=cluster_environment ) @@ -207,24 +280,38 @@ def __init__( zero_optimization, zero_allow_untested_optimizer, logging_batch_size_per_gpu, + offload_optimizer=offload_optimizer, + offload_parameters=offload_parameters, + nvme_path=nvme_path, + offload_params_device=offload_params_device, + params_buffer_count=params_buffer_count, + params_buffer_size=params_buffer_size, + max_in_cpu=max_in_cpu, + pin_memory=pin_memory, + offload_optimizer_device=offload_optimizer_device, + optimizer_buffer_count=optimizer_buffer_count, + block_size=block_size, + queue_depth=queue_depth, + single_submit=single_submit, + overlap_events=overlap_events, + thread_count=thread_count, partition_activations=partition_activations, cpu_checkpointing=cpu_checkpointing, contiguous_memory_optimization=contiguous_memory_optimization, synchronize_checkpoint_boundary=synchronize_checkpoint_boundary, stage=stage, - cpu_offload=cpu_offload, - cpu_offload_params=cpu_offload_params, - cpu_offload_use_pin_memory=cpu_offload_use_pin_memory, contiguous_gradients=contiguous_gradients, overlap_comm=overlap_comm, allgather_partitions=allgather_partitions, reduce_scatter=reduce_scatter, allgather_bucket_size=allgather_bucket_size, reduce_bucket_size=reduce_bucket_size, + sub_group_size=sub_group_size, ) self._config_initialized = False deepspeed.utils.logging.logger.setLevel(logging_level) + self.remote_device = remote_device self.save_full_weights = save_full_weights # default FP16 parameters. 
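To make the new offload surface easier to follow: the flags introduced above are folded into the DeepSpeed ZeRO config that `_create_default_config` assembles further down in this diff. Below is an illustrative sketch of the resulting ``zero_optimization`` section for stage 3 with both offload switches turned on, using the constructor defaults; the exact dict is built by the code later in this file:

    # Python dict mirroring the DeepSpeed JSON schema (https://www.deepspeed.ai/docs/config-json)
    zero_optimization = {
        "stage": 3,
        "offload_optimizer": {           # added when offload_optimizer=True
            "device": "cpu",             # offload_optimizer_device
            "nvme_path": "/local_nvme",  # nvme_path
            "buffer_count": 4,           # optimizer_buffer_count
            "pin_memory": False,         # pin_memory
        },
        "offload_param": {               # added when offload_parameters=True
            "device": "cpu",             # offload_params_device
            "nvme_path": "/local_nvme",  # nvme_path
            "buffer_count": 5,           # params_buffer_count
            "buffer_size": 1e8,          # params_buffer_size
            "max_in_cpu": 1e9,           # max_in_cpu
            "pin_memory": False,         # pin_memory
        },
    }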
@@ -247,22 +334,30 @@ def _load_config(self, config): config = json.load(f) return config + def setup_distributed(self): + super().setup_distributed() + if not self._config_initialized: + self._format_config() + self._config_initialized = True + if self.on_gpu: + torch.cuda.set_device(self.root_device) + def pre_dispatch(self): self.init_deepspeed() self.barrier() def init_deepspeed(self): - if not self._config_initialized: - self._format_config() - self._config_initialized = True - self._handle_gradient_accumulation_steps() precision = self.lightning_module.trainer.accelerator.precision model = LightningDeepSpeedModule(pl_module=self.model, precision=precision) - if self.on_gpu: - torch.cuda.set_device(self.root_device) + if self.zero_stage_3: + # Ensure the entire model has been moved to the appropriate device + dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 + deepspeed.zero.Init( + module=model, remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype + ) if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) @@ -287,6 +382,7 @@ def zero_stage_3(self) -> bool: def _initialize_deepspeed_train(self, model): optimizer, lightning_scheduler, optimizer_frequencies = None, None, None + if "optimizer" not in self.config: rank_zero_info( "You have not specified an optimizer or scheduler within the DeepSpeed config." @@ -295,12 +391,12 @@ def _initialize_deepspeed_train(self, model): optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer() model_parameters = filter(lambda p: p.requires_grad, self.model.parameters()) model, optimizer, _, lr_scheduler = deepspeed.initialize( - args=SimpleNamespace(local_rank=self.local_rank), + config=self.config, model=model, model_parameters=model_parameters, optimizer=optimizer, lr_scheduler=lightning_scheduler, - config_params=self.config, + dist_init_required=False ) self._set_deepspeed_activation_checkpointing() @@ -312,13 +408,21 @@ def _initialize_deepspeed_train(self, model): @contextlib.contextmanager def model_sharded_context(self) -> Generator[None, None, None]: if self.zero_stage_3: - model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True) + assert self._config_initialized + dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 + model_parallel_context = deepspeed.zero.Init( + remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype + ) else: model_parallel_context = super().model_sharded_context() with model_parallel_context: yield + @property + def precision(self) -> Union[str, int]: + return self.lightning_module.trainer.precision + def _set_deepspeed_activation_checkpointing(self): if self.config.get('activation_checkpointing'): checkpoint_config = self.config['activation_checkpointing'] @@ -353,12 +457,12 @@ def _initialize_deepspeed_inference(self, model): # Remove all module hooks before initializing new model remove_module_hooks(model) model, _, _, _ = deepspeed.initialize( - args=SimpleNamespace(local_rank=self.local_rank), + config=inference_config, model=model, optimizer=optimizer, lr_scheduler=lightning_scheduler, - config_params=inference_config, model_parameters=[], + dist_init_required=False ) self.model = model @@ -378,7 +482,7 @@ def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=self.world_size, rank=self.global_rank) return distributed_sampler_kwargs - def init_optimizers(self, 
trainer, model: LightningModule) -> Tuple[List, List, List]: + def init_optimizers(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> Tuple[List, List, List]: # Skip initializing optimizers here as DeepSpeed handles optimizers via config. # User may have specified config options instead in configure_optimizers, but this is handled # via `_initialize_deepspeed_train` @@ -469,6 +573,21 @@ def _create_default_config( cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool, + offload_optimizer: bool, + offload_parameters: bool, + nvme_path: str, + offload_params_device: str, + params_buffer_count: int, + params_buffer_size: int, + max_in_cpu: int, + offload_optimizer_device: str, + optimizer_buffer_count: int, + pin_memory: bool, + block_size: int, + queue_depth: int, + single_submit: bool, + overlap_events: bool, + thread_count: int, **zero_kwargs, ) -> Dict: cfg = { @@ -477,12 +596,37 @@ def _create_default_config( "cpu_checkpointing": cpu_checkpointing, "contiguous_memory_optimization": contiguous_memory_optimization, "synchronize_checkpoint_boundary": synchronize_checkpoint_boundary - } + }, + "aio": { + "block_size": block_size, + "queue_depth": queue_depth, + "single_submit": single_submit, + "overlap_events": overlap_events, + "thread_count": thread_count + }, } if zero_optimization: + zero_config = zero_kwargs + + if offload_optimizer: + zero_config["offload_optimizer"] = { + 'device': offload_optimizer_device, + 'nvme_path': nvme_path, + 'buffer_count': optimizer_buffer_count, + 'pin_memory': pin_memory + } + if offload_parameters: + zero_config['offload_param'] = { + 'device': offload_params_device, + 'nvme_path': nvme_path, + 'buffer_count': params_buffer_count, + 'buffer_size': params_buffer_size, + 'max_in_cpu': max_in_cpu, + 'pin_memory': pin_memory + } cfg = { "zero_allow_untested_optimizer": zero_allow_untested_optimizer, - "zero_optimization": zero_kwargs, + "zero_optimization": zero_config, **cfg } if logging_batch_size_per_gpu != 'auto': @@ -524,45 +668,41 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None: else: super().save_checkpoint(checkpoint, filepath) - def restore_model_state_from_ckpt_path( - self, - ckpt_path: str, - map_location: Callable = lambda storage, loc: storage, - ) -> Tuple[Dict, bool]: - if not self.save_full_weights and self.world_size > 1: - # Rely on deepspeed to load the checkpoint and necessary information - from pytorch_lightning.trainer.states import TrainerFn - is_fitting = self.lightning_module.trainer.state.fn == TrainerFn.FITTING - save_dir = self._filepath_to_dir(ckpt_path) - - if self.zero_stage_3: - # TODO: Currently required as this call is missing within the deepspeed engine. 
- self.deepspeed_engine.optimizer._partition_all_parameters() - - _, client_state = self.deepspeed_engine.load_checkpoint( - save_dir, load_optimizer_states=is_fitting, load_lr_scheduler_states=is_fitting - ) + def load_checkpoint_file(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: + if self.save_full_weights or self.world_size == 1: + # Broadcast to ensure we load from the rank 0 checkpoint + # This doesn't have to be the case when using deepspeed sharded checkpointing + checkpoint_path = self.broadcast(checkpoint_path) + return super().load_checkpoint_file(checkpoint_path) + + # Rely on deepspeed to load the checkpoint and necessary information + from pytorch_lightning.trainer.states import TrainerFn + is_fitting = self.lightning_module.trainer.state.fn == TrainerFn.FITTING + save_dir = self._filepath_to_dir(checkpoint_path) + + if self.zero_stage_3: + # TODO: Currently required as this call is missing within the deepspeed engine. + self.deepspeed_engine.optimizer._partition_all_parameters() - # restore datamodule states - if self.lightning_module.trainer.datamodule is not None: - self.lightning_module.trainer.datamodule.on_load_checkpoint(client_state) + _, client_state = self.deepspeed_engine.load_checkpoint( + save_dir, load_optimizer_states=is_fitting, load_lr_scheduler_states=is_fitting + ) + return client_state - # hook: give user access to checkpoint if needed. - self.lightning_module.on_load_checkpoint(client_state) - return client_state, False + def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + # override to do nothing, deepspeed engine already loaded the weights in `load_checkpoint_file()` + pass - # Broadcast to ensure we load from the rank 0 checkpoint - # This doesn't have to be the case when using deepspeed sharded checkpointing - ckpt_path = self.broadcast(ckpt_path) - return super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location) + def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + # override to do nothing, deepspeed engine already loaded the states in `load_checkpoint_file()` + pass def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: if self._original_accumulate_grad_batches is None: return super().update_global_step(total_batch_idx, current_global_step) - else: - if total_batch_idx % self._original_accumulate_grad_batches == 0: - current_global_step += 1 - return current_global_step + if total_batch_idx % self._original_accumulate_grad_batches == 0: + current_global_step += 1 + return current_global_step @classmethod def register_plugins(cls, plugin_registry: Dict) -> None: @@ -573,7 +713,7 @@ def register_plugins(cls, plugin_registry: Dict) -> None: cls, description="DeepSpeed ZeRO Stage 2 and CPU Offload", stage=2, - cpu_offload=True + offload_optimizer=True ) plugin_registry.register("deepspeed_stage_3", cls, description="DeepSpeed ZeRO Stage 3", stage=3) plugin_registry.register( @@ -581,5 +721,17 @@ def register_plugins(cls, plugin_registry: Dict) -> None: cls, description="DeepSpeed ZeRO Stage 3 and CPU Offload", stage=3, - cpu_offload=True + offload_optimizer=True, + offload_parameters=True, + ) + plugin_registry.register( + "deepspeed_stage_3_offload_nvme", + cls, + description="DeepSpeed ZeRO Stage 3 and NVMe Offload", + stage=3, + offload_optimizer=True, + offload_parameters=True, + remote_device='nvme', + offload_params_device='nvme', + offload_optimizer_device='nvme' ) diff --git a/pytorch_lightning/plugins/training_type/dp.py 
b/pytorch_lightning/plugins/training_type/dp.py
index 08caa7398ab8c..2787ab5644ccd 100644
--- a/pytorch_lightning/plugins/training_type/dp.py
+++ b/pytorch_lightning/plugins/training_type/dp.py
@@ -16,10 +16,11 @@
 import torch
 from torch.nn import DataParallel

-from pytorch_lightning.core.step_result import Result
 from pytorch_lightning.overrides.data_parallel import LightningParallelModule
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
 from pytorch_lightning.utilities.apply_func import apply_to_collection
+from pytorch_lightning.utilities.model_helpers import is_overridden
+from pytorch_lightning.utilities.types import _METRIC_COLLECTION


 class DataParallelPlugin(ParallelPlugin):
@@ -52,30 +53,24 @@ def setup(self, model):
         model.to(self.root_device)
         self._model = DataParallel(LightningParallelModule(model), self.parallel_devices)

-    def reduce(self, tensor, *args, **kwargs):
+    def reduce(self, collection: _METRIC_COLLECTION, *args, **kwargs) -> _METRIC_COLLECTION:
         """
-        Reduces a tensor from all parallel processes to one aggregated tensor.
+        Reduces a collection of tensors from all processes. It can be applied to just a single tensor.

         Args:
-            tensor: the tensor to sync and reduce
+            collection: The collection of tensors to sync and reduce.
             *args: ignored for DP
             **kwargs: ignored for DP

         Return:
-            reduced value, except when the input was not a tensor the output remains is unchanged
+            Reduced tensor values, or the input unchanged if it was not (or did not contain) a tensor.
         """
-        if isinstance(tensor, Result):
-            tensor.dp_reduce()
-        else:
+        def mean(t: torch.Tensor) -> torch.Tensor:
+            original_dtype = t.dtype
+            return t.float().mean().to(original_dtype)

-            def _reduce(t: torch.Tensor):
-                dtype_tensor = t.dtype
-                return t.float().mean().type(dtype_tensor)
-
-            tensor = apply_to_collection(tensor, torch.Tensor, _reduce)
-
-        return tensor
+        return apply_to_collection(collection, torch.Tensor, mean)

     @property
     def root_device(self):
@@ -107,10 +102,16 @@ def predict_step(self, *args, **kwargs):
         return self.model(*args, **kwargs)

     def training_step_end(self, output):
-        return self.reduce(output)
+        if not is_overridden("training_step_end", self.lightning_module):
+            return self.reduce(output)
+        return output

     def validation_step_end(self, output):
-        return self.reduce(output)
+        if not is_overridden("validation_step_end", self.lightning_module):
+            return self.reduce(output)
+        return output

     def test_step_end(self, output):
-        return self.reduce(output)
+        if not is_overridden("test_step_end", self.lightning_module):
+            return self.reduce(output)
+        return output
diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py
index 99899aed11753..a402f4b19a36f 100644
--- a/pytorch_lightning/plugins/training_type/horovod.py
+++ b/pytorch_lightning/plugins/training_type/horovod.py
@@ -15,13 +15,12 @@
 from typing import Any, List, Optional, Union

 import torch
-import torch.distributed as torch_distrib
 from torch.optim.lr_scheduler import _LRScheduler, Optimizer

 from pytorch_lightning.core.optimizer import LightningOptimizer
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
 from pytorch_lightning.utilities import _HOROVOD_AVAILABLE
-from pytorch_lightning.utilities.distributed import group, rank_zero_only, ReduceOp
+from pytorch_lightning.utilities.distributed import distributed_available, group, rank_zero_only, ReduceOp

 if _HOROVOD_AVAILABLE:
     import horovod.torch as hvd
@@ -125,7 +124,7 @@ def start_predicting(self,
trainer): self.join() def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): + if distributed_available(): self.join() def broadcast(self, obj: object, src: int = 0) -> object: diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py new file mode 100644 index 0000000000000..9de4e81447f0e --- /dev/null +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -0,0 +1,393 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import json +import os +from typing import Any, Iterable, List, Optional, Union + +import torch +from torch.utils.data import DataLoader + +import pytorch_lightning as pl +from pytorch_lightning.callbacks import GradientAccumulationScheduler +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.trainer.supporters import CombinedLoader +from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.cloud_io import get_filesystem +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +if _POPTORCH_AVAILABLE: + import poptorch + + +class LightningIPUModule(_LightningModuleWrapperBase): + + def __init__(self, pl_module: 'pl.LightningModule', precision: Union[str, int]): + super().__init__(pl_module) + self.precision = precision + + def forward(self, *inputs: Any, **kwargs: Any) -> Any: + if self.precision in ("mixed", 16): + inputs = self._move_float_tensors_to_half(inputs) + + return super().forward(*inputs, **kwargs) + + @staticmethod + def batch_to(data: torch.Tensor) -> torch.Tensor: + return data.half() + + def _move_float_tensors_to_half(self, batch: Any) -> Any: + batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) + return batch + + +class IPUPlugin(ParallelPlugin): + """ + Plugin for training on IPU devices. + """ + + def __init__( + self, + device_iterations: int = 1, + autoreport: bool = True, + autoreport_dir: Optional[str] = None, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + training_opts: Optional['poptorch.Options'] = None, + inference_opts: Optional['poptorch.Options'] = None + ) -> None: + """ + Arguments: + + device_iterations: Number of iterations to run on device at once before returning to host. + This can be used as an optimization to speed up training. 
+                https://docs.graphcore.ai/projects/poptorch-user-guide/en/0.1.67/batching.html
+            autoreport: Enable auto-reporting for IPUs using PopVision
+                https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html
+            autoreport_dir: Optional directory to store autoReport output.
+            training_opts: Optional ``poptorch.Options`` to override the default options created for training.
+            inference_opts: Optional ``poptorch.Options`` to override the default
+                options created for validation/testing and predicting.
+        """
+        super().__init__(parallel_devices, cluster_environment)
+        if not _POPTORCH_AVAILABLE or not poptorch.ipuHardwareIsAvailable():
+            raise MisconfigurationException(
+                "The IPU Accelerator requires IPU devices to run. "
+                "Learn more or get started with IPUs at https://www.graphcore.ai/getstarted"
+            )
+
+        self.device_iterations = device_iterations
+        self.autoreport = autoreport
+        self.autoreport_dir = autoreport_dir
+        self.poptorch_models = {}
+        self._original_accumulate_grad_batches = None
+        self._training_opts = training_opts
+        self._inference_opts = inference_opts
+
+        if self.autoreport:
+            options = {"autoReport.all": self.autoreport}
+            if self.autoreport_dir:
+                self._fs = get_filesystem(str(self.autoreport_dir))
+                self._fs.makedirs(self.autoreport_dir, exist_ok=True)
+                options["autoReport.directory"] = self.autoreport_dir
+            os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options)
+
+    def pre_dispatch(self) -> None:
+        precision = self.lightning_module.trainer.precision
+        model = LightningIPUModule(self.lightning_module, precision)
+        self.model = model
+
+        # Separate models are instantiated for different stages, but they share the same weights on host.
+        # When validation/test models are run, weights are synced first.
+
+        if self.lightning_module.trainer.state.stage is RunningStage.TRAINING:
+            # Create the model for the training stage.
+            optimizer = self.lightning_module.trainer.optimizers[0]
+            model = poptorch.trainingModel(model=model, options=self.training_opts, optimizer=optimizer)
+            self.poptorch_models[RunningStage.TRAINING] = model
+        for x in (RunningStage.VALIDATING, RunningStage.TESTING, RunningStage.PREDICTING):
+            model = poptorch.inferenceModel(
+                model=model,
+                options=self.inference_opts,
+            )
+            self.poptorch_models[x] = model
+        self._handle_gradient_accumulation_steps()
+
+    @property
+    def replication_factor(self):
+        return len(self.parallel_devices)
+
+    def _create_opts(self, training: bool):
+        opts = poptorch.Options()
+        opts.deviceIterations(self.device_iterations)
+        opts.replicationFactor(self.replication_factor)
+        gradient_accumulation = self.accumulate_grad_batches if training else 1
+        opts.Training.gradientAccumulation(gradient_accumulation)
+
+        if os.environ.get("PL_GLOBAL_SEED"):
+            opts.randomSeed(int(os.environ["PL_GLOBAL_SEED"]))
+        return opts
+
+    @property
+    def training_opts(self) -> 'poptorch.Options':
+        if self._training_opts is None:
+            self._training_opts = self._create_opts(training=True)
+        self._validate_opts(self._training_opts, training=True)
+        return self._training_opts
+
+    @property
+    def inference_opts(self) -> 'poptorch.Options':
+        if self._inference_opts is None:
+            self._inference_opts = self._create_opts(training=False)
+        self._validate_opts(self._inference_opts, training=False)
+        return self._inference_opts
+
+    def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None:
+        if opts is not None:
+            if opts.replication_factor != self.replication_factor:
+                rank_zero_warn(
+                    f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} "
+                    f"which differs from the ipus={self.replication_factor} flag passed to the Trainer. "
+                    f"Setting to {self.replication_factor} in the poptorch.Options."
+                )
+                opts.set(replication_factor=self.replication_factor)
+            if training:
+                accumulate_grad_batches = self.accumulate_grad_batches
+                if opts.Training.gradient_accumulation != accumulate_grad_batches:
+                    rank_zero_warn(
+                        f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. "
+                        f"This is different from accumulate_grad_batches which was set to {accumulate_grad_batches}. "
+                        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
+                        f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}"
+                    )
+                    opts.Training.set(gradient_accumulation=accumulate_grad_batches)
+            elif opts.Training.gradient_accumulation != 1:
+                rank_zero_warn(
+                    "Inference poptorch.Options should set gradientAccumulation to 1. "
+                    "Setting gradientAccumulation to 1 for inference options."
+                )
+                opts.Training.set(gradient_accumulation=1)
+
+    @property
+    def lightning_module(self) -> Optional['pl.LightningModule']:
+        return self.model.module if isinstance(self.model, LightningIPUModule) else self.model
+
+    def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+        return self.process_dataloader(dataloader)
+
+    def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+        return self.process_dataloader(dataloader)
+
+    def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+        return self.process_dataloader(dataloader)
+
+    def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+        return self.process_dataloader(dataloader)
+
+    def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+        if isinstance(dataloader, CombinedLoader):
+            dataloader.loaders = apply_to_collection(
+                dataloader.loaders,
+                DataLoader,
+                self.process_dataloader,
+            )
+            return dataloader
+        if isinstance(dataloader, list):
+            dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader)
+            return dataloader
+        if not isinstance(dataloader, poptorch.DataLoader):
+            is_training = self.lightning_module.trainer.training
+            opts = self.training_opts if is_training else self.inference_opts
+            dataloader = self._convert_to_poptorch_loader(dataloader=dataloader, opts=opts)
+        return dataloader
+
+    def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader],
+                                    opts: 'poptorch.Options') -> Union[Iterable, DataLoader]:
+        skip_keys = ('sampler', 'batch_sampler', 'dataset_kind')
+
+        attrs = {k: v for k, v in vars(dataloader).items() if not k.startswith("_")}
+
+        params = set(inspect.signature(dataloader.__init__).parameters)
+        contains_dataset = True
+
+        if type(dataloader) is not DataLoader:
+            contains_dataset = "dataset" in params
+            params.update(inspect.signature(DataLoader.__init__).parameters)
+
+        dl_args = {name: attrs[name] for name in params if name in attrs and name not in skip_keys}
+
+        multiprocessing_context = dataloader.multiprocessing_context
+        dl_args['multiprocessing_context'] = multiprocessing_context
+        if not contains_dataset:
+            dl_args.pop('dataset')
+        # Override to drop last uneven batch, as IPUs do not support uneven inputs.
+        dl_args['drop_last'] = True
+
+        dataloader = poptorch.DataLoader(**dl_args, options=opts)
+        dataloader.multiprocessing_context = multiprocessing_context
+        return dataloader
+
+    @property
+    def accumulate_grad_batches(self) -> int:
+        """
+        Lazily tracks the ``accumulate_grad_batches`` set on the trainer.
+        The IPUPlugin replaces the original accumulate_grad_batches.
+        """
+        if self._original_accumulate_grad_batches is None:
+            self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches
+            if not isinstance(self._original_accumulate_grad_batches, int):
+                raise MisconfigurationException(
+                    "IPUs currently only support accumulate_grad_batches being an integer value. "
+                    f"Received {self.accumulate_grad_batches}"
+                )
+        return self._original_accumulate_grad_batches
+
+    def _handle_gradient_accumulation_steps(self):
+        """
+        This function overrides the trainer.accumulation_scheduler to generate
+        ``accumulate_grad_batches=1``.
+        Therefore, ``optimizer_step`` will be called on every batch, and the IPU will handle grad accumulation.
+ """ + if self.accumulate_grad_batches > 1: + self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1}) + + def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: + if self.accumulate_grad_batches > 1: + if total_batch_idx % self.accumulate_grad_batches == 0: + current_global_step += 1 + return current_global_step + return super().update_global_step(total_batch_idx, current_global_step) + + @property + def _n_replicate(self): + opts = self.training_opts if self.lightning_module.training else self.inference_opts + accumulate_grad_batches = opts.Training.gradient_accumulation + device_iterations = opts.device_iterations + replication_factor = opts.replication_factor + return replication_factor * device_iterations * accumulate_grad_batches + + def _prepare_input(self, args: Any): + + def to_tuple(x): + return tuple(x) + + def to_tensor(x): + return torch.tensor(x).unsqueeze(0).repeat(self._n_replicate) + + args = apply_to_collection(args, dtype=list, function=to_tuple) + args = apply_to_collection(args, dtype=(int, float), function=to_tensor) + return args + + def training_step(self, *args, **kwargs): + args = self._prepare_input(args) + return self.poptorch_models[RunningStage.TRAINING](*args, **kwargs) + + def validation_step(self, *args, **kwargs): + args = self._prepare_input(args) + return self.poptorch_models[RunningStage.VALIDATING](*args, **kwargs) + + def test_step(self, *args, **kwargs): + args = self._prepare_input(args) + return self.poptorch_models[RunningStage.TESTING](*args, **kwargs) + + def predict_step(self, *args, **kwargs): + args = self._prepare_input(args) + return self.poptorch_models[RunningStage.PREDICTING](*args, **kwargs) + + def teardown(self) -> None: + for model in self.poptorch_models.values(): + model.destroy() + + def _compiled(self, model: Any): + # Required to ensure we only attach compiled models, as they are compiled lazily. + return model._executable is not None + + def _detach_models(self): + """ + Detaches all stage specific models from IPU devices. + """ + for k, model in self.poptorch_models.items(): + if self._compiled(model) and model.isAttachedToDevice(): + model.detachFromDevice() + + def _load_model(self, stage: str): + """ + Loads the stage specific accelerator model onto device if compiled and not attached to IPU devices. 
+        Args:
+            stage: The stage to load
+        """
+        self._detach_models()
+        model = self.poptorch_models[stage]
+        if self._compiled(model) and not model.isAttachedToDevice():
+            model.attachToDevice()
+
+    def on_train_start(self):
+        self._load_model(RunningStage.TRAINING)
+
+    def on_validation_start(self):
+        self._load_model(RunningStage.VALIDATING)
+
+    def on_test_start(self):
+        self._load_model(RunningStage.TESTING)
+
+    def on_predict_start(self):
+        self._load_model(RunningStage.PREDICTING)
+
+    def on_train_end(self):
+        self._detach_models()
+
+    def on_validation_end(self):
+        self._detach_models()
+
+    def on_test_end(self):
+        self._detach_models()
+
+    def on_predict_end(self):
+        self._detach_models()
+
+    def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
+        # Updates optimizer stats if LR scheduler modified the optimizer state
+        optimizer = self.lightning_module.trainer.optimizers[0]
+        self.poptorch_models[RunningStage.TRAINING].setOptimizer(optimizer)
+
+    @property
+    def on_gpu(self) -> bool:
+        return False
+
+    @property
+    def root_device(self) -> torch.device:
+        pass
+
+    def model_to_device(self) -> None:
+        pass
+
+    @property
+    def is_global_zero(self) -> bool:
+        return True
+
+    def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Any]:
+        return tensor
+
+    def barrier(self, name: Optional[str] = None) -> None:
+        pass
+
+    def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor:
+        return tensor
+
+    def broadcast(self, obj: object, src: int = 0) -> object:
+        return obj
diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py
index a8028e5be1a69..e1c9a7149d066 100644
--- a/pytorch_lightning/plugins/training_type/parallel.py
+++ b/pytorch_lightning/plugins/training_type/parallel.py
@@ -19,7 +19,7 @@
 import torch
 from torch.nn.parallel import DistributedDataParallel

-from pytorch_lightning.core.lightning import LightningModule
+import pytorch_lightning as pl
 from pytorch_lightning.overrides.base import unwrap_lightning_module
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin
@@ -81,6 +81,11 @@ def distributed_sampler_kwargs(self):
         distributed_sampler_kwargs = dict(num_replicas=len(self.parallel_devices), rank=self.global_rank)
         return distributed_sampler_kwargs

+    def reconciliate_processes(self, trace: str):
+        """
+        Reconciles processes after a failure.
+        """
+
     def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor:
         """Perform a all_gather on all processes """
         return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads)
@@ -99,7 +104,7 @@ def torch_distributed_backend(self):
         return torch_backend

     @staticmethod
-    def configure_sync_batchnorm(model: LightningModule) -> LightningModule:
+    def configure_sync_batchnorm(model: 'pl.LightningModule') -> 'pl.LightningModule':
         """
         Add global batchnorm for a model spread across multiple GPUs and nodes.
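A brief usage note on `configure_sync_batchnorm`, which the hunk below reduces to a single return: `torch.nn.SyncBatchNorm.convert_sync_batchnorm` walks the module tree and swaps every `BatchNorm*D` layer for `SyncBatchNorm`, returning the converted module; the conversion must happen before the model is wrapped in `DistributedDataParallel`. A minimal sketch with a toy model (the layer sizes are illustrative):

    import torch

    model = torch.nn.Sequential(
        torch.nn.Conv2d(3, 8, kernel_size=3),
        torch.nn.BatchNorm2d(8),  # replaced by torch.nn.SyncBatchNorm below
        torch.nn.ReLU(),
    )
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    assert isinstance(model[1], torch.nn.SyncBatchNorm)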
@@ -112,8 +117,7 @@ def configure_sync_batchnorm(model: LightningModule) -> LightningModule: Return: LightningModule with batchnorm layers synchronized between process groups """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) - return model + return torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) @contextmanager def block_backward_sync(self): @@ -133,5 +137,4 @@ def teardown(self) -> None: # GPU teardown self.lightning_module.cpu() # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py deleted file mode 100644 index 3e0f57daef001..0000000000000 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -from contextlib import suppress -from typing import Callable, List, Optional - -import torch - -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.plugins.training_type.ddp import DDPPlugin -from pytorch_lightning.utilities import _RPC_AVAILABLE - -DEFAULT_RPC_TIMEOUT_SEC = 60. -if _RPC_AVAILABLE: - from torch.distributed import rpc - - with suppress(ModuleNotFoundError, ImportError): - from torch.distributed.rpc.constants import DEFAULT_RPC_TIMEOUT_SEC - - -class RPCPlugin(DDPPlugin): - """ - Backbone for RPC Plugins built on top of DDP. - RPC introduces different communication behaviour than DDP. Unlike DDP, processes potentially are not - required to run the same code as the main process. - This leads to edge cases where logic needs to be re-defined. This class contains special cases - that need to be addressed when using RPC communication when building custom RPC Plugins. - """ - - def __init__( - self, - rpc_timeout_sec: float = DEFAULT_RPC_TIMEOUT_SEC, - parallel_devices: Optional[List[torch.device]] = None, - num_nodes: Optional[int] = None, - cluster_environment: Optional[ClusterEnvironment] = None, - sync_batchnorm: Optional[bool] = None, - **kwargs - ): - self.rpc_timeout_sec = rpc_timeout_sec - self._is_rpc_initialized = False - super().__init__( - parallel_devices=parallel_devices, - num_nodes=num_nodes, - cluster_environment=cluster_environment, - sync_batchnorm=sync_batchnorm, - **kwargs - ) - - def init_rpc_connection(self, global_rank: int, world_size: int) -> None: - os.environ['MASTER_PORT'] = os.getenv('RPC_MASTER_PORT', '15000') - rpc.init_rpc(f"worker{global_rank}", rank=global_rank, world_size=world_size) - rpc._set_rpc_timeout(self.rpc_timeout_sec) - self._is_rpc_initialized = True - - def rpc_save_model(self, trainer, save_model_fn: Callable, filepath: str) -> None: - """ - Override to save model to disk. - This is required as the main process will be required to handle aggregating model states from RPC processes. - - Args: - trainer: The trainer object. 
- save_model_fn: The saving function to save final model. - filepath: The filepath to save the model to. - """ - raise NotImplementedError - - def exit_rpc_process(self): - if self._is_rpc_initialized: - torch.distributed.rpc.shutdown() - self._is_rpc_initialized = False - - @property - def rpc_enabled(self) -> bool: - return True diff --git a/pytorch_lightning/plugins/training_type/rpc_sequential.py b/pytorch_lightning/plugins/training_type/rpc_sequential.py deleted file mode 100644 index a75839cbdb714..0000000000000 --- a/pytorch_lightning/plugins/training_type/rpc_sequential.py +++ /dev/null @@ -1,408 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License -import logging -import os -from typing import Callable, List, Optional - -import torch -import torch.distributed as torch_distrib -from torch import nn -from torch.nn.parallel import DistributedDataParallel -from torch.optim import Optimizer - -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.overrides.distributed import LightningDistributedModule -from pytorch_lightning.plugins.training_type.rpc import DEFAULT_RPC_TIMEOUT_SEC, RPCPlugin -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only -from pytorch_lightning.utilities.exceptions import MisconfigurationException - -if _FAIRSCALE_PIPE_AVAILABLE: - import fairscale.nn.model_parallel as mpu - from fairscale.nn import PipeRPCWrapper - from fairscale.nn.pipe import balance as pipe_balance - from fairscale.nn.pipe import rpc as rpc_pipe - from fairscale.nn.pipe.pipeline import PipelineStyle - -log = logging.getLogger(__name__) - - -class RPCSequentialPlugin(RPCPlugin): - - def __init__( - self, - balance: Optional[List[int]] = None, - microbatches: int = 8, - checkpoint: str = 'except_last', - balance_mode: str = "balance_by_size", - pipelined_backward: Optional[bool] = True, - rpc_timeout_sec: float = DEFAULT_RPC_TIMEOUT_SEC, - **kwargs - ): - """ - Provides sequential model parallelism for :class:`nn.Sequential ` module. - If the module requires lots of memory, Pipe can be used to reduce this by leveraging multiple GPUs. - - .. _RPCSequentialPlugin: https://arxiv.org/abs/1811.06965 - - Pipeline parallelism comes with with checkpointing to reduce peak - memory required to train while minimizing device under-utilization. - This is turned on by default and can be turned off via the checkpoint argument. - - You should determine the balance when defining the plugin, - or you can pass an example input array via the LightningModule to infer a balance. - The module will be partitioned into multiple devices according to the given balance. You may also rely on - your own heuristics to find your own optimal configuration. - - Args: - balance: The balance of the model, i.e [2, 2] (two layers on each GPU). - If not provided assumes user provides an input example array to find a balance on all GPUs. 
- - microbatches: Allows for parallelization to reduce device utilization - by splitting the batch into further smaller batches. - - checkpoint: Enables gradient checkpointing. ['always', 'except_last', 'never'] - - balance_mode: Type of balance heuristic to use if balance to be inferred. - - - 'balance_by_size': checks memory usage of each layer and determines balance - - - 'balance_by_time': checks time of each layer and determines balance - - pipelined_backward: if True, call torch.autograd.backward once per microbatch on the - - backward pass (instead of once for the whole batch). This works - around a potential deadlock in pytorch when using tensor parallelism - at the same time. Defaults to `True` if - `get_model_parallel_world_size() > 1` - """ - self._check_pipe_available() - super().__init__(rpc_timeout_sec=rpc_timeout_sec, **kwargs) - - self.balance = balance - - self.microbatches = microbatches - self.checkpoint = checkpoint - self.balance_mode = balance_mode - self.pipelined_backward = pipelined_backward - self._main_rpc_process = True - - def init_ddp_connection( - self, - global_rank: Optional[int] = None, - world_size: Optional[int] = None, - ) -> None: - if self.lightning_module.trainer.amp_backend is not None: - raise MisconfigurationException( - '`RPCSequentialPlugin` is currently not supported in Automatic Mixed Precision' - ) - - if self._skip_init_connections(): - return - - global_rank = global_rank if global_rank is not None else self.cluster_environment.global_rank() - world_size = world_size if world_size is not None else self.cluster_environment.world_size() - super().init_ddp_connection(global_rank, world_size) - super().init_rpc_connection(global_rank=global_rank, world_size=world_size) - model = self.lightning_module - self.gpus_per_model = self._infer_check_num_gpus() - self.init_model_parallel_groups() - self.set_main_rpc_process() - - self._check_sequential_model_exists(model) - - # check if user given balance is valid - if self.balance is not None: - self._assert_valid_model_balance() - - if self.main_rpc_process: - if self.balance is None: - self._infer_model_balance() - self.init_pipe_module() - else: - self.handle_transferred_pipe_module() - self.exit_rpc_process() - - def _infer_model_balance(self): - log.info(f'Inferring model balance using {self.balance_mode} mode') - model = self.lightning_module - if model.example_input_array is None: - raise MisconfigurationException( - 'Please set example_input_array to your model, so we can infer the right model balance for you' - ) - balance_func = getattr(pipe_balance, self.balance_mode) - self.balance = balance_func(self.gpus_per_model, model.sequential_module, model.example_input_array) - self._sync_balance_to_all_parallel_groups() - - log.info(f'The following model balance {self.balance.tolist()} was inferred using {self.balance_mode} mode') - - def _sync_balance_to_all_parallel_groups(self, main_rank=0): - """ - Ensures that we sync the balance to all main processes, so that the balance is the same per replica. - Args: - main_rank: The rank with the balance we'd like to replicate. 
- """ - self.balance = torch.tensor(self.balance, dtype=torch.int, device='cuda') - # Ensure we sync to all processes within the main data parallel group - # We use the data parallel group as all main processes are found within the same group - torch_distrib.broadcast(self.balance, src=main_rank, group=mpu.get_data_parallel_group()) - self.balance = self.balance.cpu() - - def _check_sequential_model_exists(self, model): - if not hasattr(model, "sequential_module") or not isinstance(model.sequential_module, nn.Sequential): - raise MisconfigurationException( - 'Could not find a PipeLightningModule within the model. ' - 'Did you set your sequential model as the `sequential_module` attribute of your model?' - ) - - def _find_and_init_pipe_module(self, model): - if hasattr(model, "sequential_module") and isinstance(model.sequential_module, LightningPipeModule): - # model has been wrapped already - return - elif hasattr(model, "sequential_module") and isinstance(model.sequential_module, nn.Sequential): - # try to wrap model for the user - model.sequential_module = LightningPipeModule( - model.sequential_module, - balance=self.balance, - microbatches=self.microbatches, - checkpoint=self.checkpoint, - ) - # Update references for workers to access correct lightning functions when calling RPC - model.sequential_module.trainer = model.trainer - model.sequential_module.configure_optimizers = model.configure_optimizers - - # Update references for main process to access correct lightning functions when calling RPC - model.sequential_module.module.model.trainer = model.trainer - model.sequential_module.module.model.configure_optimizers = model.configure_optimizers - - self.model = model - - else: - raise MisconfigurationException( - 'Could not find a PipeLightningModule within the model. ' - 'Did you defined set your sequential model as a `sequential_module` attribute of your model?' - ) - - def _assert_valid_model_balance(self): - model = self.lightning_module - if sum(self.balance) != len(model.sequential_module): - raise MisconfigurationException( - f'The provided balance sum: {sum(self.balance)} does not' - f' match your Sequential length: {len(model.sequential_module)}' - ) - - def _skip_init_connections(self): - """ - Skip initialization if torch is already initialized and we're in testing. - Returns: Whether to skip initialization - - """ - return torch_distrib.is_initialized() and self.lightning_module.trainer.state.fn != TrainerFn.FITTING - - def init_model_parallel_groups(self): - num_model_parallel = 1 # TODO currently no support for vertical model parallel - mpu.initialize_model_parallel(model_parallel_size_=num_model_parallel, pipeline_length=self.gpus_per_model) - - def _infer_check_num_gpus(self): - """ - Infer the number of GPUs per model. 
- - Returns: The appropriate balance for the model - """ - if isinstance(self.balance, list): - if len(self.balance) != (self.world_size / self.num_nodes): - raise MisconfigurationException( - "Pipe currently only supports splitting the module onto all available GPUs" - ) - # User has defined a balance for his model - return len(self.balance) - # Assume that the user wants to balance his model on all GPUs - return self.world_size - - def handle_transferred_pipe_module(self) -> None: - if self.lightning_module.trainer.state.fn == TrainerFn.FITTING: - torch_distrib.barrier() # Ensure we await main process initialization - # Add trainer/configure_optimizers to the pipe model for access in all worker processes - rpc_pipe.PipeModel.trainer = self.lightning_module.trainer - del rpc_pipe.PipeModel.trainer.model.sequential_module - rpc_pipe.PipeModel.trainer.model.sequential_module = rpc_pipe.PipeModel - rpc_pipe.PipeModel.configure_optimizers = self.lightning_module.configure_optimizers - - def init_pipe_module(self) -> None: - # Create pipe_module - model = self.lightning_module - self._find_and_init_pipe_module(model) - if self.lightning_module.trainer.state.fn == TrainerFn.FITTING: - torch_distrib.barrier() # Ensure we join main process initialization - model.sequential_module.foreach_worker(register_optimizers, include_self=True) - - # TODO: Move this to the connector - - def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): - """Run before precision plugin executes backward""" - - def configure_ddp(self): - if self.main_rpc_process: - self.pre_configure_ddp() - - self._model = DistributedDataParallel( - LightningDistributedModule(self.model), - device_ids=self.determine_ddp_device_ids(), - process_group=mpu.get_data_parallel_group(), - **self._ddp_kwargs, - ) - # Plugin handle backwards across processes. 
Currently not supported for DDP + pipe parallel - self._model.require_backward_grad_sync = False - - @rank_zero_only - def rpc_save_model(self, trainer, save_model_fn: Callable, filepath: str) -> None: - model = self.lightning_module - if not hasattr(model.sequential_module, "foreach_worker"): - return - current_layers = model.sequential_module - model.sequential_module.foreach_worker( - save_layers_on_all_rank_zero_workers, {"gpus_per_model": self.gpus_per_model}, include_self=True - ) - model.sequential_module = load_sequential_from_saved_layers(self.gpus_per_model) - save_model_fn(trainer, filepath) - model.sequential_module = current_layers - - def worker_optimizer_step(self, model: LightningModule, opt_idx: int, *args, **kwargs) -> None: - model.sequential_module.foreach_worker( - run_optimizer, { - "opt_idx": opt_idx, - "args": args, - "kwargs": kwargs - }, include_self=False - ) - - @property - def distributed_sampler_kwargs(self): - return dict( - num_replicas=mpu.get_data_parallel_world_size(), - rank=mpu.get_data_parallel_rank(), - ) - - @property - def data_parallel_group(self): - return mpu.get_data_parallel_group() - - def set_main_rpc_process(self): - self.main_rpc_process = torch_distrib.get_rank(group=mpu.get_pipeline_parallel_group()) == 0 - - @property - def main_rpc_process(self) -> bool: - return self._main_rpc_process - - @main_rpc_process.setter - def main_rpc_process(self, is_main_process): - self._main_rpc_process = is_main_process - - def barrier(self, name: Optional[str] = None) -> None: - if torch_distrib.is_initialized() and self.main_rpc_process: - torch_distrib.barrier(group=self.data_parallel_group) - - def _check_pipe_available(self): - if not _FAIRSCALE_PIPE_AVAILABLE: - raise MisconfigurationException( - 'PipeRPCPlugin requires FairScale and currently is only supported on PyTorch 1.6.' - ) - - def post_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, **kwargs) -> None: - """Hook to do something after each optimizer step.""" - if self.rpc_enabled and self.main_rpc_process: - # Initialize optimizer step on main process - self.worker_optimizer_step(model=self.lightning_module, opt_idx=optimizer_idx, **kwargs) - - def post_training_step(self): - if self.main_rpc_process: - super().post_training_step() - - def start_training(self, trainer) -> None: - if self.main_rpc_process: - super().start_training(trainer) - - def start_evaluating(self, trainer) -> None: - if self.main_rpc_process: - super().start_evaluating(trainer) - - -class LightningPipeModule(nn.Module): - """ - This class wraps Fairscale Pipe and PipeRCPWrapper class. - """ - - def __init__(self, module: nn.Sequential, balance: List[int], microbatches: int = 8, checkpoint='never'): - super().__init__() - self.module = module - self.balance = balance - self.microbatches = microbatches - self.checkpoint = checkpoint - self._init_pipe() - - def _init_pipe(self): - device = torch.device("cuda", torch_distrib.get_rank()) - - self.module = PipeRPCWrapper( - module=self.module, - balance=self.balance, - chunks=self.microbatches, - style=PipelineStyle.MultiProcess, - input_device=device, - worker_map=self.get_worker_map(), - checkpoint=self.checkpoint, - ) - - def foreach_worker(self, *args, **kwargs): - self.module.foreach_worker(*args, **kwargs) - - def forward(self, *args, **kwargs): - return self.module(*args, **kwargs) - - def get_worker_map(self): - # TODO, is this correct with multinodes? 
We also assume "worker" is the same as defined in the RPCPlugin - return {rank: f"worker{rank}" for rank in range(torch_distrib.get_world_size())} - - -def register_optimizers(ctx, model): - optimizers, lr_schedulers, optimizer_frequencies = model.trainer.init_optimizers(model) - model.trainer.optimizers = optimizers - model.trainer.lr_schedulers = lr_schedulers - model.trainer.optimizer_frequencies = optimizer_frequencies - - -def run_optimizer(ctx, model): - trainer = model.trainer - opt_idx = ctx["opt_idx"] - optimizer = trainer.optimizers[opt_idx] - optimizer.step(*ctx["args"], **ctx["kwargs"]) - - -def save_layers_on_all_rank_zero_workers(ctx, model): - gpus_per_model = ctx["gpus_per_model"] - rank = torch_distrib.get_rank() - if rank in range(gpus_per_model): - seq = list(model.children())[0] - torch.save(seq, f"seq_{rank}.pt") - - -def load_sequential_from_saved_layers(gpus_per_model): - partial_seqs = [torch.load(f"seq_{rank}.pt", map_location='cpu') for rank in range(gpus_per_model)] - seq = nn.Sequential() - for p_seq in partial_seqs: - for name, child in p_seq.named_children(): - seq.add_module(name, child) - # delete tmp files - [os.remove(f"seq_{rank}.pt") for rank in range(gpus_per_model)] - return seq diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index 02da937286dcc..7e5796d5b5668 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -16,7 +16,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.trainer.states import TrainerFn @@ -54,7 +54,8 @@ def _reinit_optimizers_with_oss(self): optim_class = type(optimizer) zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) if _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE: - is_fp16 = self.lightning_module.trainer.precision == 16 + precision = self.lightning_module.trainer.precision + is_fp16 = precision in ("mixed", 16) # For multi-node training, compressing the model shards in fp16 before broadcasting # improves performance. When using PyTorch AMP, it will not degrade # the model performance. @@ -85,7 +86,7 @@ def _optim_state_dict(self, optimizer): return optimizer.state_dict() @property - def lightning_module(self) -> LightningModule: + def lightning_module(self) -> 'pl.LightningModule': if not _FAIRSCALE_AVAILABLE: # pragma: no cover raise MisconfigurationException( "`DDPShardedPlugin` requires `fairscale` to be installed." 
diff --git a/pytorch_lightning/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py index 5daf4e5be3735..c583ac756cd0f 100644 --- a/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -16,7 +16,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.trainer.states import TrainerFn @@ -71,7 +71,7 @@ def _optim_state_dict(self, optimizer): return optimizer.state_dict() @property - def lightning_module(self) -> LightningModule: + def lightning_module(self) -> 'pl.LightningModule': if not _FAIRSCALE_AVAILABLE: # pragma: no cover raise MisconfigurationException( "`DDPSpawnShardedPlugin` requires `fairscale` to be installed." diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 1816f5838c948..d4a328902eba0 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -85,5 +85,4 @@ def teardown(self) -> None: # GPU teardown self.lightning_module.cpu() # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index 99abff992ebeb..afc692951ce80 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -12,17 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. import os - -import torch +from typing import Any, Dict from pytorch_lightning.core.decorators import parameter_validation from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin -from pytorch_lightning.utilities import _TPU_AVAILABLE -from pytorch_lightning.utilities.apply_func import move_data_to_device +from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE +from pytorch_lightning.utilities.apply_func import apply_to_collection if _TPU_AVAILABLE: import torch_xla.core.xla_model as xm +if _OMEGACONF_AVAILABLE: + from omegaconf import DictConfig, ListConfig, OmegaConf + class SingleTPUPlugin(SingleDevicePlugin): """ Plugin for training on a single TPU device. """ @@ -54,13 +56,20 @@ def pre_dispatch(self) -> None: self.tpu_local_core_rank = xm.get_local_ordinal() self.tpu_global_core_rank = xm.get_ordinal() - def on_save(self, checkpoint: dict) -> dict: - """ - Move XLA tensors to CPU before saving - Recommended on XLA Guide: - https://github.com/pytorch/xla/blob/master/API_GUIDE.md#saving-and-loading-xla-tensors + def save(self, state_dict: Dict, path: str) -> None: + xm.save(state_dict, path) + + def save_checkpoint(self, checkpoint: Dict[str, Any], filepath: str) -> None: + """Save model/training states as a checkpoint file through state-dump and file-write. 
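+
+        Saving delegates to ``xm.save`` through :meth:`save`, which takes care of
+        moving XLA tensors off the device before writing. A hedged usage sketch
+        (``ckpt`` stands in for a fully assembled checkpoint dict)::
+
+            plugin.save_checkpoint(ckpt, "model.ckpt")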
+
+        Args:
+            checkpoint: dict containing model and trainer state
+            filepath: write-target file's path
         """
-        return move_data_to_device(checkpoint, torch.device("cpu"))
+        # Related Issue: https://github.com/pytorch/xla/issues/2773
+        if _OMEGACONF_AVAILABLE:
+            checkpoint = apply_to_collection(checkpoint, (DictConfig, ListConfig), OmegaConf.to_container)
+        self.save({k: v for k, v in checkpoint.items() if k != "callbacks"}, filepath)
 
     def teardown(self) -> None:
         # TPU teardown
diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py
index 9a27e6230b201..2a30ddce23841 100644
--- a/pytorch_lightning/plugins/training_type/tpu_spawn.py
+++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -52,7 +52,7 @@ class TPUSpawnPlugin(DDPSpawnPlugin):
     """ Plugin for training multiple TPU devices using the :func:`torch.multiprocessing.spawn` method. """
 
     def __init__(self, parallel_devices: Optional[List[int]] = None, debug: bool = False, **_: Any) -> None:
-        super().__init__(parallel_devices, num_nodes=1, cluster_environment=None, sync_batchnorm=False)
+        super().__init__(parallel_devices)
         self.debug = debug
         self.tpu_local_core_rank = 0
         self.tpu_global_core_rank = 0
@@ -185,6 +185,9 @@ def transfer_distrib_spawn_state_on_fit_end(self, results):
         checkpoint_callback = self.lightning_module.trainer.checkpoint_callback
         best_model_path = checkpoint_callback.best_model_path if checkpoint_callback else None
 
+        # the state_dict must be computed on all processes in case Metrics are present
+        state_dict = self.lightning_module.state_dict()
+
         if self.mp_queue is not None:
             rank_zero_warn("cleaning up tpu spawn environment...")
 
@@ -195,13 +198,14 @@ def transfer_distrib_spawn_state_on_fit_end(self, results):
                 and len(best_model_path) > 0
             ):
                 last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path)
-                self.save(self.lightning_module.state_dict(), last_path)
+                self.save(state_dict, last_path)
 
             if self.local_rank == 0:
                 # todo, pass complete checkpoint as state dictionary
                 self.mp_queue.put(best_model_path)
                 self.mp_queue.put(last_path)
                 self.mp_queue.put(results)
+                self.lightning_module.add_to_queue(self.mp_queue)  # adds the `callback_metrics` to the queue
 
     def save(self, state_dict: Dict, path: str) -> None:
         xm.save(state_dict, path)
@@ -312,3 +316,7 @@ def teardown(self) -> None:
     @property
     def should_rank_save_checkpoint(self) -> bool:
         return self.local_rank == 0
+
+    @classmethod
+    def register_plugins(cls, plugin_registry: Dict) -> None:
+        plugin_registry.register("tpu_spawn_debug", cls, description="TPUSpawn Plugin with `debug` as True", debug=True)
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 8d27fd4ac6a2f..e7ca73bc9f40d 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -13,7 +13,8 @@
 # limitations under the License.
import contextlib from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Generator, Iterable, Optional, Tuple, TypeVar, Union +from pathlib import Path +from typing import Any, Callable, Dict, Generator, Iterable, Mapping, Optional, TypeVar, Union import torch from torch import Tensor @@ -144,9 +145,16 @@ def results(self) -> Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]]: """ return self._results - @property - def rpc_enabled(self) -> bool: - return False + def load_checkpoint_file(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: + return pl_load(checkpoint_path, map_location=(lambda storage, loc: storage)) + + def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + self.lightning_module.load_state_dict(checkpoint["state_dict"]) + + def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + optimizer_states = checkpoint["optimizer_states"] + for optimizer, opt_state in zip(self.lightning_module.trainer.accelerator.optimizers, optimizer_states): + optimizer.load_state_dict(opt_state) def start_training(self, trainer: 'pl.Trainer') -> None: # double dispatch to initiate the training loop @@ -161,19 +169,19 @@ def start_predicting(self, trainer: 'pl.Trainer') -> None: self._results = trainer.run_stage() def training_step(self, *args, **kwargs): - return self.lightning_module.training_step(*args, **kwargs) + return self.model.training_step(*args, **kwargs) def post_training_step(self): pass def validation_step(self, *args, **kwargs): - return self.lightning_module.validation_step(*args, **kwargs) + return self.model.validation_step(*args, **kwargs) def test_step(self, *args, **kwargs): - return self.lightning_module.test_step(*args, **kwargs) + return self.model.test_step(*args, **kwargs) def predict_step(self, *args, **kwargs): - return self.lightning_module.predict_step(*args, **kwargs) + return self.model.predict_step(*args, **kwargs) def training_step_end(self, output): return output @@ -195,6 +203,22 @@ def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[I """ return dataloader + def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + """Called before resetting the train dataloader.""" + return dataloader + + def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + """Called before resetting the val dataloader.""" + return dataloader + + def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + """Called before resetting the test dataloader.""" + return dataloader + + def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + """Called before resetting the predict dataloader.""" + return dataloader + def init_optimizers(self, trainer: 'pl.Trainer', model: 'pl.LightningModule'): return trainer.init_optimizers(model) @@ -211,33 +235,6 @@ def setup_optimizers_in_pre_dispatch(self) -> bool: """ return False - def restore_model_state_from_ckpt_path( - self, - ckpt_path: str, - map_location: Callable = lambda storage, loc: storage, - ) -> Tuple[Dict, bool]: - """ - This function is used to load and restore the model state. 
-
-        Args:
-            ckpt_path: Path to a checkpoint
-            map_location: lambda function to map checkpoint location
-
-        Return
-            checkpoint: Return loaded checkpoint
-            bool: Wether to load optimizer / lr_schedulers states from checkpoint
-
-        """
-        ckpt = pl_load(ckpt_path, map_location=map_location)
-        # restore datamodule states
-        if self.lightning_module.trainer.datamodule is not None:
-            self.lightning_module.trainer.datamodule.on_load_checkpoint(ckpt)
-
-        # hook: give user access to checkpoint if needed.
-        self.lightning_module.on_load_checkpoint(ckpt)
-        self.lightning_module.load_state_dict(ckpt['state_dict'])
-        return ckpt, True
-
     def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
         """
         Provide a hook to count optimizer step calls.
@@ -314,3 +311,41 @@ def register_plugins(cls, plugin_registry):
     def should_rank_save_checkpoint(self) -> bool:
         """Returns whether the checkpoint should be saved (rank based)"""
         return self.is_global_zero
+
+    def on_train_start(self) -> None:
+        """Called when train begins."""
+        pass
+
+    def on_validation_start(self) -> None:
+        """Called when validation begins."""
+        pass
+
+    def on_test_start(self) -> None:
+        """Called when test begins."""
+        pass
+
+    def on_predict_start(self) -> None:
+        """Called when predict begins."""
+        pass
+
+    def on_train_end(self) -> None:
+        """Called when train ends."""
+        pass
+
+    def on_validation_end(self) -> None:
+        """Called when validation ends."""
+        pass
+
+    def on_test_end(self) -> None:
+        """Called when test ends."""
+        pass
+
+    def on_predict_end(self) -> None:
+        """Called when predict ends."""
+        pass
+
+    def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
+        """
+        Called in the training loop before anything happens for that batch.
+        """
+        pass
diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 6ac6e16c18529..a21b3f173c26e 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -194,14 +194,18 @@ def custom_processing_step(self, data):
     python -c 'import torch; print(torch.autograd.profiler.load_nvprof("trace_name.prof"))'
 
 """
-
-from pytorch_lightning.profiler.profilers import AdvancedProfiler, BaseProfiler, PassThroughProfiler, SimpleProfiler
+from pytorch_lightning.profiler.advanced import AdvancedProfiler
+from pytorch_lightning.profiler.base import AbstractProfiler, BaseProfiler, PassThroughProfiler
 from pytorch_lightning.profiler.pytorch import PyTorchProfiler
+from pytorch_lightning.profiler.simple import SimpleProfiler
+from pytorch_lightning.profiler.xla import XLAProfiler
 
 __all__ = [
+    'AbstractProfiler',
     'BaseProfiler',
-    'SimpleProfiler',
     'AdvancedProfiler',
     'PassThroughProfiler',
-    "PyTorchProfiler",
+    'PyTorchProfiler',
+    'SimpleProfiler',
+    'XLAProfiler',
 ]
diff --git a/pytorch_lightning/profiler/advanced.py b/pytorch_lightning/profiler/advanced.py
new file mode 100644
index 0000000000000..3a017d72ff5e0
--- /dev/null
+++ b/pytorch_lightning/profiler/advanced.py
@@ -0,0 +1,92 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Profiler to check if there are any bottlenecks in your code."""
+import cProfile
+import io
+import logging
+import pstats
+from pathlib import Path
+from typing import Dict, Optional, Union
+
+from pytorch_lightning.profiler.base import BaseProfiler
+
+log = logging.getLogger(__name__)
+
+
+class AdvancedProfiler(BaseProfiler):
+    """
+    This profiler uses Python's ``cProfile`` to record more detailed information about
+    time spent in each function call recorded during a given action. The output is quite
+    verbose and you should only use this if you want very detailed reports.
+    """
+
+    def __init__(
+        self,
+        dirpath: Optional[Union[str, Path]] = None,
+        filename: Optional[str] = None,
+        line_count_restriction: float = 1.0,
+        output_filename: Optional[str] = None,
+    ) -> None:
+        """
+        Args:
+            dirpath: Directory path for the ``filename``. If ``dirpath`` is ``None`` but ``filename`` is present, the
+                ``trainer.log_dir`` (from :class:`~pytorch_lightning.loggers.tensorboard.TensorBoardLogger`)
+                will be used.
+
+            filename: If present, filename where the profiler results will be saved instead of printing to stdout.
+                The ``.txt`` extension will be used automatically.
+
+            line_count_restriction: this can be used to limit the number of functions
+                reported for each action: either an integer (to select a count of lines),
+                or a decimal fraction between 0.0 and 1.0 inclusive (to select a percentage of lines)
+
+        Raises:
+            ValueError:
+                If you attempt to stop recording an action which was never started.
+        """
+        super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename)
+        self.profiled_actions: Dict[str, cProfile.Profile] = {}
+        self.line_count_restriction = line_count_restriction
+
+    def start(self, action_name: str) -> None:
+        if action_name not in self.profiled_actions:
+            self.profiled_actions[action_name] = cProfile.Profile()
+        self.profiled_actions[action_name].enable()
+
+    def stop(self, action_name: str) -> None:
+        pr = self.profiled_actions.get(action_name)
+        if pr is None:
+            raise ValueError(f"Attempting to stop recording an action ({action_name}) which was never started.")
+        pr.disable()
+
+    def summary(self) -> str:
+        recorded_stats = {}
+        for action_name, pr in self.profiled_actions.items():
+            s = io.StringIO()
+            ps = pstats.Stats(pr, stream=s).strip_dirs().sort_stats('cumulative')
+            ps.print_stats(self.line_count_restriction)
+            recorded_stats[action_name] = s.getvalue()
+        return self._stats_to_str(recorded_stats)
+
+    def teardown(self, stage: Optional[str] = None) -> None:
+        super().teardown(stage=stage)
+        self.profiled_actions = {}
+
+    def __reduce__(self):
+        # avoids `TypeError: cannot pickle 'cProfile.Profile' object`
+        return (
+            self.__class__,
+            tuple(),
+            dict(dirpath=self.dirpath, filename=self.filename, line_count_restriction=self.line_count_restriction),
+        )
diff --git a/pytorch_lightning/profiler/base.py b/pytorch_lightning/profiler/base.py
new file mode 100644
index 0000000000000..8b5bf5483d976
--- /dev/null
+++ b/pytorch_lightning/profiler/base.py
@@ -0,0 +1,219 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Profiler to check if there are any bottlenecks in your code.""" +import logging +import os +from abc import ABC, abstractmethod +from contextlib import contextmanager +from pathlib import Path +from typing import Any, Callable, Dict, Optional, TextIO, Union + +from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.cloud_io import get_filesystem + +log = logging.getLogger(__name__) + + +class AbstractProfiler(ABC): + """Specification of a profiler.""" + + @abstractmethod + def start(self, action_name: str) -> None: + """Defines how to start recording an action.""" + + @abstractmethod + def stop(self, action_name: str) -> None: + """Defines how to record the duration once an action is complete.""" + + @abstractmethod + def summary(self) -> str: + """Create profiler summary in text format.""" + + @abstractmethod + def setup(self, **kwargs: Any) -> None: + """Execute arbitrary pre-profiling set-up steps as defined by subclass.""" + + @abstractmethod + def teardown(self, **kwargs: Any) -> None: + """Execute arbitrary post-profiling tear-down steps as defined by subclass.""" + + +class BaseProfiler(AbstractProfiler): + """ + If you wish to write a custom profiler, you should inherit from this class. + """ + + def __init__( + self, + dirpath: Optional[Union[str, Path]] = None, + filename: Optional[str] = None, + output_filename: Optional[str] = None, + ) -> None: + self.dirpath = dirpath + self.filename = filename + if output_filename is not None: + rank_zero_deprecation( + "`Profiler` signature has changed in v1.3. The `output_filename` parameter has been removed in" + " favor of `dirpath` and `filename`. Support for the old signature will be removed in v1.5", + ) + filepath = Path(output_filename) + self.dirpath = filepath.parent + self.filename = filepath.stem + + self._output_file: Optional[TextIO] = None + self._write_stream: Optional[Callable] = None + self._local_rank: Optional[int] = None + self._log_dir: Optional[str] = None + self._stage: Optional[str] = None + + @contextmanager + def profile(self, action_name: str) -> None: + """ + Yields a context manager to encapsulate the scope of a profiled action. + + Example:: + + with self.profile('load training data'): + # load training data code + + The profiler will start once you've entered the context and will automatically + stop once you exit the code block. 
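+
+        This is roughly equivalent to wrapping the block with :meth:`start` and
+        :meth:`stop` yourself (a minimal sketch)::
+
+            self.start('load training data')
+            try:
+                ...  # load training data code
+            finally:
+                self.stop('load training data')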
+ """ + try: + self.start(action_name) + yield action_name + finally: + self.stop(action_name) + + def profile_iterable(self, iterable, action_name: str) -> None: + iterator = iter(iterable) + while True: + try: + self.start(action_name) + value = next(iterator) + self.stop(action_name) + yield value + except StopIteration: + self.stop(action_name) + break + + def _rank_zero_info(self, *args, **kwargs) -> None: + if self._local_rank in (None, 0): + log.info(*args, **kwargs) + + def _prepare_filename( + self, action_name: Optional[str] = None, extension: str = ".txt", split_token: str = "-" + ) -> str: + args = [] + if self._stage is not None: + args.append(self._stage) + if self.filename: + args.append(self.filename) + if self._local_rank is not None: + args.append(str(self._local_rank)) + if action_name is not None: + args.append(action_name) + filename = split_token.join(args) + extension + return filename + + def _prepare_streams(self) -> None: + if self._write_stream is not None: + return + if self.filename: + filepath = os.path.join(self.dirpath, self._prepare_filename()) + fs = get_filesystem(filepath) + file = fs.open(filepath, "a") + self._output_file = file + self._write_stream = file.write + else: + self._write_stream = self._rank_zero_info + + def describe(self) -> None: + """Logs a profile report after the conclusion of run.""" + # there are pickling issues with open file handles in Python 3.6 + # so to avoid them, we open and close the files within this function + # by calling `_prepare_streams` and `teardown` + self._prepare_streams() + summary = self.summary() + if summary: + self._write_stream(summary) + if self._output_file is not None: + self._output_file.flush() + self.teardown(stage=self._stage) + + def _stats_to_str(self, stats: Dict[str, str]) -> str: + stage = f"{self._stage.upper()} " if self._stage is not None else "" + output = [stage + "Profiler Report"] + for action, value in stats.items(): + header = f"Profile stats for: {action}" + if self._local_rank is not None: + header += f" rank: {self._local_rank}" + output.append(header) + output.append(value) + return os.linesep.join(output) + + def setup( + self, + stage: Optional[str] = None, + local_rank: Optional[int] = None, + log_dir: Optional[str] = None, + ) -> None: + """Execute arbitrary pre-profiling set-up steps.""" + self._stage = stage + self._local_rank = local_rank + self._log_dir = log_dir + self.dirpath = self.dirpath or log_dir + + def teardown(self, stage: Optional[str] = None) -> None: + """ + Execute arbitrary post-profiling tear-down steps. + + Closes the currently open file and stream. + """ + self._write_stream = None + if self._output_file is not None: + self._output_file.close() + self._output_file = None # can't pickle TextIOWrapper + + def __del__(self) -> None: + self.teardown(stage=self._stage) + + def start(self, action_name: str) -> None: + raise NotImplementedError + + def stop(self, action_name: str) -> None: + raise NotImplementedError + + def summary(self) -> str: + raise NotImplementedError + + @property + def local_rank(self) -> int: + return 0 if self._local_rank is None else self._local_rank + + +class PassThroughProfiler(BaseProfiler): + """ + This class should be used when you don't want the (small) overhead of profiling. + The Trainer uses this class by default. 
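+
+    For example, building a ``Trainer`` without a profiler falls back to this
+    class, so every profiling call becomes a no-op (a hedged sketch)::
+
+        from pytorch_lightning import Trainer
+
+        trainer = Trainer()  # roughly Trainer(profiler=PassThroughProfiler())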
+ """ + + def start(self, action_name: str) -> None: + pass + + def stop(self, action_name: str) -> None: + pass + + def summary(self) -> str: + return "" diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py index 78327fa0a91d8..fb29ec5289744 100644 --- a/pytorch_lightning/profiler/profilers.py +++ b/pytorch_lightning/profiler/profilers.py @@ -1,387 +1,22 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Profiler to check if there are any bottlenecks in your code.""" -import cProfile -import io -import logging -import os -import pstats -import time -from abc import ABC, abstractmethod -from collections import defaultdict -from contextlib import contextmanager -from pathlib import Path -from typing import Any, Callable, Dict, Optional, TextIO, Tuple, Union - -import numpy as np - -from pytorch_lightning.utilities import rank_zero_warn -from pytorch_lightning.utilities.cloud_io import get_filesystem - -log = logging.getLogger(__name__) - - -class AbstractProfiler(ABC): - """Specification of a profiler.""" - - @abstractmethod - def start(self, action_name: str) -> None: - """Defines how to start recording an action.""" - - @abstractmethod - def stop(self, action_name: str) -> None: - """Defines how to record the duration once an action is complete.""" - - @abstractmethod - def summary(self) -> str: - """Create profiler summary in text format.""" - - @abstractmethod - def setup(self, **kwargs: Any) -> None: - """Execute arbitrary pre-profiling set-up steps as defined by subclass.""" - - @abstractmethod - def teardown(self, **kwargs: Any) -> None: - """Execute arbitrary post-profiling tear-down steps as defined by subclass.""" - - -class BaseProfiler(AbstractProfiler): - """ - If you wish to write a custom profiler, you should inherit from this class. - """ - - def __init__( - self, - dirpath: Optional[Union[str, Path]] = None, - filename: Optional[str] = None, - output_filename: Optional[str] = None, - ) -> None: - self.dirpath = dirpath - self.filename = filename - if output_filename is not None: - rank_zero_warn( - "`Profiler` signature has changed in v1.3. The `output_filename` parameter has been removed in" - " favor of `dirpath` and `filename`. Support for the old signature will be removed in v1.5", - DeprecationWarning - ) - filepath = Path(output_filename) - self.dirpath = filepath.parent - self.filename = filepath.stem - - self._output_file: Optional[TextIO] = None - self._write_stream: Optional[Callable] = None - self._local_rank: Optional[int] = None - self._log_dir: Optional[str] = None - self._stage: Optional[str] = None - - @contextmanager - def profile(self, action_name: str) -> None: - """ - Yields a context manager to encapsulate the scope of a profiled action. - - Example:: - - with self.profile('load training data'): - # load training data code - - The profiler will start once you've entered the context and will automatically - stop once you exit the code block. 
- """ - try: - self.start(action_name) - yield action_name - finally: - self.stop(action_name) - - def profile_iterable(self, iterable, action_name: str) -> None: - iterator = iter(iterable) - while True: - try: - self.start(action_name) - value = next(iterator) - self.stop(action_name) - yield value - except StopIteration: - self.stop(action_name) - break - - def _rank_zero_info(self, *args, **kwargs) -> None: - if self._local_rank in (None, 0): - log.info(*args, **kwargs) - - def _prepare_filename(self, extension: str = ".txt") -> str: - filename = "" - if self._stage is not None: - filename += f"{self._stage}-" - filename += str(self.filename) - if self._local_rank is not None: - filename += f"-{self._local_rank}" - filename += extension - return filename - - def _prepare_streams(self) -> None: - if self._write_stream is not None: - return - if self.filename: - filepath = os.path.join(self.dirpath, self._prepare_filename()) - fs = get_filesystem(filepath) - file = fs.open(filepath, "a") - self._output_file = file - self._write_stream = file.write - else: - self._write_stream = self._rank_zero_info - - def describe(self) -> None: - """Logs a profile report after the conclusion of run.""" - # there are pickling issues with open file handles in Python 3.6 - # so to avoid them, we open and close the files within this function - # by calling `_prepare_streams` and `teardown` - self._prepare_streams() - summary = self.summary() - if summary: - self._write_stream(summary) - if self._output_file is not None: - self._output_file.flush() - self.teardown(stage=self._stage) - - def _stats_to_str(self, stats: Dict[str, str]) -> str: - stage = f"{self._stage.upper()} " if self._stage is not None else "" - output = [stage + "Profiler Report"] - for action, value in stats.items(): - header = f"Profile stats for: {action}" - if self._local_rank is not None: - header += f" rank: {self._local_rank}" - output.append(header) - output.append(value) - return os.linesep.join(output) - - def setup( - self, - stage: Optional[str] = None, - local_rank: Optional[int] = None, - log_dir: Optional[str] = None, - ) -> None: - """Execute arbitrary pre-profiling set-up steps.""" - self._stage = stage - self._local_rank = local_rank - self._log_dir = log_dir - self.dirpath = self.dirpath or log_dir - - def teardown(self, stage: Optional[str] = None) -> None: - """ - Execute arbitrary post-profiling tear-down steps. - - Closes the currently open file and stream. - """ - self._write_stream = None - if self._output_file is not None: - self._output_file.close() - self._output_file = None # can't pickle TextIOWrapper - - def __del__(self) -> None: - self.teardown(stage=self._stage) - - def start(self, action_name: str) -> None: - raise NotImplementedError - - def stop(self, action_name: str) -> None: - raise NotImplementedError - - def summary(self) -> str: - raise NotImplementedError - - @property - def local_rank(self) -> int: - return 0 if self._local_rank is None else self._local_rank - - -class PassThroughProfiler(BaseProfiler): - """ - This class should be used when you don't want the (small) overhead of profiling. - The Trainer uses this class by default. 
- """ - - def start(self, action_name: str) -> None: - pass - - def stop(self, action_name: str) -> None: - pass - - def summary(self) -> str: - return "" - - -class SimpleProfiler(BaseProfiler): - """ - This profiler simply records the duration of actions (in seconds) and reports - the mean duration of each action and the total time spent over the entire training run. - """ - - def __init__( - self, - dirpath: Optional[Union[str, Path]] = None, - filename: Optional[str] = None, - extended: bool = True, - output_filename: Optional[str] = None, - ) -> None: - """ - Args: - dirpath: Directory path for the ``filename``. If ``dirpath`` is ``None`` but ``filename`` is present, the - ``trainer.log_dir`` (from :class:`~pytorch_lightning.loggers.tensorboard.TensorBoardLogger`) - will be used. - - filename: If present, filename where the profiler results will be saved instead of printing to stdout. - The ``.txt`` extension will be used automatically. - - Raises: - ValueError: - If you attempt to start an action which has already started, or - if you attempt to stop recording an action which was never started. - """ - super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) - self.current_actions: Dict[str, float] = {} - self.recorded_durations = defaultdict(list) - self.extended = extended - self.start_time = time.monotonic() - - def start(self, action_name: str) -> None: - if action_name in self.current_actions: - raise ValueError(f"Attempted to start {action_name} which has already started.") - self.current_actions[action_name] = time.monotonic() - - def stop(self, action_name: str) -> None: - end_time = time.monotonic() - if action_name not in self.current_actions: - raise ValueError(f"Attempting to stop recording an action ({action_name}) which was never started.") - start_time = self.current_actions.pop(action_name) - duration = end_time - start_time - self.recorded_durations[action_name].append(duration) - - def _make_report(self) -> Tuple[list, float]: - total_duration = time.monotonic() - self.start_time - report = [[a, d, 100. 
* np.sum(d) / total_duration] for a, d in self.recorded_durations.items()] - report.sort(key=lambda x: x[2], reverse=True) - return report, total_duration - - def summary(self) -> str: - sep = os.linesep - output_string = "" - if self._stage is not None: - output_string += f"{self._stage.upper()} " - output_string += f"Profiler Report{sep}" - - if self.extended: - - if len(self.recorded_durations) > 0: - max_key = np.max([len(k) for k in self.recorded_durations.keys()]) - - def log_row(action, mean, num_calls, total, per): - row = f"{sep}{action:<{max_key}s}\t| {mean:<15}\t|" - row += f"{num_calls:<15}\t| {total:<15}\t| {per:<15}\t|" - return row - - output_string += log_row("Action", "Mean duration (s)", "Num calls", "Total time (s)", "Percentage %") - output_string_len = len(output_string) - output_string += f"{sep}{'-' * output_string_len}" - report, total_duration = self._make_report() - output_string += log_row("Total", "-", "_", f"{total_duration:.5}", "100 %") - output_string += f"{sep}{'-' * output_string_len}" - for action, durations, duration_per in report: - output_string += log_row( - action, - f"{np.mean(durations):.5}", - f"{len(durations):}", - f"{np.sum(durations):.5}", - f"{duration_per:.5}", - ) - else: - - def log_row(action, mean, total): - return f"{sep}{action:<20s}\t| {mean:<15}\t| {total:<15}" - - output_string += log_row("Action", "Mean duration (s)", "Total time (s)") - output_string += f"{sep}{'-' * 65}" - - for action, durations in self.recorded_durations.items(): - output_string += log_row(action, f"{np.mean(durations):.5}", f"{np.sum(durations):.5}") - output_string += sep - return output_string - - -class AdvancedProfiler(BaseProfiler): - """ - This profiler uses Python's cProfiler to record more detailed information about - time spent in each function call recorded during a given action. The output is quite - verbose and you should only use this if you want very detailed reports. - """ - - def __init__( - self, - dirpath: Optional[Union[str, Path]] = None, - filename: Optional[str] = None, - line_count_restriction: float = 1.0, - output_filename: Optional[str] = None, - ) -> None: - """ - Args: - dirpath: Directory path for the ``filename``. If ``dirpath`` is ``None`` but ``filename`` is present, the - ``trainer.log_dir`` (from :class:`~pytorch_lightning.loggers.tensorboard.TensorBoardLogger`) - will be used. - - filename: If present, filename where the profiler results will be saved instead of printing to stdout. - The ``.txt`` extension will be used automatically. - - line_count_restriction: this can be used to limit the number of functions - reported for each action. either an integer (to select a count of lines), - or a decimal fraction between 0.0 and 1.0 inclusive (to select a percentage of lines) - - Raises: - ValueError: - If you attempt to stop recording an action which was never started. 
- """ - super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) - self.profiled_actions: Dict[str, cProfile.Profile] = {} - self.line_count_restriction = line_count_restriction - - def start(self, action_name: str) -> None: - if action_name not in self.profiled_actions: - self.profiled_actions[action_name] = cProfile.Profile() - self.profiled_actions[action_name].enable() - - def stop(self, action_name: str) -> None: - pr = self.profiled_actions.get(action_name) - if pr is None: - raise ValueError(f"Attempting to stop recording an action ({action_name}) which was never started.") - pr.disable() - - def summary(self) -> str: - recorded_stats = {} - for action_name, pr in self.profiled_actions.items(): - s = io.StringIO() - ps = pstats.Stats(pr, stream=s).strip_dirs().sort_stats('cumulative') - ps.print_stats(self.line_count_restriction) - recorded_stats[action_name] = s.getvalue() - return self._stats_to_str(recorded_stats) - - def teardown(self, stage: Optional[str] = None) -> None: - super().teardown(stage=stage) - self.profiled_actions = {} - - def __reduce__(self): - # avoids `TypeError: cannot pickle 'cProfile.Profile' object` - return ( - self.__class__, - tuple(), - dict(dirpath=self.dirpath, filename=self.filename, line_count_restriction=self.line_count_restriction), - ) +from pytorch_lightning.utilities import rank_zero_deprecation + +rank_zero_deprecation( + "Using ``import pytorch_lightning.profiler.profilers`` is deprecated in v1.4, and will be removed in v1.6. " + "HINT: Use ``import pytorch_lightning.profiler`` directly." +) + +from pytorch_lightning.profiler.advanced import AdvancedProfiler # noqa E402 +from pytorch_lightning.profiler.base import AbstractProfiler, BaseProfiler, PassThroughProfiler # noqa E402 +from pytorch_lightning.profiler.pytorch import PyTorchProfiler # noqa E402 +from pytorch_lightning.profiler.simple import SimpleProfiler # noqa E402 +from pytorch_lightning.profiler.xla import XLAProfiler # noqa E402 + +__all__ = [ + 'AbstractProfiler', + 'BaseProfiler', + 'AdvancedProfiler', + 'PassThroughProfiler', + 'PyTorchProfiler', + 'SimpleProfiler', + 'XLAProfiler', +] diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index fa2c2917f98a2..6e8e21456e915 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -23,8 +23,8 @@ from torch import nn, Tensor from torch.autograd.profiler import record_function -from pytorch_lightning.profiler.profilers import BaseProfiler -from pytorch_lightning.utilities.distributed import rank_zero_warn +from pytorch_lightning.profiler.base import BaseProfiler +from pytorch_lightning.utilities import rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE @@ -90,7 +90,7 @@ def __enter__(self) -> None: self._handles[module_name] = [pre_forward_handle, post_forward_handle] - def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + def __exit__(self, type: Any, value: Any, traceback: Any) -> None: for handles in self._handles.values(): for h in handles: h.remove() @@ -132,14 +132,13 @@ def reset(self): def num_step(self) -> int: if self._current_action == "training_step_and_backward": return self._num_training_step_and_backward - elif self._current_action == "validation_step": + if self._current_action == "validation_step": return self._num_validation_step - elif self._current_action 
== "test_step": + if self._current_action == "test_step": return self._num_test_step - elif self._current_action == "predict_step": + if self._current_action == "predict_step": return self._num_predict_step - else: - return 0 + return 0 def _step(self) -> None: if self._current_action == "training_step_and_backward": @@ -159,11 +158,11 @@ def _step(self) -> None: def has_finished(self) -> bool: if self._current_action == "training_step_and_backward": return self._training_step_and_backward_reached_end - elif self._current_action == "validation_step": + if self._current_action == "validation_step": return self._validation_step_reached_end - elif self._current_action == "test_step": + if self._current_action == "test_step": return self._test_step_reached_end - elif self._current_action == "predict_step": + if self._current_action == "predict_step": return self._predict_step_reached_end return False @@ -349,9 +348,9 @@ def __deprecation_check( record_functions = set() if profiled_functions is not None: - rank_zero_warn( + rank_zero_deprecation( "`PyTorchProfiler.profiled_functions` has been renamed to" - " `record_functions` in v1.3 and will be removed in v1.5", DeprecationWarning + " `record_functions` in v1.3 and will be removed in v1.5" ) if not record_functions: record_functions |= set(profiled_functions) @@ -427,11 +426,15 @@ def stop(self, action_name: str) -> None: def on_trace_ready(profiler): if self.dirpath is not None: if self._export_to_chrome: - handler = tensorboard_trace_handler(self.dirpath, self._prepare_filename(extension="")) + handler = tensorboard_trace_handler( + self.dirpath, self._prepare_filename(action_name=action_name, extension="") + ) handler(profiler) if self._export_to_flame_graph: - path = os.path.join(self.dirpath, self._prepare_filename(extension=".stack")) + path = os.path.join( + self.dirpath, self._prepare_filename(action_name=action_name, extension=".stack") + ) profiler.export_stacks(path, metric=self._metric) else: rank_zero_warn("The PyTorchProfiler failed to export trace as `dirpath` is None") diff --git a/pytorch_lightning/profiler/simple.py b/pytorch_lightning/profiler/simple.py new file mode 100644 index 0000000000000..7fb8ac5be0c92 --- /dev/null +++ b/pytorch_lightning/profiler/simple.py @@ -0,0 +1,123 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Profiler to check if there are any bottlenecks in your code.""" +import logging +import os +import time +from collections import defaultdict +from pathlib import Path +from typing import Dict, Optional, Tuple, Union + +import numpy as np + +from pytorch_lightning.profiler.base import BaseProfiler + +log = logging.getLogger(__name__) + + +class SimpleProfiler(BaseProfiler): + """ + This profiler simply records the duration of actions (in seconds) and reports + the mean duration of each action and the total time spent over the entire training run. 
+ """ + + def __init__( + self, + dirpath: Optional[Union[str, Path]] = None, + filename: Optional[str] = None, + extended: bool = True, + output_filename: Optional[str] = None, + ) -> None: + """ + Args: + dirpath: Directory path for the ``filename``. If ``dirpath`` is ``None`` but ``filename`` is present, the + ``trainer.log_dir`` (from :class:`~pytorch_lightning.loggers.tensorboard.TensorBoardLogger`) + will be used. + + filename: If present, filename where the profiler results will be saved instead of printing to stdout. + The ``.txt`` extension will be used automatically. + + Raises: + ValueError: + If you attempt to start an action which has already started, or + if you attempt to stop recording an action which was never started. + """ + super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) + self.current_actions: Dict[str, float] = {} + self.recorded_durations = defaultdict(list) + self.extended = extended + self.start_time = time.monotonic() + + def start(self, action_name: str) -> None: + if action_name in self.current_actions: + raise ValueError(f"Attempted to start {action_name} which has already started.") + self.current_actions[action_name] = time.monotonic() + + def stop(self, action_name: str) -> None: + end_time = time.monotonic() + if action_name not in self.current_actions: + raise ValueError(f"Attempting to stop recording an action ({action_name}) which was never started.") + start_time = self.current_actions.pop(action_name) + duration = end_time - start_time + self.recorded_durations[action_name].append(duration) + + def _make_report(self) -> Tuple[list, float]: + total_duration = time.monotonic() - self.start_time + report = [[a, d, 100. * np.sum(d) / total_duration] for a, d in self.recorded_durations.items()] + report.sort(key=lambda x: x[2], reverse=True) + return report, total_duration + + def summary(self) -> str: + sep = os.linesep + output_string = "" + if self._stage is not None: + output_string += f"{self._stage.upper()} " + output_string += f"Profiler Report{sep}" + + if self.extended: + + if len(self.recorded_durations) > 0: + max_key = np.max([len(k) for k in self.recorded_durations.keys()]) + + def log_row(action, mean, num_calls, total, per): + row = f"{sep}{action:<{max_key}s}\t| {mean:<15}\t|" + row += f"{num_calls:<15}\t| {total:<15}\t| {per:<15}\t|" + return row + + output_string += log_row("Action", "Mean duration (s)", "Num calls", "Total time (s)", "Percentage %") + output_string_len = len(output_string) + output_string += f"{sep}{'-' * output_string_len}" + report, total_duration = self._make_report() + output_string += log_row("Total", "-", "_", f"{total_duration:.5}", "100 %") + output_string += f"{sep}{'-' * output_string_len}" + for action, durations, duration_per in report: + output_string += log_row( + action, + f"{np.mean(durations):.5}", + f"{len(durations):}", + f"{np.sum(durations):.5}", + f"{duration_per:.5}", + ) + else: + + def log_row(action, mean, total): + return f"{sep}{action:<20s}\t| {mean:<15}\t| {total:<15}" + + output_string += log_row("Action", "Mean duration (s)", "Total time (s)") + output_string += f"{sep}{'-' * 65}" + + for action, durations in self.recorded_durations.items(): + output_string += log_row(action, f"{np.mean(durations):.5}", f"{np.sum(durations):.5}") + output_string += sep + return output_string diff --git a/pytorch_lightning/profiler/xla.py b/pytorch_lightning/profiler/xla.py new file mode 100644 index 0000000000000..35b8e7f264c31 --- /dev/null +++ 
b/pytorch_lightning/profiler/xla.py
@@ -0,0 +1,110 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The XLA Profiler will help you debug and optimize training workload performance
+for your models using Cloud TPU performance tools.
+
+Manual capture via TensorBoard
+
+The following instructions are for capturing a trace from a running program:
+
+0. This [guide](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm#tpu-vm) will
+help you with the Cloud TPU setup and the required installations.
+
+1. Start a TensorBoard server:
+
+>> tensorboard --logdir ./tensorboard --port 9001
+
+You can view the TensorBoard output at http://localhost:9001 on your local machine, and then open the
+``PROFILE`` plugin from the top-right dropdown or open http://localhost:9001/#profile
+
+2. Once the code you'd like to profile is running, click the ``CAPTURE PROFILE`` button. Enter
+``localhost:9012`` (the default port for the XLA Profiler) as the Profile Service URL. Then enter
+the number of milliseconds for the profiling duration and click ``CAPTURE``.
+
+3. Make sure the code is still running while you are capturing the trace. You will also get better
+performance insights if the profiling duration is longer than the step time.
+
+4. Once the capture is finished, the page will refresh and you can browse through the insights using the
+``Tools`` dropdown at the top left.
+
+"""
+import logging
+from typing import Dict
+
+from pytorch_lightning.profiler.base import BaseProfiler
+from pytorch_lightning.utilities import _TPU_AVAILABLE
+
+if _TPU_AVAILABLE:
+    import torch_xla.debug.profiler as xp
+
+log = logging.getLogger(__name__)
+
+
+class XLAProfiler(BaseProfiler):
+
+    STEP_FUNCTIONS = {
+        "training_step_and_backward",
+        "validation_step",
+        "test_step",
+        "predict_step",
+    }
+    RECORD_FUNCTIONS = {
+        "training_step_and_backward",
+        "training_step",
+        "backward",
+        "validation_step",
+        "test_step",
+        "predict_step",
+    }
+
+    def __init__(self, port: int = 9012) -> None:
+        """
+        This Profiler will help you debug and optimize training workload performance
+        for your models using Cloud TPU performance tools.
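+
+        A minimal usage sketch (assumes a TPU environment is available; the
+        ``tpu_cores`` value is illustrative only)::
+
+            from pytorch_lightning import Trainer
+            from pytorch_lightning.profiler import XLAProfiler
+
+            trainer = Trainer(profiler=XLAProfiler(port=9012), tpu_cores=8)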
+        """
+        super().__init__(dirpath=None, filename=None, output_filename=None)
+        self.port = port
+        self._recording_map: Dict = {}
+        self._step_recording_map: Dict = {}
+        self._start_trace: bool = False
+
+    def start(self, action_name: str) -> None:
+        if action_name in self.RECORD_FUNCTIONS:
+            if not self._start_trace:
+                self.server = xp.start_server(self.port)
+                self._start_trace = True
+
+            if action_name in self.STEP_FUNCTIONS:
+                step = self._get_step_num(action_name)
+                recording = xp.StepTrace(action_name, step_num=step)
+            else:
+                recording = xp.Trace(action_name)
+            recording.__enter__()
+            self._recording_map[action_name] = recording
+
+    def stop(self, action_name: str) -> None:
+        if action_name in self._recording_map:
+            self._recording_map[action_name].__exit__(None, None, None)
+            del self._recording_map[action_name]
+
+    def _get_step_num(self, action_name: str) -> int:
+        if action_name not in self._step_recording_map:
+            self._step_recording_map[action_name] = 1
+        else:
+            self._step_recording_map[action_name] += 1
+        return self._step_recording_map[action_name]
+
+    def summary(self) -> str:
+        return ""
diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py
index 23df26b410a03..4f4e44e57d3a3 100644
--- a/pytorch_lightning/trainer/callback_hook.py
+++ b/pytorch_lightning/trainer/callback_hook.py
@@ -17,8 +17,8 @@
 from inspect import signature
 from typing import Any, Callable, Dict, List, Optional, Type
 
+import pytorch_lightning as pl
 from pytorch_lightning.callbacks import Callback
-from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.utilities import rank_zero_deprecation, rank_zero_warn
 from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature
 from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT
@@ -32,27 +32,27 @@ class TrainerCallbackHookMixin(ABC):
     # this is just a summary on variables used in this abstract class,
     # the proper values/initialisation should be done in child class
     callbacks: List[Callback] = []
-    lightning_module: LightningModule
+    lightning_module: 'pl.LightningModule'
 
-    def on_before_accelerator_backend_setup(self, model: LightningModule) -> None:
+    def on_before_accelerator_backend_setup(self, model: 'pl.LightningModule') -> None:
         """Called at the beginning of fit (train + validate), validate, test, or predict, or tune."""
         for callback in self.callbacks:
             callback.on_before_accelerator_backend_setup(self, model)
 
-    def configure_sharded_model(self, model: LightningModule) -> None:
+    def configure_sharded_model(self, model: 'pl.LightningModule') -> None:
         """Called at the beginning of fit (train + validate), validate, test, or predict, or tune."""
         for callback in self.callbacks:
             callback.on_configure_sharded_model(self, model)
 
-    def setup(self, model: LightningModule, stage: Optional[str]) -> None:
+    def setup(self, model: 'pl.LightningModule', stage: Optional[str]) -> None:
         """Called at the beginning of fit (train + validate), validate, test, or predict, or tune."""
         for callback in self.callbacks:
-            callback.setup(self, model, stage)
+            callback.setup(self, model, stage=stage)
 
     def teardown(self, stage: Optional[str] = None) -> None:
         """Called at the end of fit (train + validate), validate, test, or predict, or tune."""
         for callback in self.callbacks:
-            callback.teardown(self, self.lightning_module, stage)
+            callback.teardown(self, self.lightning_module, stage=stage)
 
     def on_init_start(self):
         """Called when the trainer initialization begins, model has not yet been set."""
@@ -97,10 +97,10 @@ def on_train_epoch_end(self, outputs: EPOCH_OUTPUT):
         """
         for callback in self.callbacks:
             if is_param_in_hook_signature(callback.on_train_epoch_end, "outputs"):
-                warning_cache.warn(
+                warning_cache.deprecation(
                     "The signature of `Callback.on_train_epoch_end` has changed in v1.3."
                     " `outputs` parameter has been removed."
-                    " Support for the old signature will be removed in v1.5", DeprecationWarning
+                    " Support for the old signature will be removed in v1.5"
                 )
                 callback.on_train_epoch_end(self, self.lightning_module, outputs)
             else:
@@ -254,7 +254,7 @@ def on_keyboard_interrupt(self):
 
     @staticmethod
     def __is_old_signature_on_save_checkpoint(fn: Callable) -> bool:
         parameters = list(signature(fn).parameters)
-        return len(parameters) == 2 and parameters[1] != "args"
+        return len(parameters) == 2 and parameters[0] != "args"
 
     @staticmethod
     def __is_old_signature_on_load_checkpoint(fn: Callable) -> bool:
diff --git a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py
index e73bee761a241..8caeebb9ed3dd 100644
--- a/pytorch_lightning/trainer/configuration_validator.py
+++ b/pytorch_lightning/trainer/configuration_validator.py
@@ -34,6 +34,7 @@ def verify_loop_configurations(self, model: 'pl.LightningModule') -> None:
         if self.trainer.state.fn in (TrainerFn.FITTING, TrainerFn.TUNING):
             self.__verify_train_loop_configuration(model)
             self.__verify_eval_loop_configuration(model, 'val')
+            self.__verify_manual_optimization_support(model)
         elif self.trainer.state.fn == TrainerFn.VALIDATING:
             self.__verify_eval_loop_configuration(model, 'val')
         elif self.trainer.state.fn == TrainerFn.TESTING:
@@ -82,10 +83,10 @@ def __verify_train_loop_configuration(self, model: 'pl.LightningModule') -> None
 
         has_overriden_optimization_functions = trainer.overriden_optimizer_step or trainer.overriden_optimizer_zero_grad
         if has_overriden_optimization_functions and going_to_accumulate_grad_batches and automatic_optimization:
-            raise MisconfigurationException(
-                'When overriding `LightningModule` optimizer_step or optimizer_zero_grad,'
-                ' `accumulate_grad_batches` in `Trainer` should be 1.'
-                ' It ensures optimizer_step or optimizer_zero_grad are called on every batch.'
+            rank_zero_warn(
+                'When using `Trainer(accumulate_grad_batches != 1)` and overriding'
+                ' `LightningModule.optimizer_{step,zero_grad}`, the hooks will not be called on every batch'
+                ' (rather, they are called on every optimization step).'
             )
 
     def __verify_eval_loop_configuration(self, model: 'pl.LightningModule', stage: str) -> None:
@@ -112,3 +113,19 @@ def __verify_dp_batch_transfer_support(self, model: 'pl.LightningModule') -> Non
         for hook in batch_transfer_hooks:
             if self.trainer.accelerator_connector.use_dp and is_overridden(hook, model):
                 raise MisconfigurationException(f'Overriding `{hook}` is not supported in DP mode.')
+
+    def __verify_manual_optimization_support(self, model: 'pl.LightningModule') -> None:
+        if model.automatic_optimization:
+            return
+        if self.trainer.gradient_clip_val > 0:
+            raise MisconfigurationException(
+                f"Automatic gradient clipping is not supported for manual optimization."
+                f" Remove `Trainer(gradient_clip_val={self.trainer.gradient_clip_val})`"
+                f" or switch to automatic optimization."
+            )
+        if self.trainer.accumulate_grad_batches != 1:
+            raise MisconfigurationException(
+                f"Automatic gradient accumulation is not supported for manual optimization."
+ f" Remove `Trainer(accumulate_grad_batches={self.trainer.accumulate_grad_batches})`" + f" or switch to automatic optimization." + ) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 4d692ec517d19..f283c38d4dd7b 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -15,12 +15,14 @@ import logging import os from typing import List, Optional, Sequence, Union +from weakref import proxy import torch from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator from pytorch_lightning.plugins import ( ApexMixedPrecisionPlugin, @@ -36,6 +38,8 @@ DoublePrecisionPlugin, FullyShardedNativeMixedPrecisionPlugin, HorovodPlugin, + IPUPlugin, + IPUPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin, ShardedNativeMixedPrecisionPlugin, @@ -58,13 +62,14 @@ _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE, - _TPU_AVAILABLE, AMPType, device_parser, DeviceType, DistributedType, + rank_zero_deprecation, + rank_zero_info, + rank_zero_warn, ) -from pytorch_lightning.utilities.distributed import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException if _HOROVOD_AVAILABLE: @@ -79,6 +84,7 @@ def __init__( self, num_processes, tpu_cores, + ipus, distributed_backend, auto_select_gpus, gpus, @@ -98,6 +104,7 @@ def __init__( self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + self.ipus = ipus self.distributed_backend = distributed_backend self.auto_select_gpus = auto_select_gpus self.gpus = gpus @@ -238,6 +245,8 @@ def training_type_plugin(self) -> TrainingTypePlugin: @property def cluster_environment(self) -> ClusterEnvironment: + if self._cluster_environment is None: + self._cluster_environment = self.select_cluster_environment() return self._cluster_environment @property @@ -248,6 +257,10 @@ def on_cpu(self) -> bool: def on_tpu(self) -> bool: return self.tpu_cores is not None + @property + def on_ipu(self) -> bool: + return self.ipus is not None + @property def tpu_id(self) -> Optional[int]: if self.on_tpu and isinstance(self.tpu_cores, list): @@ -323,13 +336,18 @@ def parallel_devices(self) -> List[Union[torch.device, int]]: # https://github.com/PyTorchLightning/pytorch-lightning/issues/3169 if isinstance(self.tpu_cores, int): devices = list(range(self.tpu_cores)) + elif self.on_ipu: + if isinstance(self.ipus, int): + devices = list(range(self.ipus)) else: devices = [torch.device("cpu")] * self.num_processes return devices @property def root_gpu(self) -> Optional[int]: - return self.accelerator.root_device.index if not isinstance(self.accelerator, TPUAccelerator) else None + return self.accelerator.root_device.index if not isinstance( + self.accelerator, (IPUAccelerator, TPUAccelerator) + ) else None @property def is_training_type_in_plugins(self) -> bool: @@ -353,14 +371,17 @@ def select_precision_plugin(self) -> PrecisionPlugin: # set precision type self.amp_type = AMPType.from_str(self.amp_type) + if self.on_ipu: + return IPUPrecisionPlugin(self.precision) + if self._distrib_type == DistributedType.DEEPSPEED or 
isinstance(self._training_type_plugin, DeepSpeedPlugin): return DeepSpeedPrecisionPlugin(self.precision) if self.precision == 32: return PrecisionPlugin() - elif self.precision == 64: + if self.precision == 64: return DoublePrecisionPlugin() - elif self.precision == 16: + if self.precision == 16: if self.on_tpu: return TPUHalfPrecisionPlugin() @@ -403,7 +424,11 @@ def select_precision_plugin(self) -> PrecisionPlugin: raise NotImplementedError("We only support precisions 64, 32 and 16!") def select_training_type_plugin(self) -> TrainingTypePlugin: - if self.use_ddp2: + if isinstance( + self.distributed_backend, Accelerator + ) and self.distributed_backend.training_type_plugin is not None: + plugin = self.distributed_backend.training_type_plugin + elif self.use_ddp2: plugin = DDP2Plugin( parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment, @@ -459,6 +484,8 @@ def select_training_type_plugin(self) -> TrainingTypePlugin: plugin = HorovodPlugin(parallel_devices=self.parallel_devices) elif self.on_tpu and isinstance(self.tpu_cores, list): plugin = SingleTPUPlugin(self.tpu_id) + elif self.on_ipu: + plugin = IPUPlugin(parallel_devices=self.parallel_devices) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu")) @@ -472,7 +499,9 @@ def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> Tra training_type.num_processes = len(self.parallel_devices) if hasattr(training_type, 'cluster_environment') and getattr(training_type, 'cluster_environment') is None: - training_type.cluster_environment = self.select_cluster_environment() + # transfer ownership of the cluster environment to the training type + training_type.cluster_environment = self.cluster_environment + self._cluster_environment = proxy(self.cluster_environment) if hasattr(training_type, 'num_nodes'): # set num_nodes for training_type from trainer setting @@ -499,14 +528,21 @@ def select_accelerator(self) -> Accelerator: acc_cls = GPUAccelerator elif self.on_tpu: acc_cls = TPUAccelerator + elif self.on_ipu: + acc_cls = IPUAccelerator else: acc_cls = CPUAccelerator # as precision_plugin is dependent on training_type_plugin, make sure # that we first select training_type_plugin, then precision_plugin - return acc_cls( + accelerator = acc_cls( training_type_plugin=self.training_type_plugin, precision_plugin=self.precision_plugin, ) + # transfer ownership of the plugins to the accelerator + self._training_type_plugin = proxy(self.training_type_plugin) + self._precision_plugin = proxy(self.precision_plugin) + + return accelerator def select_cluster_environment(self) -> ClusterEnvironment: if self._cluster_environment is not None: @@ -562,6 +598,8 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): self._device_type = DeviceType.TPU if isinstance(self.tpu_cores, int): self._distrib_type = DistributedType.TPU_SPAWN + elif self.distributed_backend == 'ipu': + self._device_type = DeviceType.IPU elif self.distributed_backend and self._distrib_type is None: self._distrib_type = DistributedType(self.distributed_backend) @@ -608,16 +646,6 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): 'Please set accelerator=ddp or accelerator=ddp2.' 
) - rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}') - num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') - - if torch.cuda.is_available() and self._device_type != DeviceType.GPU: - rank_zero_warn( - "GPU available but not used. Set the gpus flag in your trainer" - " `Trainer(gpus=1)` or script `--gpus=1`." - ) - def _set_horovod_backend(self): self.check_horovod() self._distrib_type = DistributedType.HOROVOD diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py index 98d0c292f92d0..2b14a229ce4f6 100644 --- a/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/pytorch_lightning/trainer/connectors/callback_connector.py @@ -13,12 +13,11 @@ # limitations under the License. import os from datetime import timedelta -from pathlib import Path from typing import Dict, List, Optional, Union +import pytorch_lightning as pl from pytorch_lightning.callbacks import Callback, ModelCheckpoint, ProgressBar, ProgressBarBase from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -36,12 +35,9 @@ def on_trainer_init( process_position: int, default_root_dir: Optional[str], weights_save_path: Optional[str], - resume_from_checkpoint: Optional[Union[Path, str]], stochastic_weight_avg: bool, max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None, ): - self.trainer.resume_from_checkpoint = resume_from_checkpoint - # init folder paths for checkpoint + weights save callbacks self.trainer._default_root_dir = default_root_dir or os.getcwd() self.trainer._weights_save_path = weights_save_path or self.trainer._default_root_dir @@ -141,7 +137,7 @@ def attach_model_logging_functions(self, model): callback.log_dict = model.log_dict @staticmethod - def _attach_model_callbacks(model: LightningModule, trainer) -> None: + def _attach_model_callbacks(model: 'pl.LightningModule', trainer) -> None: """ Attaches the callbacks defined in the model. 
If a callback returned by the model's configure_callbacks method has the same type as one or several
@@ -157,8 +153,8 @@ def _attach_model_callbacks(model: LightningModule, trainer) -> None:
         model_callbacks = model.configure_callbacks()
         if not model_callbacks:
             return
-        model_callback_types = set(type(c) for c in model_callbacks)
-        trainer_callback_types = set(type(c) for c in trainer.callbacks)
+        model_callback_types = {type(c) for c in model_callbacks}
+        trainer_callback_types = {type(c) for c in trainer.callbacks}
         override_types = model_callback_types.intersection(trainer_callback_types)
         if override_types:
             rank_zero_info(
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index 1181c4f3efd1e..ab74c3bccfc8d 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -19,147 +19,183 @@
 
 import torch
 
-import pytorch_lightning
-from pytorch_lightning.core.lightning import LightningModule
-from pytorch_lightning.utilities import (
-    _APEX_AVAILABLE,
-    _OMEGACONF_AVAILABLE,
-    AMPType,
-    DeviceType,
-    rank_zero_info,
-    rank_zero_warn,
-)
+import pytorch_lightning as pl
+from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
-from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS
 
-if _APEX_AVAILABLE:
-    from apex import amp
-
 if _OMEGACONF_AVAILABLE:
     from omegaconf import Container
 
 
 class CheckpointConnector:
 
-    def __init__(self, trainer):
+    def __init__(self, trainer, resume_from_checkpoint: Optional[Union[str, Path]] = None):
         self.trainer = trainer
+        self.resume_checkpoint_path = resume_from_checkpoint
+        self._loaded_checkpoint = dict()
 
-        # used to validate checkpointing logic
-        self.has_trained = False
+    @property
+    def hpc_resume_path(self) -> Optional[str]:
+        dir_path_hpc = str(self.trainer.weights_save_path)
+        max_version = self.max_ckpt_version_in_folder(dir_path_hpc, "hpc_ckpt_")
+        if max_version is not None:
+            return os.path.join(dir_path_hpc, f"hpc_ckpt_{max_version}.ckpt")
 
-    def restore_weights(self) -> None:
+    def resume_start(self) -> None:
         """
-        Attempt to restore a checkpoint (e.g. weights) in this priority:
-        1. from HPC weights
-        2. from `resume_from_checkpoint` file
+        Attempts to pre-load the checkpoint file into memory, with the source path determined in this priority:
+
+        1. from HPC weights if found
+        2. from `resume_from_checkpoint` file if provided
         3. don't restore
+
+        Raises:
+            FileNotFoundError: If the path to the checkpoint file is provided but the file does not exist.
         """
+        self.resume_checkpoint_path = self.hpc_resume_path or self.resume_checkpoint_path
+        checkpoint_path = self.resume_checkpoint_path
+        if not checkpoint_path:
+            return
+
         # clear cache before restore
-        if self.trainer._device_type == DeviceType.GPU:
-            torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
 
-        # 1. Attempt to restore states from HPC checkpoint
-        dir_path_hpc = str(self.trainer.weights_save_path)
-        max_suffix = self.max_ckpt_in_folder(dir_path_hpc, "hpc_ckpt_")
-        if max_suffix is not None:
-            checkpoint_path = f'{dir_path_hpc}/hpc_ckpt_{max_suffix}.ckpt'
-            self.hpc_load(checkpoint_path, self.trainer._device_type == DeviceType.GPU)
-            rank_zero_info(f'restored hpc model from: {checkpoint_path}')
+        # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint.
+        fs = get_filesystem(checkpoint_path)
+        if not fs.exists(checkpoint_path):
+            raise FileNotFoundError(f"Checkpoint at {checkpoint_path} not found. Aborting training.")
 
-        # 2. Attempt to restore states from `resume_from_checkpoint` file
-        elif self.trainer.resume_from_checkpoint is not None:
-            self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer._device_type == DeviceType.GPU)
+        rank_zero_info(f"Restoring states from the checkpoint file at {checkpoint_path}")
+        self._loaded_checkpoint = self.trainer.training_type_plugin.load_checkpoint_file(checkpoint_path)
 
-        # wait for all to catch up
-        self.trainer.training_type_plugin.barrier('TrainerIOMixin.restore_weights')
+    def resume_end(self) -> None:
+        """ Signal the connector that all states have resumed and memory for the checkpoint object can be released. """
+        if self.resume_checkpoint_path:
+            rank_zero_info(f"Restored all states from the checkpoint file at {self.resume_checkpoint_path}")
+        self.resume_checkpoint_path = None
+        self._loaded_checkpoint = dict()
 
         # clear cache after restore
-        if self.trainer._device_type == DeviceType.GPU:
-            torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
+
+        # wait for all to catch up
+        self.trainer.training_type_plugin.barrier("CheckpointConnector.resume_end")
 
-    def restore(self, checkpoint_path: str, on_gpu: bool) -> bool:
+    def restore(self, checkpoint_path: Optional[Union[Path, str]] = None) -> None:
         """
-        Load model/training states from a 'PyTorch-Lightning checkpoint' file through file-read and state-restore.
+        Attempt to restore everything at once from a 'PyTorch-Lightning checkpoint' file
+        through file-read and state-restore, in this priority:
+
+        1. from HPC weights if found
+        2. from `resume_from_checkpoint` file if provided
+        3. don't restore
+
         All restored states are listed in the return value description of `dump_checkpoint`.
-        """
-        # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint.
-        fs = get_filesystem(checkpoint_path)
-        if not fs.exists(checkpoint_path):
-            raise FileNotFoundError(f"Checkpoint at {checkpoint_path} not found. Aborting training.")
 
-        checkpoint, load_optimizer_states = self.trainer.training_type_plugin.restore_model_state_from_ckpt_path(
-            checkpoint_path, map_location=lambda storage, loc: storage
-        )
+        Args:
+            checkpoint_path: Path to a PyTorch Lightning checkpoint file.
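+
+        Example (an illustrative sketch; the path below is a placeholder for a real checkpoint)::
+
+            # reloads model weights, datamodule/callback state, and training progress in one call
+            trainer.checkpoint_connector.restore("path/to/checkpoint.ckpt")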
+ """ + self.resume_checkpoint_path = checkpoint_path + self.resume_start() - model = self.trainer.lightning_module + # restore module states + self.restore_datamodule() + self.restore_model() - if on_gpu: - model.cuda(self.trainer.root_gpu) + # restore callback states + self.restore_callbacks() # restore training state - self.restore_training_state(checkpoint, load_optimizer_states) + self.restore_training_state() + self.resume_end() - rank_zero_info(f"Restored states from the checkpoint file at {checkpoint_path}") - return True + def restore_datamodule(self) -> None: + """ Calls hooks on the datamodule to give it a chance to restore its state from the checkpoint. """ + if not self._loaded_checkpoint: + return + + datamodule = self.trainer.datamodule + if datamodule is not None: + datamodule.on_load_checkpoint(self._loaded_checkpoint) - def restore_model_state(self, model: LightningModule, checkpoint) -> None: + def restore_model(self) -> None: """ - Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object + Restores a model's weights from a PyTorch Lightning checkpoint. Hooks are called first go give + the LightningModule a chance to modify the contents, then finally the model gets updated with + the loaded weights. """ + if not self._loaded_checkpoint: + return - # restore datamodule states - if self.trainer.datamodule is not None: - self.trainer.datamodule.on_load_checkpoint(checkpoint) + model = self.trainer.lightning_module # hook: give user access to checkpoint if needed. - model.on_load_checkpoint(checkpoint) + model.on_load_checkpoint(self._loaded_checkpoint) + + # call hpc specific hook + if self.hpc_resume_path is not None: + model.on_hpc_load(self._loaded_checkpoint) # restore model state_dict - model.load_state_dict(checkpoint['state_dict']) + self.trainer.training_type_plugin.load_model_state_dict(self._loaded_checkpoint) + + def restore_model_weights(self, checkpoint_path: Optional[Union[str, Path]]) -> None: + """ Restore only the model weights. """ + checkpoint = self._loaded_checkpoint + if checkpoint_path is not None: + checkpoint = self.trainer.training_type_plugin.load_checkpoint_file(checkpoint_path) + + self.trainer.lightning_module.on_load_checkpoint(checkpoint) + self.trainer.training_type_plugin.load_model_state_dict(checkpoint) - def restore_training_state(self, checkpoint, load_optimizer_states: bool = True): + def restore_training_state(self) -> None: """ - Restore trainer state. - Model will get its change to update - :param checkpoint: - :return: + Restore the trainer state from the pre-loaded checkpoint. This includes the precision settings, loop progress, + optimizer states and learning rate scheduler states. """ - # validation - if load_optimizer_states and ('optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint): - raise KeyError( - 'Trying to restore training state but checkpoint contains only the model.' - ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.' - ) + if not self._loaded_checkpoint: + return + + # restore precision plugin (scaler etc.) + self.trainer.precision_plugin.on_load_checkpoint(self._loaded_checkpoint) + # restore progress (loops etc.) + self.restore_progress() - if any([key in checkpoint for key in DEPRECATED_CHECKPOINT_KEYS]): + self.restore_optimizers_and_schedulers() + + def restore_callbacks(self) -> None: + """ Restores all callbacks from the pre-loaded checkpoint. 
""" + if not self._loaded_checkpoint: + return + + if any(key in self._loaded_checkpoint for key in DEPRECATED_CHECKPOINT_KEYS): raise ValueError( "The checkpoint you're attempting to load follows an" " outdated schema. You can upgrade to the current schema by running" " `python -m pytorch_lightning.utilities.upgrade_checkpoint --file model.ckpt`" " where `model.ckpt` is your checkpoint file." ) + self.trainer.on_load_checkpoint(self._loaded_checkpoint) - # restore amp scaling - if self.trainer.amp_backend == AMPType.NATIVE and 'native_amp_scaling_state' in checkpoint: - self.trainer.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) - elif self.trainer.amp_backend == AMPType.APEX and 'amp_scaling_state' in checkpoint: - amp.load_state_dict(checkpoint['amp_scaling_state']) - - # restore callback states - self.trainer.on_load_checkpoint(checkpoint) + def restore_progress(self) -> None: + """ + Restores the training progress from the pre-loaded checkpoint. This currently includes only the global step + and current epoch. + """ + if not self._loaded_checkpoint: + return - self.trainer.train_loop.global_step = checkpoint['global_step'] - self.trainer.train_loop.current_epoch = checkpoint['epoch'] + self.trainer.fit_loop.global_step = self._loaded_checkpoint['global_step'] + self.trainer.fit_loop.current_epoch = self._loaded_checkpoint['epoch'] # crash if max_epochs is lower then the current epoch from the checkpoint if self.trainer.max_epochs is not None and self.trainer.current_epoch > self.trainer.max_epochs: - m = f""" - you restored a checkpoint with current_epoch={self.trainer.current_epoch} - but the Trainer(max_epochs={self.trainer.max_epochs}) - """ - raise MisconfigurationException(m) + raise MisconfigurationException( + f"You restored a checkpoint with current_epoch={self.trainer.current_epoch}," + f" but you have set Trainer(max_epochs={self.trainer.max_epochs})." + ) # Division deals with global step stepping once per accumulated batch # Inequality deals with different global step for odd vs even num_training_batches @@ -173,14 +209,28 @@ def restore_training_state(self, checkpoint, load_optimizer_states: bool = True) " consider using an end of epoch checkpoint." ) - if not load_optimizer_states: + def restore_optimizers_and_schedulers(self) -> None: + """ Restores the optimizers and learning rate scheduler states from the pre-loaded checkpoint. """ + if not self._loaded_checkpoint: return - # restore the optimizers - optimizer_states = checkpoint['optimizer_states'] - for optimizer, opt_state in zip(self.trainer.optimizers, optimizer_states): - optimizer.load_state_dict(opt_state) + # validation + if "optimizer_states" not in self._loaded_checkpoint or "lr_schedulers" not in self._loaded_checkpoint: + raise KeyError( + "Trying to restore training state but checkpoint contains only the model." + " This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`." + ) + self.restore_optimizers() + self.restore_lr_schedulers() + def restore_optimizers(self) -> None: + """ Restores the optimizer states from the pre-loaded checkpoint. 
""" + if not self._loaded_checkpoint: + return + + # restore the optimizers + self.trainer.training_type_plugin.load_optimizer_state_dict(self._loaded_checkpoint) + for optimizer in self.trainer.optimizers: # move optimizer to GPU 1 weight at a time # avoids OOM if self.trainer.root_gpu is not None: @@ -189,14 +239,20 @@ def restore_training_state(self, checkpoint, load_optimizer_states: bool = True) if isinstance(v, torch.Tensor): state[k] = v.cuda(self.trainer.root_gpu) + def restore_lr_schedulers(self) -> None: + """ Restores the learning rate scheduler states from the pre-loaded checkpoint. """ + if not self._loaded_checkpoint: + return + # restore the lr schedulers - lr_schedulers = checkpoint['lr_schedulers'] + lr_schedulers = self._loaded_checkpoint['lr_schedulers'] for scheduler, lrs_state in zip(self.trainer.lr_schedulers, lr_schedulers): scheduler['scheduler'].load_state_dict(lrs_state) # ---------------------------------- # PRIVATE OPS # ---------------------------------- + def hpc_save(self, folderpath: str, logger): # make sure the checkpoint folder exists folderpath = str(folderpath) # because the tests pass a path object @@ -206,7 +262,7 @@ def hpc_save(self, folderpath: str, logger): # save logger to make sure we get all the metrics logger.save() - max_suffix = self.max_ckpt_in_folder(folderpath) + max_suffix = self.max_ckpt_version_in_folder(folderpath) ckpt_number = (max_suffix if max_suffix is not None else 0) + 1 fs.makedirs(folderpath, exist_ok=True) @@ -225,8 +281,8 @@ def hpc_save(self, folderpath: str, logger): try: atomic_save(checkpoint, filepath) except AttributeError as err: - if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint: - del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] + if pl.LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint: + del checkpoint[pl.LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] rank_zero_warn( 'warning, `hyper_parameters` dropped from checkpoint.' 
                    f' An attribute is not picklable {err}'
                 )
                 atomic_save(checkpoint, filepath)
@@ -272,7 +328,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
         checkpoint = {
             'epoch': current_epoch,
             'global_step': global_step,
-            'pytorch-lightning_version': pytorch_lightning.__version__,
+            'pytorch-lightning_version': pl.__version__,
             'state_dict': self.trainer.accelerator.lightning_module_state_dict(),
         }
 
@@ -294,25 +350,18 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
             lr_schedulers.append(scheduler['scheduler'].state_dict())
             checkpoint['lr_schedulers'] = lr_schedulers
 
-        # dump amp scaling
-        if (
-            self.trainer.amp_backend == AMPType.NATIVE and self.trainer._device_type != DeviceType.TPU
-            and self.trainer.scaler is not None
-        ):
-            checkpoint['native_amp_scaling_state'] = self.trainer.scaler.state_dict()
-        elif self.trainer.amp_backend == AMPType.APEX:
-            checkpoint['amp_scaling_state'] = amp.state_dict()
+        self.trainer.precision_plugin.on_save_checkpoint(checkpoint)
 
         # dump hyper-parameters
         if model.hparams:
             if hasattr(model, '_hparams_name'):
-                checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_NAME] = model._hparams_name
+                checkpoint[pl.LightningModule.CHECKPOINT_HYPER_PARAMS_NAME] = model._hparams_name
             # dump arguments
             if _OMEGACONF_AVAILABLE and isinstance(model.hparams, Container):
-                checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = model.hparams
-                checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_TYPE] = type(model.hparams)
+                checkpoint[pl.LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = model.hparams
+                checkpoint[pl.LightningModule.CHECKPOINT_HYPER_PARAMS_TYPE] = type(model.hparams)
             else:
-                checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = dict(model.hparams)
+                checkpoint[pl.LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = dict(model.hparams)
 
         # give the model a chance to dump a few things
         model.on_save_checkpoint(checkpoint)
@@ -321,31 +370,20 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
 
         return checkpoint
 
-    def hpc_load(self, checkpoint_path: str, on_gpu: bool):
-        """
-        Load model/training states from a 'PyTorch-Lightning checkpoint' file for hpc.
-        All restored states are listed in return value description of `dump_checkpoint`.
+    def hpc_load(self, checkpoint_path: str) -> None:
         """
+        Attempts to restore the full training and model state from an HPC checkpoint file.
 
-        # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path`
-        checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
-
-        # acquire the model
-        model = self.trainer.lightning_module
-
-        # restore model and datamodule state
-        self.restore_model_state(model, checkpoint)
-
-        if self.trainer.root_gpu is not None:
-            model.cuda(self.trainer.root_gpu)
-
-        # restore training state
-        self.restore_training_state(checkpoint)
-
-        # call hpc specific hook
-        model.on_hpc_load(checkpoint)
+        .. deprecated:: v1.4
+            Will be removed in v1.6. Use :meth:`restore` instead.
+        """
+        rank_zero_deprecation(
+            "`CheckpointConnector.hpc_load()` was deprecated in v1.4 and will be removed in v1.6."
+            " Use `CheckpointConnector.restore()` instead."
+        )
+        self.restore(checkpoint_path)
 
-    def max_ckpt_in_folder(self, dir_path: Union[str, Path], name_key: str = 'ckpt_') -> Optional[int]:
+    def max_ckpt_version_in_folder(self, dir_path: Union[str, Path], name_key: str = 'ckpt_') -> Optional[int]:
         """List up files in `dir_path` with `name_key`, then yield maximum suffix number.
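+
+        For example (an illustrative case), a folder containing ``hpc_ckpt_3.ckpt`` and
+        ``hpc_ckpt_7.ckpt`` yields ``7`` when called with ``name_key="hpc_ckpt_"``.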
Args: dir_path: path of directory which may contain files whose name include `name_key` @@ -377,7 +415,7 @@ def max_ckpt_in_folder(self, dir_path: Union[str, Path], name_key: str = 'ckpt_' def get_max_ckpt_path_from_folder(self, folder_path: Union[str, Path]) -> str: """Get path of maximum-epoch checkpoint in the folder.""" - max_suffix = self.max_ckpt_in_folder(folder_path) + max_suffix = self.max_ckpt_version_in_folder(folder_path) ckpt_number = max_suffix if max_suffix is not None else 0 return f'{folder_path}/hpc_ckpt_{ckpt_number}.ckpt' diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index a867bf96a8d77..6785b25a2112c 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional, Union - -from torch.utils.data import DataLoader +from typing import Optional, Union import pytorch_lightning as pl from pytorch_lightning.trainer.supporters import prefetch_iterator from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -class DataConnector(object): +class DataConnector: def __init__(self, trainer: "pl.Trainer", multiple_trainloader_mode: str = "max_size_cycle"): self.trainer = trainer @@ -65,22 +64,21 @@ def can_prepare_data(self): if self.trainer.prepare_data_per_node: return self.trainer.local_rank == 0 and should_call_dm_prepare_data - else: - return self.trainer.node_rank == 0 and self.trainer.local_rank == 0 and should_call_dm_prepare_data + return self.trainer.node_rank == 0 and self.trainer.local_rank == 0 and should_call_dm_prepare_data def attach_data( self, model: 'pl.LightningModule', - train_dataloader: Optional[Union[DataLoader, List[DataLoader]]] = None, - val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, - test_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, - predict_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + train_dataloaders: Optional[TRAIN_DATALOADERS] = None, + val_dataloaders: Optional[EVAL_DATALOADERS] = None, + test_dataloaders: Optional[EVAL_DATALOADERS] = None, + predict_dataloaders: Optional[EVAL_DATALOADERS] = None, datamodule: Optional['pl.LightningDataModule'] = None ) -> None: # set up the passed in dataloaders (if needed) self.attach_dataloaders( model, - train_dataloader=train_dataloader, + train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders, test_dataloaders=test_dataloaders, predict_dataloaders=predict_dataloaders, @@ -92,15 +90,15 @@ def attach_data( def attach_dataloaders( self, model: 'pl.LightningModule', - train_dataloader: Optional[Union[DataLoader, List[DataLoader]]] = None, - val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, - test_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, - predict_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + train_dataloaders: Optional[TRAIN_DATALOADERS] = None, + val_dataloaders: Optional[EVAL_DATALOADERS] = None, + test_dataloaders: Optional[EVAL_DATALOADERS] = None, + predict_dataloaders: Optional[EVAL_DATALOADERS] = None, ) -> None: # when dataloader is passed via fit, patch the train_dataloader # functions to 
overwrite with these implementations - if train_dataloader is not None: - model.train_dataloader = _PatchDataLoader(train_dataloader) + if train_dataloaders is not None: + model.train_dataloader = _PatchDataLoader(train_dataloaders) if val_dataloaders is not None: model.val_dataloader = _PatchDataLoader(val_dataloaders) @@ -114,43 +112,40 @@ def attach_dataloaders( def attach_datamodule( self, model: 'pl.LightningModule', datamodule: Optional['pl.LightningDataModule'] = None ) -> None: - # We use datamodule if it's been provided, otherwise we check model for it - datamodule = datamodule or getattr(model, 'datamodule', None) - # If we have a datamodule, attach necessary hooks + dataloaders - if datamodule: + if datamodule is None: + return - # Override loader hooks - dl_methods = ('train_dataloader', 'val_dataloader', 'test_dataloader', 'predict_dataloader') - for method in dl_methods: - if is_overridden(method, datamodule): - setattr(model, method, getattr(datamodule, method)) + # Override loader hooks + dl_methods = ('train_dataloader', 'val_dataloader', 'test_dataloader', 'predict_dataloader') + for method in dl_methods: + if is_overridden(method, datamodule): + setattr(model, method, getattr(datamodule, method)) - # Override data transfer hooks if dataset-specific to_device logic has been defined in datamodule - batch_transfer_hooks = ('on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer') - for hook in batch_transfer_hooks: - if is_overridden(hook, datamodule): - setattr(model, hook, getattr(datamodule, hook)) + # Override data transfer hooks if dataset-specific to_device logic has been defined in datamodule + batch_transfer_hooks = ('on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer') + for hook in batch_transfer_hooks: + if is_overridden(hook, datamodule): + setattr(model, hook, getattr(datamodule, hook)) - self.trainer.datamodule = datamodule - datamodule.trainer = self.trainer + self.trainer.datamodule = datamodule + datamodule.trainer = self.trainer - # experimental feature for Flash - if hasattr(datamodule, "data_pipeline"): - model.data_pipeline = datamodule.data_pipeline + # experimental feature for Flash + if hasattr(datamodule, "data_pipeline"): + model.data_pipeline = datamodule.data_pipeline -class _PatchDataLoader(object): +class _PatchDataLoader: r""" Callable object for patching dataloaders passed into trainer.fit(). Use this class to override model.*_dataloader() and be pickle-compatible. Args: dataloader: Dataloader object to return when called. 
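+
+    Example (an illustrative sketch; ``train_loader`` stands in for any ``DataLoader`` passed to ``trainer.fit``)::
+
+        model.train_dataloader = _PatchDataLoader(train_loader)
+        model.train_dataloader()  # calling the patch returns ``train_loader``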
- """ - def __init__(self, dataloader: Union[List[DataLoader], DataLoader]): + def __init__(self, dataloader: Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]) -> None: self.dataloader = dataloader # cannot pickle __code__ so cannot verify if PatchDataloader @@ -158,5 +153,5 @@ def __init__(self, dataloader: Union[List[DataLoader], DataLoader]): # so, we hack it by using the string representation self.patch_loader_code = str(self.__call__.__code__) - def __call__(self) -> Union[List[DataLoader], DataLoader]: + def __call__(self) -> Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]: return self.dataloader diff --git a/pytorch_lightning/trainer/connectors/debugging_connector.py b/pytorch_lightning/trainer/connectors/debugging_connector.py index 0108a1045698f..9691f416a0c23 100644 --- a/pytorch_lightning/trainer/connectors/debugging_connector.py +++ b/pytorch_lightning/trainer/connectors/debugging_connector.py @@ -58,9 +58,9 @@ def on_init_start( limit_val_batches = fast_dev_run limit_test_batches = fast_dev_run limit_predict_batches = fast_dev_run - self.trainer.train_loop.max_steps = fast_dev_run + self.trainer.fit_loop.max_steps = fast_dev_run self.trainer.num_sanity_val_steps = 0 - self.trainer.train_loop.max_epochs = 1 + self.trainer.fit_loop.max_epochs = 1 val_check_interval = 1.0 self.trainer.check_val_every_n_epoch = 1 self.trainer.logger = DummyLogger() @@ -89,9 +89,8 @@ def determine_data_use_amount(self, overfit_batches: float) -> None: def _determine_batch_limits(batches: Union[int, float], name: str) -> Union[int, float]: if 0 <= batches <= 1: return batches - elif batches > 1 and batches % 1.0 == 0: + if batches > 1 and batches % 1.0 == 0: return int(batches) - else: - raise MisconfigurationException( - f'You have passed invalid value {batches} for {name}, it has to be in [0.0, 1.0] or an int.' - ) + raise MisconfigurationException( + f'You have passed invalid value {batches} for {name}, it has to be in [0.0, 1.0] or an int.' + ) diff --git a/pytorch_lightning/trainer/connectors/env_vars_connector.py b/pytorch_lightning/trainer/connectors/env_vars_connector.py index 1f1c41c6eb2f0..d3084e3e4ece5 100644 --- a/pytorch_lightning/trainer/connectors/env_vars_connector.py +++ b/pytorch_lightning/trainer/connectors/env_vars_connector.py @@ -31,7 +31,7 @@ def insert_env_defaults(self, *args, **kwargs): # parse only the argument names cls_arg_names = [arg[0] for arg in get_init_arguments_and_types(cls)] # convert args to kwargs - kwargs.update({k: v for k, v in zip(cls_arg_names, args)}) + kwargs.update(dict(zip(cls_arg_names, args))) env_variables = vars(parse_env_variables(cls)) # update the kwargs by env variables kwargs = dict(list(env_variables.items()) + list(kwargs.items())) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py deleted file mode 100644 index 3d6370e3eb658..0000000000000 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ /dev/null @@ -1,493 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -from collections import defaultdict -from typing import Any, Dict, List, Optional, Tuple -from weakref import proxy - -import torch - -import pytorch_lightning as pl -from pytorch_lightning.core.step_result import Result -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import DistributedType, LightningEnum - - -class ResultStoreType(LightningEnum): - INSIDE_BATCH_TRAIN_LOOP = "inside_batch_train_loop" - OUTSIDE_BATCH_TRAIN_LOOP = "outside_batch_train_loop" - - -class HookResultStore: - """ - This class is defined for internal usage. - It holds all metrics logged using the self.log function - in the scope of ModelHooks or Callback functions. - - We need to differentiate 3 different scenarios: - - (1): We are outside of a batch loop - * It means no dataloader_idx, no optimizer idx, etc.. - - (2): We are inside the training batch loop - * We have an optimizer idx and split idx to track - - (3): We are inside the evaluation loop - * We have a dataloader_idx to track - - The data store `Result` objects for those 3 scenarios in `self._internals`. - - (1): self._internals = {dataloader_idx: [Result(), ..., Result()]} - * dataloader_idx not being defined, it is set to 0 b default - (2): self._internals = {dataloader_idx: {optimizer_idx: {batch_idx: [Result(), ..., Result()]}}} - (3): Same as (1) for simplicity - - Those data structures enables us to reduce properly Result object when batch loop is finished. - """ - - def __init__(self, fx_name: str) -> None: - self._fx_name = fx_name - self._internals = {} - self._internals_reduced = {} - self._internal_type: Optional[ResultStoreType] = None - self.has_reduced = False - self._latest_ref = {} - - @property - def num_dataloaders(self) -> int: - return len(self._internals_reduced if self.has_reduced else self._internals) - - def check_dataloader_idx(self, result: Result) -> bool: - random_key = list(result.keys())[-1] - return result["meta"][random_key]["dataloader_idx"] is not None - - def get_latest_from_func_name(self, latest_result_opt, func_name: str, *args, **kwargs) -> Dict: - results = {} - for opt_idx in latest_result_opt: - latest_result = latest_result_opt[opt_idx] - add_dataloader_idx = self.check_dataloader_idx(latest_result) - func = getattr(latest_result, func_name) - results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) - return results - - def run_latest_batch_metrics_with_func_name(self, func_name, *args, **kwargs) -> List[Dict]: - """ - This function used cache_ref and cache_result to optimize loading metrics - - Context: As we update the logger_connector metrics on every `self.log` call, - and it can be pretty time consuming, especially when logging outside batch loop. 
- - HookResultStore keeps track of its latest added result object, - and cache its pbar and log metrics if already called on, - """ - return [ - self.get_latest_from_func_name(self._latest_ref[dl_idx], func_name, *args, **kwargs) - for dl_idx in range(self.num_dataloaders) - ] - - def get_batch_pbar_metrics(self, *args, **kwargs): - return self.run_latest_batch_metrics_with_func_name("get_batch_pbar_metrics", *args, **kwargs) - - def get_batch_log_metrics(self, *args, **kwargs): - return self.run_latest_batch_metrics_with_func_name("get_batch_log_metrics", *args, **kwargs) - - def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> None: - if not isinstance(opt_metric, Result): - raise Exception("The provided opt_metric should be a Result Object. Something is wrong") - - func = getattr(opt_metric, func_name) - metrics_to_log = func(*args, add_dataloader_idx=self.num_dataloaders > 1, **kwargs) - - results.append(metrics_to_log) - - def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> List[Dict]: - results = [] - for dl_idx in range(self.num_dataloaders): - opt_metrics = self._internals_reduced[dl_idx] - if isinstance(opt_metrics, defaultdict): - for opt_metric in opt_metrics.values(): - self.run_epoch_func(results, opt_metric, func_name, *args, **kwargs) - else: - self.run_epoch_func(results, opt_metrics, func_name, *args, **kwargs) - return results - - def get_epoch_pbar_metrics(self, *_, **__) -> List[Dict]: - return self.get_epoch_from_func_name("get_epoch_pbar_metrics") - - def get_epoch_log_metrics(self, *_, **__) -> List[Dict]: - return self.get_epoch_from_func_name("get_epoch_log_metrics") - - def get_forked_metrics(self, *_, **__) -> List[Dict]: - return self.get_epoch_from_func_name("get_forked_metrics") - - def append(self, result: Result, info: Dict) -> None: - dataloader_idx = info["dataloader_idx"] - self._internal_type = info["type"] - opt_idx = info["opt_idx"] - - if self._internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: - if dataloader_idx not in self._internals: - self._internals_reduced[dataloader_idx] = defaultdict(dict) - self._latest_ref[dataloader_idx] = {} - self._internals.setdefault(dataloader_idx, {}) - - batch_idx = info["batch_idx"] - self._internals[dataloader_idx].setdefault(opt_idx, {}) - self._internals[dataloader_idx][opt_idx].setdefault(batch_idx, []) - self._internals[dataloader_idx][opt_idx][batch_idx].append(result) - else: - self._internals.setdefault(dataloader_idx, []) - self._internals[dataloader_idx].append(result) - self._latest_ref.setdefault(dataloader_idx, {}) - - self._latest_ref[dataloader_idx].setdefault(opt_idx, {}) - self._latest_ref[dataloader_idx][opt_idx] = result - - def auto_reduce_results_on_epoch_end(self) -> None: - """ - This function is called to reduce `self._internals` Result object. - The reduced Result object will be saved into `self._internals_reduced` - The `self._internals` stored Result objects will be deleted to save memory. 
- """ - if self.has_reduced: - return - for dl_idx in range(self.num_dataloaders): - epoch_metrics = self._internals[dl_idx] - - if self._internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: - for opt_idx in list(epoch_metrics): - # TODO: Figure out to reduce memory - # TODO: How to start training in middle of epoch - outputs = epoch_metrics[opt_idx] - # reduce across time first - time_reduced_outputs = [] - for tbptt_outputs in outputs.values(): - tbptt_outputs = type(tbptt_outputs[0]).reduce_across_time(tbptt_outputs) - if len(tbptt_outputs) > 1: - time_reduced_outputs.append(tbptt_outputs) - - if len(time_reduced_outputs) == 0: - continue - - # reduce across training steps - outputs = type(time_reduced_outputs[0]).reduce_on_epoch_end(time_reduced_outputs) - - # with manual opt need 1 + metrics because meta is always there - if outputs.minimize is not None: - outputs.minimize = outputs.minimize.mean() - - self._internals_reduced[dl_idx][opt_idx] = outputs - - # free memory - del self._internals[dl_idx][opt_idx] - else: - reduced_epoch_metrics = epoch_metrics[0] - if len(epoch_metrics) != 1: - reduced_epoch_metrics = type(reduced_epoch_metrics).reduce_on_epoch_end(epoch_metrics) - - self._internals_reduced[dl_idx] = reduced_epoch_metrics - - # free memory - del self._internals[dl_idx] - - self.has_reduced = True - - def reset(self) -> None: - """ - Call at the end of epoch to reset Result objects - """ - for dl_idx in range(self.num_dataloaders): - epoch_metrics = self._internals[dl_idx] if not self.has_reduced else self._internals_reduced[dl_idx] - if self._internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: - for opt_idx in list(epoch_metrics): - epoch_metrics[opt_idx].reset() - else: - epoch_metrics.reset() - - def __getitem__(self, key: str) -> Any: - return self._internals.get(key, None) - - def __repr__(self): - return self._internals.__repr__() - - -class EpochResultStore: - """ - This class is defined for internal usage. - It holds all metrics logged using the self.log function inside `HookResultStore` objects. - - The internal data-structure is as follow: - self._internals = {"fx_name_0": HookResultStore(), ..., "fx_name_n": HookResultStore()} - - ..example:: - - model._results = Result() - model._current_fx_name = 'something' - model.log('a', ...) 
- epoch_result_store.cache_result() - """ - - def __init__(self, trainer: 'pl.Trainer') -> None: - self.trainer = proxy(trainer) - self._internals = {} - self.reset() - - def __getitem__(self, key: str) -> Any: - return self._internals.get(key, None) - - @property - def info(self): - """ - This function provides necessary parameters to properly configure HookResultStore obj - """ - model_ref = self.trainer.lightning_module - return { - "batch_idx": self.trainer.train_loop.batch_idx, - "fx_name": model_ref._current_fx_name, - "dataloader_idx": model_ref._current_dataloader_idx or 0, - "opt_idx": self._opt_idx or 0, - "split_idx": self._split_idx or 0, - "type": ( - ResultStoreType.INSIDE_BATCH_TRAIN_LOOP if self._opt_idx is not None and self._split_idx is not None - else ResultStoreType.OUTSIDE_BATCH_TRAIN_LOOP - ) - } - - def reset_model(self): - """ - This function is used to reset model state at the end of the capture - """ - model_ref = self.trainer.lightning_module - model_ref._results = Result() - model_ref._current_fx_name = None - - def cache_result(self) -> None: - """ - This function is called after every hook and stores the result object - """ - with self.trainer.profiler.profile("cache_result"): - model_ref = self.trainer.lightning_module - - # extract hook results - hook_result = model_ref._results - - if len(hook_result) == 1: - model_ref._current_fx_name = None - return - - info = self.info - fx_name = info["fx_name"] - - self._internals.setdefault(fx_name, HookResultStore(fx_name)) - - # attach capture batch_size - Result.attach_batch_size(self._batch_size, hook_result) - - hook_result = hook_result.detach() - if self.trainer.move_metrics_to_cpu: - hook_result = hook_result.cpu() - elif self.trainer._distrib_type == DistributedType.DP: - hook_result = hook_result.to(torch.device("cuda", self.trainer.root_gpu)) - - self._internals[fx_name].append(hook_result, info) - - # update logged_metrics, progress_bar_metrics, callback_metrics - if "epoch_end" in fx_name: - self.update_logger_connector() - - self.reset_model() - - def update_logger_connector(self) -> Tuple[Dict, Dict]: - """ - This function is called every time we capture a hook - It automatically updates the logger_connector followings: - - progress_bar_metrics with pbar_metrics - - logged_metrics with log_metrics - - callback_metrics with progress_bar_metrics + logged_metrics - """ - - logger_connector = self.trainer.logger_connector - - callback_metrics = {} - batch_pbar_metrics = {} - batch_log_metrics = {} - - if not self._has_batch_loop_finished: - # get pbar - batch_pbar_metrics = self.get_latest_batch_pbar_metrics() - logger_connector.add_progress_bar_metrics(batch_pbar_metrics) - batch_log_metrics = self.get_latest_batch_log_metrics() - - if self.trainer.training: - logger_connector._logged_metrics.update(batch_log_metrics) - callback_metrics.update(batch_pbar_metrics) - callback_metrics.update(batch_log_metrics) - else: - # get pbar - epoch_pbar_metrics = self.get_epoch_pbar_metrics() - logger_connector.add_progress_bar_metrics(epoch_pbar_metrics) - - # get logged_metrics - epoch_log_metrics = self.get_epoch_log_metrics() - logger_connector._logged_metrics.update(epoch_log_metrics) - logger_connector._logged_metrics.update({"epoch": self.trainer.current_epoch}) - - # get forked_metrics - forked_metrics = self.get_forked_metrics() - - callback_metrics.update(epoch_pbar_metrics) - callback_metrics.update(epoch_log_metrics) - callback_metrics.update(forked_metrics) - - # TODO(carmocca): when we implement flushing 
the logger connector metrics after - # the trainer.state changes, this should check trainer.evaluating instead - if self.trainer.state.fn in (TrainerFn.TESTING, TrainerFn.VALIDATING): - logger_connector.evaluation_callback_metrics.update(callback_metrics) - - # update callback_metrics - logger_connector._callback_metrics.update(callback_metrics) - - batch_pbar_metrics.pop("debug_epoch", None) - return batch_pbar_metrics, batch_log_metrics - - def run_batch_from_func_name(self, func_name) -> Dict: - results = [getattr(hook_result, func_name) for hook_result in self._internals.values()] - results = [func(include_forked_originals=False) for func in results] - return {k: v for d in sum(results, []) for k, v in d.items()} # List[List[dict]] -> dict - - def get_latest_batch_log_metrics(self) -> Dict: - batch_log_metrics = self.run_batch_from_func_name("get_batch_log_metrics") - return batch_log_metrics - - def get_latest_batch_pbar_metrics(self) -> Dict: - batch_pbar_metrics = self.run_batch_from_func_name("get_batch_pbar_metrics") - return batch_pbar_metrics - - @property - def has_reduced(self) -> bool: - hook_results = self._internals.values() - return len(hook_results) == sum(h.has_reduced for h in hook_results) - - def auto_reduce_results_on_epoch_end(self) -> None: - if not self.has_reduced: - for hook_result in self._internals.values(): - hook_result.auto_reduce_results_on_epoch_end() - - @property - def has_batch_loop_finished(self) -> bool: - return self._has_batch_loop_finished - - @has_batch_loop_finished.setter - def has_batch_loop_finished(self, has_batch_loop_finished): - if has_batch_loop_finished: - # If batch loop has finished, reduce metrics - self.auto_reduce_results_on_epoch_end() - - # batch_size should be none as we finished batch loop - self._batch_size = None - - self._has_batch_loop_finished = has_batch_loop_finished - self.update_logger_connector() - - def run_epoch_by_func_name(self, func_name) -> Dict: - if not self.has_reduced: - self.auto_reduce_results_on_epoch_end() - results = [getattr(hook_result, func_name) for hook_result in self._internals.values()] - results = [func() for func in results] - return {k: v for d in sum(results, []) for k, v in d.items()} # List[List[dict]] -> dict - - def get_epoch_pbar_metrics(self) -> Dict: - return self.run_epoch_by_func_name("get_epoch_pbar_metrics") - - def get_epoch_log_metrics(self) -> Dict: - return self.run_epoch_by_func_name("get_epoch_log_metrics") - - def get_forked_metrics(self) -> Dict: - return self.run_epoch_by_func_name("get_forked_metrics") - - def reset(self) -> None: - for value in self._internals.values(): - value.reset() - self._internals = {} - self._dataloader_idx: Optional[int] = None - self._split_idx: Optional[int] = None - self._opt_idx: Optional[int] = None - self._batch_size: Optional[int] = None - self._has_batch_loop_finished = False - - def __call__( - self, - fx_name: str, - dl_idx: Optional[int] = None, - opt_idx: Optional[int] = None, - batch_idx: Optional[int] = None, - split_idx: Optional[int] = None, - reduced: bool = False, - ): - """ - This function is a helper to access stored data - - It access data from the HookResultStore. 
Please, - check its data structure for better understanding - - Data can be accessed with the following chains: - - IF REDUCED: - * IF accessing a fx_name defined in batch training loop: - fx_name -> dl_idx -> opt_idx -> batch_idx -> split_idx - * ELSE fx_name -> dl_idx -> batch_idx - ELSE: - * IF accessing a fx_name defined in batch training loop: - fx_name -> dl_idx -> opt_idx - * ELSE fx_name -> dl_idx - - Note: - As soon as a param is None, it breaks the chain and returns associated stored data. - - Example:: - - result: Result = self(fx_name="training_step", dl_idx=0, opt_idx=0, reduced=True) - result['train_loss_epoch'] # aggregated train_loss over one epoch. - - Args: - - fx_name: Hook name from ModelHooks or Callback. Example: ``"training_step"`` - - dl_idx: Dataloader index in short. From ``0`` to ``num_dataloaders - 1`` - - opt_idx: Optimizer index in short. From ``0`` to ``num_optimizers - 1`` - - batch_idx: Batch index seen during batch training or evaluation. - Works only with ``reduced=False`` - - split_idx: Index of split idx in training loop when tbptt is used. - - reduced: Data are being aggregated on on_epoch_end. - Indicates if we want to access the aggregated Result or not. - """ - hook_result = self[fx_name] - internal_type = hook_result._internal_type - result = hook_result._internals_reduced if reduced else hook_result._internals - - if dl_idx is not None: - result = result[dl_idx] - if internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: - if opt_idx is not None: - result = result[opt_idx] - if not reduced and batch_idx is not None: - result = result[batch_idx] - if split_idx is not None: - result = result[split_idx] - elif not reduced and batch_idx is not None: - result = result[batch_idx] - return result - - def __repr__(self): - return f"{self.__class__.__name__}(internals={self._internals})" diff --git a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index 3db8aace451dd..8d079f8b4a637 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -29,26 +29,26 @@ class FxValidator: on_fit_end=None, on_sanity_check_start=None, on_sanity_check_end=None, - on_train_start=dict(on_step=(False, True), on_epoch=(False, True)), + on_train_start=dict(on_step=(False, ), on_epoch=(True, )), on_train_end=None, - on_validation_start=dict(on_step=(False, True), on_epoch=(False, True)), + on_validation_start=dict(on_step=(False, ), on_epoch=(True, )), on_validation_end=None, - on_test_start=dict(on_step=(False, True), on_epoch=(False, True)), + on_test_start=dict(on_step=(False, ), on_epoch=(True, )), on_test_end=None, on_predict_start=None, on_predict_end=None, on_pretrain_routine_start=None, on_pretrain_routine_end=None, - on_train_epoch_start=dict(on_step=(False, True), on_epoch=(False, True)), - on_train_epoch_end=dict(on_step=(False, ), on_epoch=(False, True)), - on_validation_epoch_start=dict(on_step=(False, True), on_epoch=(False, True)), - on_validation_epoch_end=dict(on_step=(False, ), on_epoch=(False, True)), - on_test_epoch_start=dict(on_step=(False, True), on_epoch=(False, True)), - on_test_epoch_end=dict(on_step=(False, ), on_epoch=(False, True)), + on_train_epoch_start=dict(on_step=(False, True), on_epoch=(True, )), + on_train_epoch_end=dict(on_step=(False, ), on_epoch=(True, )), + on_validation_epoch_start=dict(on_step=(False, True), on_epoch=(True, )), + 
on_validation_epoch_end=dict(on_step=(False, ), on_epoch=(True, )), + on_test_epoch_start=dict(on_step=(False, True), on_epoch=(True, )), + on_test_epoch_end=dict(on_step=(False, ), on_epoch=(True, )), on_predict_epoch_start=None, on_predict_epoch_end=None, - on_epoch_start=dict(on_step=(False, True), on_epoch=(False, True)), - on_epoch_end=dict(on_step=(False, ), on_epoch=(False, True)), + on_epoch_start=dict(on_step=(False, True), on_epoch=(True, )), + on_epoch_end=dict(on_step=(False, ), on_epoch=(True, )), on_batch_start=dict(on_step=(False, True), on_epoch=(False, True)), on_batch_end=dict(on_step=(False, True), on_epoch=(False, True)), on_train_batch_start=dict(on_step=(False, True), on_epoch=(False, True)), @@ -72,19 +72,26 @@ class FxValidator: training_step_end=dict(on_step=(False, True), on_epoch=(False, True)), validation_step_end=dict(on_step=(False, True), on_epoch=(False, True)), test_step_end=dict(on_step=(False, True), on_epoch=(False, True)), - training_epoch_end=dict(on_step=(False, ), on_epoch=(False, True)), - validation_epoch_end=dict(on_step=(False, ), on_epoch=(False, True)), - test_epoch_end=dict(on_step=(False, ), on_epoch=(False, True)), + training_epoch_end=dict(on_step=(False, ), on_epoch=(True, )), + validation_epoch_end=dict(on_step=(False, ), on_epoch=(True, )), + test_epoch_end=dict(on_step=(False, ), on_epoch=(True, )), + on_before_batch_transfer=None, + transfer_batch_to_device=None, + on_after_batch_transfer=None, + backward=None, + optimizer_step=None, # TODO(@carmocca): some {step,epoch}_{start,end} are missing ) - def check_logging(self, fx_name: str, on_step: bool, on_epoch: bool) -> None: - if fx_name not in self.functions: + @classmethod + def check_logging(cls, fx_name: str, on_step: bool, on_epoch: bool) -> None: + """Check if the given function name is allowed to log""" + if fx_name not in cls.functions: raise RuntimeError( f'You are trying to `self.log()` inside `{fx_name}` but it is not implemented.' ' Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`' ) - allowed = self.functions[fx_name] + allowed = cls.functions[fx_name] if allowed is None: raise MisconfigurationException(f"{fx_name} function doesn't support logging using `self.log()`") diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index a16f5119abff2..e248b5ff8cf13 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
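
The `FxValidator` table above now pins each hook to the `on_step`/`on_epoch` flag values it supports, and `check_logging` became a classmethod, so it can be called without an instance. A minimal sketch of the resulting behaviour, using only entries visible in this diff; the error raised for a disallowed flag combination is truncated above, so just the `None` case is exercised here:

    from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import FxValidator
    from pytorch_lightning.utilities.exceptions import MisconfigurationException

    # `training_step_end` allows any on_step/on_epoch combination per the table
    FxValidator.check_logging('training_step_end', on_step=True, on_epoch=True)

    # `backward` maps to None, so `self.log()` calls inside it are rejected
    try:
        FxValidator.check_logging('backward', on_step=False, on_epoch=True)
    except MisconfigurationException as err:
        print(err)  # backward function doesn't support logging using `self.log()`
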
import os -from copy import deepcopy from pprint import pprint -from typing import Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, Mapping, Optional, Union import torch +import pytorch_lightning as pl from pytorch_lightning.core import memory -from pytorch_lightning.core.step_result import Result -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger -from pytorch_lightning.trainer.connectors.logger_connector.epoch_result_store import EpochResultStore -from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import FxValidator -from pytorch_lightning.trainer.connectors.logger_connector.metrics_holder import MetricsHolder +from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection, TensorBoardLogger +from pytorch_lightning.trainer.connectors.logger_connector.result import _METRIC, MetricSource from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities import DeviceType from pytorch_lightning.utilities.metrics import metrics_to_scalars @@ -32,110 +29,44 @@ class LoggerConnector: - def __init__(self, trainer, log_gpu_memory: Optional[str] = None): + def __init__(self, trainer: 'pl.Trainer', log_gpu_memory: Optional[str] = None) -> None: self.trainer = trainer self.log_gpu_memory = log_gpu_memory - self._callback_metrics = MetricsHolder() - self._evaluation_callback_metrics = MetricsHolder(to_float=True) - self._logged_metrics = MetricsHolder() - self._progress_bar_metrics = MetricsHolder(to_float=True) self.eval_loop_results = [] - self._cached_results = {stage: EpochResultStore(trainer) for stage in RunningStage} - self._cached_results[None] = EpochResultStore(trainer) - self._fx_validator = FxValidator() self._val_log_step: int = 0 self._test_log_step: int = 0 - - @property - def callback_metrics(self) -> Dict: - return self.get_metrics("callback_metrics") - - @callback_metrics.setter - def callback_metrics(self, callback_metrics: Dict) -> None: - self.set_metrics("callback_metrics", callback_metrics) - - @property - def evaluation_callback_metrics(self) -> Dict: - return self.get_metrics("evaluation_callback_metrics") - - @evaluation_callback_metrics.setter - def evaluation_callback_metrics(self, evaluation_callback_metrics: Dict) -> None: - self.set_metrics("evaluation_callback_metrics", evaluation_callback_metrics) - - @property - def logged_metrics(self) -> Dict: - return self.get_metrics("logged_metrics") - - @logged_metrics.setter - def logged_metrics(self, logged_metrics: Dict) -> None: - self.set_metrics("logged_metrics", logged_metrics) - - @property - def progress_bar_metrics(self) -> Dict: - return self.get_metrics("progress_bar_metrics") - - @progress_bar_metrics.setter - def progress_bar_metrics(self, progress_bar_metrics: Dict) -> None: - self.set_metrics("progress_bar_metrics", progress_bar_metrics) - - @property - def cached_results(self) -> Union[EpochResultStore, None]: - return self._cached_results.get(self.trainer.state.stage) - - def get_metrics(self, key: str) -> Dict: - metrics_holder: MetricsHolder = getattr(self, f"_{key}") - model = self.trainer.lightning_module - metrics_holder.convert(model.device if model is not None else None) - return metrics_holder.metrics - - def set_metrics(self, key: str, val: Dict) -> None: - metrics_holder: MetricsHolder = getattr(self, f"_{key}") - metrics_holder.reset(val) - - def reset(self) -> None: - self.cached_results.reset() - - def check_logging(self, fx_name: str, on_step: bool, on_epoch: bool) -> None: 
- self._fx_validator.check_logging(fx_name=fx_name, on_step=on_step, on_epoch=on_epoch) - - def on_evaluation_batch_start(self, batch, dataloader_idx, num_dataloaders): - model = self.trainer.lightning_module - # set dataloader_idx only if multiple ones - model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None - # track batch_size - self.cached_results._batch_size = Result.extract_batch_size(batch) - - def on_train_split_start(self, split_idx: int, opt_idx: int, split_batch) -> None: - self.cached_results._split_idx = split_idx - self.cached_results._opt_idx = opt_idx - self.cached_results._batch_size = Result.extract_batch_size(split_batch) - - def on_train_batch_end(self) -> None: - self.cached_results._split_idx = None - self.cached_results._opt_idx = None - self.cached_results._batch_size = None - - def cache_logged_metrics(self): - self._cached_results[self.trainer.state.stage].cache_result() - - def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool): - # logging + self._progress_bar_metrics: Dict[str, float] = {} + self._logged_metrics: Dict[str, _METRIC] = {} + self._callback_metrics: Dict[str, _METRIC] = {} + self._gpus_metrics: Dict[str, str] = {} + self._epoch_end_reached = False + self._current_fx: Optional[str] = None + self._batch_idx: Optional[int] = None + self._split_idx: Optional[int] = None + + def on_trainer_init( + self, + logger: LightningLoggerBase, + flush_logs_every_n_steps: int, + log_every_n_steps: int, + move_metrics_to_cpu: bool, + ) -> None: self.configure_logger(logger) self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps self.trainer.log_every_n_steps = log_every_n_steps self.trainer.move_metrics_to_cpu = move_metrics_to_cpu @property - def should_flush_logs(self): + def should_flush_logs(self) -> bool: should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0 return should_flush or self.trainer.should_stop @property - def should_update_logs(self): + def should_update_logs(self) -> bool: should_log_every_n_steps = (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0 return should_log_every_n_steps or self.trainer.should_stop - def configure_logger(self, logger): + def configure_logger(self, logger: Union[bool, Iterable, LightningLoggerBase]) -> None: if logger is True: version = os.environ.get('PL_EXP_VERSION', self.trainer.slurm_job_id) @@ -151,131 +82,100 @@ def configure_logger(self, logger): else: self.trainer.logger = logger - def cache_training_step_metrics(self, opt_closure_result): - """ - This function is responsible to update - logger_connector internals metrics holder based for depreceated logging - """ - using_results_obj = isinstance(opt_closure_result.training_step_output, Result) - - # temporary dict to collect metrics - logged_metrics_tmp = {} - pbar_metrics_tmp = {} - callback_metrics_tmp = {} - - if using_results_obj: - batch_log_metrics = opt_closure_result.training_step_output.get_batch_log_metrics( - include_forked_originals=False - ) - logged_metrics_tmp.update(batch_log_metrics) - - batch_pbar_metrics = opt_closure_result.training_step_output.get_batch_pbar_metrics( - include_forked_originals=False - ) - pbar_metrics_tmp.update(batch_pbar_metrics) - - forked_metrics = opt_closure_result.training_step_output.get_forked_metrics() - callback_metrics_tmp.update(forked_metrics) - callback_metrics_tmp.update(logged_metrics_tmp) - - else: - batch_log_metrics = 
opt_closure_result.training_step_output.log_metrics - logged_metrics_tmp.update(batch_log_metrics) - - batch_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end - pbar_metrics_tmp.update(batch_pbar_metrics) - - # track progress bar metrics - if len(pbar_metrics_tmp) > 0: - self.add_progress_bar_metrics(pbar_metrics_tmp) - - self._callback_metrics.update(callback_metrics_tmp) - self._logged_metrics.update(logged_metrics_tmp) - - def log_metrics(self, metrics, grad_norm_dict, step=None): + def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -> None: """Logs the metric dict passed in. If `step` parameter is None and `step` key is presented is metrics, uses metrics["step"] as a step Args: - metrics (dict): Metric values - grad_norm_dict (dict): Gradient norms - step (int): Step for which metrics should be logged. Default value is `self.global_step` during training or + metrics: Metric values + step: Step for which metrics should be logged. Default value is `self.global_step` during training or the total validation / test log step count during validation and testing. """ - # add gpu memory - if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory: - mem_map = memory.get_memory_profile(self.log_gpu_memory) - metrics.update(mem_map) - - # add norms - metrics.update(grad_norm_dict) + if self.trainer.logger is None or not metrics: + return # turn all tensors to scalars scalar_metrics = metrics_to_scalars(metrics) - if "step" in scalar_metrics and step is None: - step = scalar_metrics.pop("step") - - elif step is None: - # added metrics by Lightning for convenience - scalar_metrics['epoch'] = self.trainer.current_epoch + if step is None: + step = scalar_metrics.pop("step", None) + if step is None: + # added metrics for convenience + scalar_metrics.setdefault("epoch", self.trainer.current_epoch) step = self.trainer.global_step # log actual metrics - if self.trainer.logger is not None: - if self.trainer.is_global_zero: - self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step) - self.trainer.logger.save() - - # track the logged metrics - self.logged_metrics.update(scalar_metrics) - self.trainer.dev_debugger.track_logged_metrics_history(scalar_metrics) - - def add_progress_bar_metrics(self, metrics): - for k, v in metrics.items(): - if isinstance(v, torch.Tensor): - v = v.item() - - self._progress_bar_metrics.metrics[k] = v - - self.trainer.dev_debugger.track_pbar_metrics_history(metrics) - - def evaluation_epoch_end(self): - # reset dataloader idx - model_ref = self.trainer.lightning_module - model_ref._current_dataloader_idx = None - - # setting `has_batch_loop_finished` to True - # will perform Results reduction accross entire epoch. - self.cached_results.has_batch_loop_finished = True - - def add_to_eval_loop_results(self, dl_idx, has_been_initialized): - callback_metrics = deepcopy(self.evaluation_callback_metrics) - for key in list(callback_metrics.keys()): - if "dataloader_idx" in key: - if f"dataloader_idx_{dl_idx}" not in key: - # remove dl_idx from self.callback_metrics not belonging to this dataset. 
- del callback_metrics[key] - if has_been_initialized: - self.eval_loop_results[dl_idx].update(callback_metrics) - else: - self.eval_loop_results.append(callback_metrics) + if self.trainer.is_global_zero: + self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step) + self.trainer.logger.save() + + self._logged_metrics.update(scalar_metrics) + + """ + Evaluation metric updates + """ - def prepare_eval_loop_results(self): - num_dataloaders = self.trainer.evaluation_loop.num_dataloaders + @property + def _eval_log_step(self) -> Optional[int]: + if self.trainer.state.stage is RunningStage.VALIDATING: + return self._val_log_step + if self.trainer.state.stage is RunningStage.TESTING: + return self._test_log_step + return None + + def _increment_eval_log_step(self) -> None: + if self.trainer.state.stage is RunningStage.VALIDATING: + self._val_log_step += 1 + elif self.trainer.state.stage is RunningStage.TESTING: + self._test_log_step += 1 + + def on_evaluation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int, num_dataloaders: int) -> None: + model = self.trainer.lightning_module + # set dataloader_idx only if multiple ones + model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None + + # track batch_size + self.trainer._results.extract_batch_size(batch) + self._batch_idx = batch_idx + + def update_eval_step_metrics(self) -> None: + if self.trainer.sanity_checking: + return + + # logs user requested information to logger + assert not self._epoch_end_reached + self.log_metrics(self.metrics[MetricSource.LOG], step=self._eval_log_step) + + # increment the step even if nothing was logged + self._increment_eval_log_step() + + def _prepare_eval_loop_results(self, metrics: Mapping[str, _METRIC]) -> None: + if self.trainer.sanity_checking: + return + + num_dataloaders = self.trainer._evaluation_loop.num_dataloaders has_been_initialized = len(self.eval_loop_results) == num_dataloaders - for dl_idx in range(self.trainer.evaluation_loop.num_dataloaders): - self.add_to_eval_loop_results(dl_idx, has_been_initialized) + for dl_idx in range(self.trainer._evaluation_loop.num_dataloaders): + # remove callback metrics that don't belong to this dataloader + callback_metrics = { + k: v + for k, v in metrics.items() if "dataloader_idx" not in k or f"dataloader_idx_{dl_idx}" in k + } + if has_been_initialized: + self.eval_loop_results[dl_idx].update(callback_metrics) + else: + self.eval_loop_results.append(callback_metrics) + + def update_eval_epoch_metrics(self) -> _EVALUATE_OUTPUT: + assert self._epoch_end_reached + metrics = self.metrics - def get_evaluate_epoch_results(self) -> _EVALUATE_OUTPUT: if not self.trainer.sanity_checking: # log all the metrics as a single dict - metrics_to_log = self.cached_results.get_epoch_log_metrics() - if len(metrics_to_log) > 0: - self.log_metrics(metrics_to_log, {}) + self.log_metrics(metrics[MetricSource.LOG]) - self.prepare_eval_loop_results() + self._prepare_eval_loop_results(metrics[MetricSource.CALLBACK]) # log results of evaluation if ( @@ -292,110 +192,123 @@ def get_evaluate_epoch_results(self) -> _EVALUATE_OUTPUT: print('-' * 80) results = self.eval_loop_results - # clear mem self.eval_loop_results = [] return results - def on_train_epoch_end(self): - # inform cached logger connector epoch finished - self.cached_results.has_batch_loop_finished = True - - def log_train_epoch_end_metrics(self, epoch_output: List[List[List[Result]]]) -> None: - # epoch output is a list. 
Each item in that list has all the outputs per optimizer - # epoch_output[optimizer_idx][training_step_idx][tbptt_index] - # remember that not using truncated backprop is equivalent with truncated back prop of len(1) - - # log/aggregate metrics automatically - epoch_log_metrics, epoch_progress_bar_metrics = self.__auto_reduce_results_on_epoch_end(epoch_output) - - # it will perform reduction over epoch and return log metrics - cached_epoch_log_metrics = self.cached_results.get_epoch_log_metrics() - cached_epoch_pbar_metrics = self.cached_results.get_epoch_pbar_metrics() - - # update - epoch_log_metrics.update(cached_epoch_log_metrics) - epoch_progress_bar_metrics.update(cached_epoch_pbar_metrics) - - # -------------------------- - # track results - # -------------------------- - # add the metrics to the loggers and callbacks - if epoch_log_metrics and len(epoch_log_metrics) > 0: - self.log_metrics(epoch_log_metrics, {}) - self._callback_metrics.update(epoch_log_metrics) - - # add metrics to progress_bar and callbacks - if len(epoch_progress_bar_metrics) > 0: - self.add_progress_bar_metrics(epoch_progress_bar_metrics) - self._callback_metrics.update(epoch_progress_bar_metrics) - - # reset epoch loop result for next epoch - self.cached_results.reset() - - def __auto_reduce_results_on_epoch_end(self, epoch_output): - epoch_log_metrics = {} - epoch_progress_bar_metrics = {} - for opt_outputs in epoch_output: - # reduce across time first - time_reduced_outputs = [] - for tbptt_outs in opt_outputs: - tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) - if len(tbptt_outs) > 1: - time_reduced_outputs.append(tbptt_outs) - - if len(time_reduced_outputs) == 0: - continue - - # reduce across training steps - opt_outputs = time_reduced_outputs[0].__class__.reduce_on_epoch_end(time_reduced_outputs) - - # with manual opt need 1 + metrics because meta is always there - if opt_outputs.minimize is not None: - opt_outputs.minimize = opt_outputs.minimize.mean() - epoch_log_metrics.update(opt_outputs.epoch_log_metrics) - epoch_progress_bar_metrics.update(opt_outputs.epoch_pbar_metrics) - - return epoch_log_metrics, epoch_progress_bar_metrics - - def log_train_step_metrics(self, batch_output): - if self.trainer.train_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization: + """ + Train metric updates + """ + + def on_train_split_start(self, batch_idx: int, split_idx: int, split_batch: Any) -> None: + self.trainer._results.extract_batch_size(split_batch) + self._batch_idx = batch_idx + self._split_idx = split_idx + + def update_train_step_metrics(self) -> None: + if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization: return - _, batch_log_metrics = self.cached_results.update_logger_connector() + + self._log_gpus_metrics() + # when metrics should be logged - if self.should_update_logs or self.trainer.fast_dev_run is True: - # logs user requested information to logger - grad_norm_dict = batch_output.grad_norm_dict - if grad_norm_dict is None: - grad_norm_dict = {} - if len(batch_log_metrics) > 0 or len(grad_norm_dict) > 0: - self.log_metrics(batch_log_metrics, grad_norm_dict) - self._callback_metrics.update(batch_log_metrics) + assert not self._epoch_end_reached + if self.should_update_logs or self.trainer.fast_dev_run: + self.log_metrics(self.metrics[MetricSource.LOG]) + + def update_train_epoch_metrics(self) -> None: + # add the metrics to the loggers + assert self._epoch_end_reached + 
self.log_metrics(self.metrics[MetricSource.LOG]) + + # reset result collection for next epoch + self.trainer._results.reset(metrics=True) + + def _log_gpus_metrics(self): + for key, mem in self.gpus_metrics.items(): + gpu_id = int(key.split('/')[0].split(':')[1]) + if gpu_id in self.trainer.accelerator_connector.parallel_device_ids: + self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False) + + """ + Utilities and properties + """ + + def on_epoch_start(self) -> None: + self._epoch_end_reached = False + + def on_batch_start(self) -> None: + self._epoch_end_reached = False + + def epoch_end_reached(self): + self.trainer.logger_connector._epoch_end_reached = True + self.trainer.logger_connector._batch_idx = None + self.trainer.logger_connector._split_idx = None + + def on_epoch_end(self) -> None: + assert self._epoch_end_reached + metrics = self.metrics + self._progress_bar_metrics.update(metrics[MetricSource.PBAR]) + self._callback_metrics.update(metrics[MetricSource.CALLBACK]) + self._logged_metrics.update(metrics[MetricSource.LOG]) + self._current_fx = None + + def on_batch_end(self) -> None: + assert not self._epoch_end_reached + metrics = self.metrics + self._progress_bar_metrics.update(metrics[MetricSource.PBAR]) + self._callback_metrics.update(metrics[MetricSource.CALLBACK]) + self._logged_metrics.update(metrics[MetricSource.LOG]) + + def should_reset_tensors(self, fx: str) -> bool: + is_different_fx = self._current_fx != fx + if self._split_idx is None: + is_first_batch = self._batch_idx in (None, 0) + else: + is_first_batch = self._batch_idx + self._split_idx == 0 + return is_different_fx and is_first_batch + + def reset(self, metrics: Optional[bool] = None) -> None: + if self.trainer.sanity_checking: + # reset metrics + self._progress_bar_metrics = {} + self._logged_metrics = {} + self._callback_metrics = {} + self.trainer._results.reset(metrics=metrics) + self._batch_idx = None + self._split_idx = None + self._current_fx = None @property - def evaluation_log_step(self) -> Optional[int]: - if self.trainer.state.stage is RunningStage.VALIDATING: - return self._val_log_step - elif self.trainer.state.stage is RunningStage.TESTING: - return self._test_log_step - else: - return None + def metrics(self) -> Dict[MetricSource, Dict[str, _METRIC]]: + """This function returns either batch or epoch metrics depending on ``_epoch_end_reached``.""" + on_step = not self._epoch_end_reached + return self.trainer._results.metrics(on_step) - def increment_evaluation_log_step(self) -> None: - if self.trainer.state.stage is RunningStage.VALIDATING: - self._val_log_step += 1 - elif self.trainer.state.stage is RunningStage.TESTING: - self._test_log_step += 1 + @property + def gpus_metrics(self) -> Dict[str, str]: + if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory: + mem_map = memory.get_memory_profile(self.log_gpu_memory) + self._gpus_metrics.update(mem_map) + return self._gpus_metrics - def log_evaluation_step_metrics(self) -> None: - if self.trainer.sanity_checking: - return - _, batch_log_metrics = self.cached_results.update_logger_connector() + @property + def callback_metrics(self) -> Dict[str, _METRIC]: + if self.trainer._results: + metrics = self.metrics[MetricSource.CALLBACK] + self._callback_metrics.update(metrics) + return self._callback_metrics - # logs user requested information to logger - if len(batch_log_metrics) > 0: - kwargs = dict() if "step" in batch_log_metrics else dict(step=self.evaluation_log_step) - 
self.log_metrics(batch_log_metrics, {}, **kwargs) + @property + def logged_metrics(self) -> Dict[str, _METRIC]: + if self.trainer._results: + metrics = self.metrics[MetricSource.LOG] + self._logged_metrics.update(metrics) + return self._logged_metrics - # increment the step even if nothing was logged - self.increment_evaluation_log_step() + @property + def progress_bar_metrics(self) -> Dict[str, float]: + if self.trainer._results: + metrics = self.metrics[MetricSource.PBAR] + self._progress_bar_metrics.update(metrics) + return self._progress_bar_metrics diff --git a/pytorch_lightning/trainer/connectors/logger_connector/metrics_holder.py b/pytorch_lightning/trainer/connectors/logger_connector/metrics_holder.py deleted file mode 100644 index 8f12f57c640b0..0000000000000 --- a/pytorch_lightning/trainer/connectors/logger_connector/metrics_holder.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numbers -from typing import Dict, Optional - -import torch -from torchmetrics import Metric - -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.types import _METRIC - - -class MetricsHolder: - """ - This class acts as a dictionary holder. - It holds metrics and implements conversion functions. - Those functions will be triggered within LoggerConnector - when the property is being requested from the user. - """ - - def __init__(self, to_float: bool = False) -> None: - self.metrics: Dict[str, _METRIC] = {} - self._to_float = to_float - - def update(self, metrics: dict) -> None: - self.metrics.update(metrics) - - def pop(self, key: str, default: _METRIC) -> _METRIC: - return self.metrics.pop(key, default) - - def reset(self, metrics: Dict[str, _METRIC]) -> None: - self.metrics = metrics - - def convert(self, device: Optional[torch.device]) -> None: - for key, value in self.metrics.items(): - if self._to_float: - if isinstance(value, torch.Tensor) and value.numel() != 1: - raise MisconfigurationException( - f"The metric `{key}` does not contain a single element" - f" thus it cannot be converted to float. 
Found `{value}`" - ) - converted = self._convert_to_float(value) - else: - converted = self._convert_to_tensor(value, device) - self.metrics[key] = converted - - @staticmethod - def _convert_to_float(current: _METRIC) -> float: - if isinstance(current, Metric): - current = current.compute().detach() - - if isinstance(current, torch.Tensor): - current = float(current.item()) - - elif isinstance(current, int): - current = float(current) - - return current - - @staticmethod - def _convert_to_tensor(current: _METRIC, device: Optional[torch.device]) -> torch.Tensor: - if isinstance(current, Metric): - current = current.compute().detach() - - elif isinstance(current, numbers.Number): - current = torch.tensor(current, device=device, dtype=torch.float) - - if isinstance(current, torch.Tensor) and current.device.type == "xla": - current = current.cpu() - - return current diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py new file mode 100644 index 0000000000000..d97156fdb4b24 --- /dev/null +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -0,0 +1,700 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections.abc import Generator +from dataclasses import asdict, dataclass, replace +from functools import partial, wraps +from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Union + +import torch +from torchmetrics import Metric + +from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections +from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +from pytorch_lightning.utilities.distributed import distributed_available +from pytorch_lightning.utilities.enums import LightningEnum +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.metrics import metrics_to_scalars +from pytorch_lightning.utilities.warnings import WarningCache + +# re-define the ones from pytorch_lightning.utilities.types without the `Number` type +# TODO(@tchaton): Typing-pickle issue on python<3.7 (https://github.com/cloudpipe/cloudpickle/pull/318) +_METRIC = Any # Union[Metric, torch.Tensor] +_METRIC_COLLECTION = Union[_METRIC, Mapping[str, _METRIC]] + +warning_cache = WarningCache() + + +class MetricSource(LightningEnum): + CALLBACK = "callback" + PBAR = "pbar" + LOG = "log" + + +@dataclass +class _Sync: + fn: Optional[Callable] = None + should: bool = False + rank_zero_only: bool = False + op: Optional[str] = None + group: Optional[Any] = None + + def __post_init__(self) -> None: + if self.fn is None: + self.fn = self.no_op + + @property + def __call__(self) -> Any: + return ( + partial(self.fn, reduce_op=self.op, group=self.group) + if self.should and not self.rank_zero_only else self.no_op + ) + + @staticmethod + def no_op(value: Any, *_, **__) -> Any: + return value + + 
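
The `_Sync` helper above decides whether a logged value goes through the distributed reduction: `__call__` is declared as a property, so accessing it yields either a `partial` of `fn` (when `should` is set and not `rank_zero_only`) or the `no_op` identity. A minimal sketch with the defaults:

    import torch

    sync = _Sync()               # `fn` defaults to `_Sync.no_op` in `__post_init__`
    value = torch.tensor(3.0)
    assert sync(value) is value  # `should=False`, so the value passes through untouched
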
+@dataclass +class _Metadata: + fx: str + name: str + prog_bar: bool = False + logger: bool = True + on_step: bool = False + on_epoch: bool = True + _reduce_fx: Callable = torch.mean + enable_graph: bool = False + dataloader_idx: Optional[int] = None + metric_attribute: Optional[str] = None + _sync: Optional[_Sync] = None + + @property + def reduce_fx(self) -> Callable: + return self._reduce_fx + + @reduce_fx.setter + def reduce_fx(self, reduce_fx: Union[str, Callable]) -> None: + error = ( + 'Only `self.log(..., reduce_fx={min,max,mean,sum})` are currently supported.' + ' Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`.' + f' Found: {reduce_fx}' + ) + self._reduce_fx = reduce_fx + if isinstance(reduce_fx, str): + reduce_fx = reduce_fx.lower() + if reduce_fx == 'avg': + reduce_fx = 'mean' + if reduce_fx not in ('min', 'max', 'mean', 'sum'): + raise MisconfigurationException(error) + self._reduce_fx = getattr(torch, reduce_fx) + elif self.is_custom_reduction: + raise MisconfigurationException(error) + + @property + def sync(self) -> Optional[_Sync]: + return self._sync + + @sync.setter + def sync(self, sync: _Sync) -> None: + if sync.op is None: + sync.op = self.reduce_fx.__name__ + self._sync = sync + + @property + def forked(self) -> bool: + return self.on_step and self.on_epoch + + def forked_name(self, on_step: bool) -> str: + if self.forked: + return f'{self.name}_{"step" if on_step else "epoch"}' + return self.name + + @property + def is_mean_reduction(self) -> bool: + return self.reduce_fx is torch.mean + + @property + def is_sum_reduction(self) -> bool: + return self.reduce_fx in (torch.sum, sum) + + @property + def is_max_reduction(self) -> bool: + return self.reduce_fx in (torch.max, max) + + @property + def is_min_reduction(self) -> bool: + return self.reduce_fx in (torch.min, min) + + @property + def is_custom_reduction(self) -> bool: + return not (self.is_mean_reduction or self.is_max_reduction or self.is_min_reduction or self.is_sum_reduction) + + def __getstate__(self) -> dict: + # drop the `sync.fn` to avoid potential pickle errors + # need to drop `fn` first otherwise `asdict` produces a `RecursionError` + copy = replace(self, _sync=replace(self.sync, fn=None)) + d = asdict(copy) + # delete the `None` value so it does not override + del d['_sync']['fn'] + return d + + def __setstate__(self, state: dict, sync_fn: Optional[Callable] = None) -> None: + d = {**state, '_sync': _Sync(**state['_sync'], fn=sync_fn)} + self.__dict__.update(d) + + @classmethod + def _reconstruct(cls, state: dict, sync_fn: Optional[Callable] = None) -> '_Metadata': + meta = cls(state['fx'], state['name']) + meta.__setstate__(state, sync_fn=sync_fn) + return meta + + +class ResultMetric(Metric, DeviceDtypeModuleMixin): + """Wraps the value provided to `:meth:`~pytorch_lightning.core.lightning.LightningModule.log`""" + + def __init__(self, metadata: _Metadata, is_tensor: bool) -> None: + super().__init__() + self.is_tensor = is_tensor + self.meta = metadata + self.has_reset = False + if is_tensor: + self.add_state("value", torch.tensor(0, dtype=torch.float), dist_reduce_fx=torch.sum) + if self.meta.is_mean_reduction: + self.add_state("cumulated_batch_size", torch.tensor(0, dtype=torch.float), dist_reduce_fx=torch.sum) + + def update(self, value: _METRIC, batch_size: torch.Tensor) -> None: + if self.is_tensor: + value = value.float() + self._forward_cache = value + # performance: no need to accumulate on values only logged on_step + if self.meta.on_step and not 
self.meta.on_epoch: + self.value = self.meta.sync(value) + return + # perform accumulation with reduction + if self.meta.is_mean_reduction: + self.value += value.mean() * batch_size + self.cumulated_batch_size += batch_size + elif self.meta.is_max_reduction or self.meta.is_min_reduction: + self.value = self.meta.reduce_fx(self.value, value.mean()) + elif self.meta.is_sum_reduction: + self.value += value.mean() * batch_size + else: + self.value = value # noqa: attribute-defined-outside-init + self._forward_cache = value._forward_cache + + def compute(self) -> torch.Tensor: + if self.is_tensor: + value = self.meta.sync(self.value) + if self.meta.is_mean_reduction: + cumulated_batch_size = self.meta.sync(self.cumulated_batch_size) + return value / cumulated_batch_size + elif self.meta.is_max_reduction or self.meta.is_min_reduction or self.meta.is_sum_reduction: + return value + return self.value.compute() + + def reset(self) -> None: + if self.is_tensor: + super().reset() + else: + self.value.reset() + self.has_reset = True + + def forward(self, value: _METRIC, batch_size: torch.Tensor) -> None: + if self.meta.enable_graph: + with torch.no_grad(): + self.update(value, batch_size) + else: + # performance: skip the `torch.no_grad` context manager by calling `update` directly + self.update(value, batch_size) + + def _wrap_compute(self, compute: Any) -> Any: + # Override to avoid syncing - we handle it ourselves. + @wraps(compute) + def wrapped_func(*args, **kwargs): + if not self._update_called: + rank_zero_warn( + f"The ``compute`` method of metric {self.__class__.__name__}" + " was called before the ``update`` method which may lead to errors," + " as metric states have not yet been updated.", UserWarning + ) + + # return cached value + if self._computed is not None: + return self._computed + self._computed = compute(*args, **kwargs) + return self._computed + + return wrapped_func + + def __setattr__(self, key: str, value: Any) -> None: + # performance: skip the `torch.nn.Module.__setattr__` checks + object.__setattr__(self, key, value) + + def __repr__(self) -> str: + state = f"{repr(self.meta.name)}, value={self.value}" + if self.is_tensor and self.meta.is_mean_reduction: + state += f", cumulated_batch_size={self.cumulated_batch_size}" + return f"{self.__class__.__name__}({state})" + + def __getstate__(self, drop_value: bool = False) -> dict: + skip = ['update', 'compute', '_update_signature'] + if not self.is_tensor and drop_value: + # Avoid serializing ResultMetrics which are passed Metrics + skip.append('value') + with self.sync_context( + should_sync=not self.meta.sync.rank_zero_only, + process_group=self.meta.sync.group, + distributed_available=distributed_available + ): + d = {k: v for k, v in self.__dict__.items() if k not in skip} + d['meta'] = d['meta'].__getstate__() + d['_class'] = self.__class__.__name__ + return d + + def __setstate__(self, state: dict, sync_fn: Optional[Callable] = None) -> None: + d = {**state, 'meta': _Metadata._reconstruct(state['meta'], sync_fn=sync_fn)} + super().__setstate__(d) + + @classmethod + def _reconstruct(cls, state: dict, sync_fn: Optional[Callable] = None) -> 'ResultMetric': + # need to reconstruct twice because `meta` is used in `__init__` + meta = _Metadata._reconstruct(state['meta']) + result_metric = cls(meta, state['is_tensor']) + result_metric.__setstate__(state, sync_fn=sync_fn) + return result_metric + + +class ResultMetricCollection(dict): + """ + Dict wrapper for easy access to metadata. 
+ + All of the leaf items should be instances of + :class:`~pytorch_lightning.trainer.connectors.logger_connector.result.ResultMetric` + with the same metadata. + """ + + def __init__(self, *args, metadata: Optional[_Metadata] = None) -> None: + super().__init__(*args) + self.meta = metadata + + def __getstate__(self, drop_value: bool = False) -> dict: + + def getstate(item: ResultMetric) -> dict: + return item.__getstate__(drop_value=drop_value) + + items = apply_to_collection(dict(self), (ResultMetric, ResultMetricCollection), getstate) + return {"items": items, "meta": self.meta.__getstate__(), "_class": self.__class__.__name__} + + def __setstate__(self, state: dict, sync_fn: Optional[Callable] = None) -> None: + + def setstate(item: dict) -> Union[Dict[str, ResultMetric], ResultMetric, Any]: + # recurse through dictionaries to set the state. can't use `apply_to_collection` + # as it does not recurse items of the same type. + if not isinstance(item, dict): + return item + if item.get('_class') == ResultMetric.__name__: + return ResultMetric._reconstruct(item, sync_fn=sync_fn) + return {k: setstate(v) for k, v in item.items()} + + items = setstate(state["items"]) + self.update(items) + + any_result_metric = next(iter(items.values())) + self.meta = any_result_metric.meta + + @classmethod + def _reconstruct(cls, state: dict, sync_fn: Optional[Callable] = None) -> 'ResultMetricCollection': + rmc = cls() + rmc.__setstate__(state, sync_fn=sync_fn) + return rmc + + +class ResultCollection(dict): + """ + Collection (dictionary) of :class:`~pytorch_lightning.trainer.connectors.logger_connector.result.ResultMetric` or + :class:`~pytorch_lightning.trainer.connectors.logger_connector.result.ResultMetricCollection` + + Example: + + # `device` needs to be provided before logging + result = ResultCollection(training=True, torch.device("cpu")) + + # you can log to a specific collection. + # arguments: fx, key, value, metadata + result.log('training_step', 'acc', torch.tensor(...), on_step=True, on_epoch=True) + result.log('validation_step', 'recall', torch.tensor(...), on_step=True, on_epoch=True) + """ + + DATALOADER_SUFFIX = "/dataloader_idx_{}" + + def __init__(self, training: bool, device: Optional[Union[str, torch.device]] = None) -> None: + super().__init__() + self.training = training + self._minimize = None + self._batch_size = torch.tensor(1, device=device) + self.device: Optional[Union[str, torch.device]] = device + + @property + def result_metrics(self) -> List[ResultMetric]: + o = [] + + def append_fn(v: ResultMetric) -> None: + nonlocal o + o.append(v) + + apply_to_collection(list(self.values()), ResultMetric, append_fn) + return o + + @property + def batch_size(self) -> torch.Tensor: + # performance: cache the `batch_size` tensor instead of re-creating it + return self._batch_size + + @batch_size.setter + def batch_size(self, value: int) -> None: + self._batch_size = torch.tensor(value, device=self.device) + + @property + def minimize(self) -> Optional[torch.Tensor]: + """ + The :meth:`~pytorch_lightning.core.lightning.LightningModule.training_step` loss + will be saved as the ``minimize`` attribute. 
+ """ + return self._minimize + + @minimize.setter + def minimize(self, loss: Optional[torch.Tensor]) -> None: + if loss is not None: + if not isinstance(loss, torch.Tensor): + raise ValueError(f"`Result.minimize` must be a `torch.Tensor`, found: {loss}") + self._minimize = loss + + @property + def extra(self) -> Dict[str, Any]: + """ + Extras are any keys other than the loss returned by + :meth:`~pytorch_lightning.core.lightning.LightningModule.training_step` + """ + return self.get('_extra', {}) + + @extra.setter + def extra(self, extra: Dict[str, Any]) -> None: + + def check_fn(v): + if v.grad_fn is not None: + warning_cache.deprecation( + f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically" + " but this behaviour will change in v1.6. Please detach it manually:" + " `return {'loss': ..., 'something': something.detach()}`" + ) + return v.detach() + return v + + # update instead of replace to keep the extra dict reference. TODO: remove with v1.6 deprecation removal + extra.update(apply_to_collection(extra, torch.Tensor, check_fn)) + self['_extra'] = extra + + def log( + self, + fx: str, + name: str, + value: _METRIC_COLLECTION, + prog_bar: bool = False, + logger: bool = True, + on_step: bool = False, + on_epoch: bool = True, + reduce_fx: Callable = torch.mean, + enable_graph: bool = False, + sync_dist: bool = False, + sync_dist_fn: Callable = _Sync.no_op, + sync_dist_group: Optional[Any] = None, + dataloader_idx: Optional[int] = None, + batch_size: Optional[int] = None, + metric_attribute: Optional[str] = None, + rank_zero_only: bool = False, + ) -> None: + """See :meth:`~pytorch_lightning.core.lightning.LightningModule.log`""" + # no metrics should be logged with graphs + if not enable_graph and isinstance(value, torch.Tensor): + value = value.detach() + + # move metrics to cpu on TPU. + if isinstance(value, torch.Tensor) and value.device.type == "xla": + value = value.cpu() + + # storage key + key = f"{fx}.{name}" + # add dataloader_suffix to both key and fx + if dataloader_idx is not None: + key += f'.{dataloader_idx}' + fx += f'.{dataloader_idx}' + + meta = _Metadata( + fx=fx, + name=name, + prog_bar=prog_bar, + logger=logger, + on_step=on_step, + on_epoch=on_epoch, + enable_graph=enable_graph, + dataloader_idx=dataloader_idx, + metric_attribute=metric_attribute, + ) + meta.reduce_fx = reduce_fx + meta.sync = _Sync( + should=sync_dist, + fn=sync_dist_fn, + group=sync_dist_group, + rank_zero_only=rank_zero_only, + ) + + # register logged value if it doesn't exist + if key not in self: + self.register_key(key, meta, value) + + # check the stored metadata and the current one match + elif meta != self[key].meta: + raise MisconfigurationException( + f'You called `self.log({name}, ...)` twice in `{fx}` with different arguments. This is not allowed' + ) + + if batch_size is not None: + self.batch_size = batch_size + + self.update_metrics(key, value) + + def register_key(self, key: str, meta: _Metadata, value: _METRIC_COLLECTION) -> None: + """Create one ResultMetric object per value. 
Value can be provided as a nested collection""" + + def fn(v: _METRIC) -> ResultMetric: + metric = ResultMetric(meta, isinstance(v, torch.Tensor)) + return metric.to(self.device) + + value = apply_to_collection(value, (torch.Tensor, Metric), fn) + if isinstance(value, dict): + value = ResultMetricCollection(value, metadata=meta) + self[key] = value + + def update_metrics(self, key: str, value: _METRIC_COLLECTION) -> None: + + def fn(result_metric, v): + # performance: avoid calling `__call__` to avoid the checks in `torch.nn.Module._call_impl` + result_metric.forward(v.to(self.device), self.batch_size) + result_metric.has_reset = False + + apply_to_collections(self[key], value, ResultMetric, fn) + + @staticmethod + def _get_cache(result_metric: ResultMetric, on_step: bool) -> Optional[torch.Tensor]: + cache = None + if on_step and result_metric.meta.on_step: + cache = result_metric._forward_cache + elif not on_step and result_metric.meta.on_epoch: + if not result_metric._computed: + # always reduce on epoch end + should = result_metric.meta.sync.should + result_metric.meta.sync.should = True + result_metric.compute() + result_metric.meta.sync.should = should + cache = result_metric._computed + if cache is not None and not result_metric.meta.enable_graph: + return cache.detach() + return cache + + def valid_items(self) -> Generator: + """This function is used to iterate over current valid metrics.""" + return ((k, v) for k, v in self.items() + if not k == "_extra" and not (isinstance(v, ResultMetric) and v.has_reset)) + + def _forked_name(self, result_metric: ResultMetric, on_step: bool) -> Tuple[str, str]: + name = result_metric.meta.name + forked_name = result_metric.meta.forked_name(on_step) + dl_idx = result_metric.meta.dataloader_idx + if dl_idx is not None: + dataloader_suffix = self.DATALOADER_SUFFIX.format(dl_idx) + name += dataloader_suffix + forked_name += dataloader_suffix + return name, forked_name + + def metrics(self, on_step: bool) -> Dict[MetricSource, Dict[str, _METRIC]]: + metrics = {k: {} for k in MetricSource} + + for _, result_metric in self.valid_items(): + + # extract forward_cache or computed from the ResultMetric. ignore when the output is None + value = apply_to_collection(result_metric, ResultMetric, self._get_cache, on_step, include_none=False) + + # convert metric collection to dict container. + if isinstance(value, ResultMetricCollection): + value = dict(value.items()) + + # check if the collection is empty + has_tensor = False + + def any_tensor(_): + nonlocal has_tensor + has_tensor = True + + apply_to_collection(value, torch.Tensor, any_tensor) + if not has_tensor: + continue + + name, forked_name = self._forked_name(result_metric, on_step) + + # populate logging metrics + if result_metric.meta.logger: + metrics[MetricSource.LOG][forked_name] = value + + # populate callback metrics. callback metrics don't take `_step` forked metrics + if self.training or result_metric.meta.on_epoch and not on_step: + metrics[MetricSource.CALLBACK][name] = value + metrics[MetricSource.CALLBACK][forked_name] = value + + # populate progress_bar metrics. convert tensors to numbers + if result_metric.meta.prog_bar: + metrics[MetricSource.PBAR][forked_name] = metrics_to_scalars(value) + + return metrics + + def reset(self, metrics: Optional[bool] = None, fx: Optional[str] = None) -> None: + """ + Reset the result collection + + Args: + metrics: If True, only ``torchmetrics.Metric`` results are reset, + if False, only ``torch.Tensors`` are reset, + if ``None``, both are. 
+ fx: Function to reset + """ + + def fn(item: ResultMetric) -> None: + requested_type = metrics is None or metrics ^ item.is_tensor + same_fx = fx is None or fx == item.meta.fx + if requested_type and same_fx: + item.reset() + + apply_to_collection(self, ResultMetric, fn) + + def extract_batch_size(self, batch: Any) -> None: + try: + self.batch_size = self._extract_batch_size(batch) + except RecursionError: + self.batch_size = 1 + + def _extract_batch_size(self, batch: Any) -> int: + """ + Recursively unpack a batch to find a torch.Tensor. + + Returns: + ``len(tensor)`` when found, or ``1`` when it hits an empty or non iterable. + """ + if isinstance(batch, torch.Tensor): + size = batch.size(0) + elif isinstance(batch, str): + return len(batch) + elif isinstance(batch, dict): + sample = next(iter(batch.values()), 1) + size = self._extract_batch_size(sample) + elif isinstance(batch, Iterable): + sample = next(iter(batch), 1) + size = self._extract_batch_size(sample) + else: + size = 1 + return size + + def to(self, *args, **kwargs) -> 'ResultCollection': + """Move all data to the given device.""" + + def to_(item: Union[torch.Tensor, Metric], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Metric]: + return item.to(*args, **kwargs) + + apply_to_collection(self, (torch.Tensor, Metric), to_, *args, **kwargs) + + if self.minimize is not None: + self.minimize = self.minimize.to(*args, **kwargs) + self._batch_size = self._batch_size.to(*args, **kwargs) + if 'device' in kwargs: + self.device = kwargs['device'] + return self + + def cpu(self) -> 'ResultCollection': + """Move all data to CPU.""" + return self.to(device="cpu") + + def __str__(self) -> str: + return f'{self.__class__.__name__}({self.training}, {self.device}, {repr(self)})' + + def __getstate__(self, drop_value: bool = True) -> dict: + d = self.__dict__.copy() + + # can't deepcopy tensors with grad_fn + minimize = d['_minimize'] + if minimize is not None: + d['_minimize'] = minimize.detach() + + extra = self.get('_extra') + if extra is not None: + d['_extra'] = extra + + # all the items should be either `ResultMetric`s or `ResultMetricCollection`s + items = {k: v.__getstate__(drop_value=drop_value) for k, v in self.items() if k != '_extra'} + return {**d, 'items': items} + + def __setstate__( + self, + state: dict, + map_location: Optional[Union[str, torch.device]] = None, + sync_fn: Optional[Callable] = None, + ) -> None: + self.__dict__.update({k: v for k, v in state.items() if k != 'items'}) + + def setstate(k: str, item: dict) -> Union[ResultMetric, ResultMetricCollection]: + if not isinstance(item, dict): + raise ValueError(f'Unexpected value: {item}') + cls = item['_class'] + if cls == ResultMetric.__name__: + cls = ResultMetric + elif cls == ResultMetricCollection.__name__: + cls = ResultMetricCollection + else: + raise ValueError(f"Unexpected class name: {cls}") + _sync_fn = sync_fn or (self[k].meta.sync.fn if k in self else None) + return cls._reconstruct(item, sync_fn=_sync_fn) + + items = {k: setstate(k, v) for k, v in state['items'].items()} + self.update(items) + + device = map_location or self.device + self.to(device) + + def state_dict(self, drop_value: bool = True) -> dict: + return self.__getstate__(drop_value) + + def load_state_dict( + self, + state_dict: dict, + map_location: Optional[Union[str, torch.device]] = None, + sync_fn: Optional[Callable] = None, + metrics: Optional[Dict[str, Metric]] = None, + ) -> None: + self.__setstate__(state_dict, map_location=map_location, sync_fn=sync_fn) + + if not metrics: + 
return + result_metrics = self.result_metrics + for metric_attribute, metric in metrics.items(): + for result_metric in result_metrics: + if result_metric.meta.metric_attribute == metric_attribute: + result_metric.value = metric diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index 2797504288bd3..083d35d4cbb94 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -29,11 +29,17 @@ def on_trainer_init(self) -> None: self.trainer.optimizers = [] self.trainer.optimizer_frequencies = [] - def update_learning_rates(self, interval: str, opt_indices: Optional[List[int]] = None) -> None: + def update_learning_rates( + self, interval: str, update_plateau_schedulers: bool, opt_indices: Optional[List[int]] = None + ) -> None: """Update learning rates. Args: interval: either 'epoch' or 'step'. + update_plateau_schedulers: control whether ``ReduceLROnPlateau`` or non-plateau schedulers get updated. + This is used so non-plateau schedulers can be updated before running validation. Checkpoints are + commonly saved during validation, however, on-plateau schedulers might monitor a validation metric + so they have to be updated separately. opt_indices: indices of the optimizers to update. """ if not self.trainer.lr_schedulers or not self.trainer.lightning_module.automatic_optimization: @@ -46,7 +52,10 @@ def update_learning_rates(self, interval: str, opt_indices: Optional[List[int]] if isinstance(lr_scheduler['opt_idx'], int) and lr_scheduler['opt_idx'] not in opt_indices: continue - current_idx = self.trainer.train_loop.batch_idx if interval == 'step' else self.trainer.current_epoch + if update_plateau_schedulers ^ lr_scheduler["reduce_on_plateau"]: + continue + + current_idx = self.trainer.fit_loop.batch_idx if interval == 'step' else self.trainer.current_epoch current_idx += 1 # account for both batch and epoch starts from 0 # Take step if call to update_learning_rates matches the interval key and # the current step modulo the schedulers frequency is zero @@ -83,7 +92,7 @@ def update_learning_rates(self, interval: str, opt_indices: Optional[List[int]] if self.trainer.dev_debugger.enabled: self.trainer.dev_debugger.track_lr_schedulers_update( - self.trainer.train_loop.batch_idx, + self.trainer.fit_loop.batch_idx, interval, scheduler_idx, old_lr, diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index f27288d2b13f4..4d93fa5977d13 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -14,8 +14,7 @@ from typing import Dict, List, Optional, Union from pytorch_lightning.callbacks import GradientAccumulationScheduler -from pytorch_lightning.utilities import GradClipAlgorithmType -from pytorch_lightning.utilities.distributed import rank_zero_deprecation +from pytorch_lightning.utilities import GradClipAlgorithmType, rank_zero_deprecation from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 53c9b07dffbaf..ce6caa4e2f330 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -22,8 +22,8 @@ from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler 
from torch.utils.data.distributed import DistributedSampler +import pytorch_lightning as pl from pytorch_lightning.accelerators import Accelerator -from pytorch_lightning.core import LightningModule from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector from pytorch_lightning.trainer.states import RunningStage @@ -51,6 +51,7 @@ class TrainerDataLoadingMixin(ABC): test_dataloaders: Optional[List[DataLoader]] num_test_batches: List[Union[int, float]] limit_train_batches: Union[int, float] + log_every_n_steps: int overfit_batches: Union[int, float] distributed_sampler_kwargs: dict accelerator: Accelerator @@ -225,7 +226,7 @@ def _get_distributed_sampler( sampler = cls(dataloader.dataset, **kwargs) return sampler - def reset_train_dataloader(self, model: LightningModule) -> None: + def reset_train_dataloader(self, model: 'pl.LightningModule') -> None: """Resets the train dataloader and initialises required variables (number of batches, when to validate, etc.). @@ -261,6 +262,9 @@ def reset_train_dataloader(self, model: LightningModule) -> None: # wrap the sequence of train loaders to a CombinedLoader object for computing the num_training_batches self.train_dataloader = CombinedLoader(self.train_dataloader, self.data_connector.multiple_trainloader_mode) + # allow accelerator to modify dataloader + self.train_dataloader = self.accelerator.on_reset_train_dataloader(self.train_dataloader) + self.num_training_batches = len(self.train_dataloader) if has_len(self.train_dataloader) else float('inf') if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: @@ -299,9 +303,16 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.val_check_batch = int(self.num_training_batches * self.val_check_interval) self.val_check_batch = max(1, self.val_check_batch) + if self.logger and self.num_training_batches < self.log_every_n_steps: + rank_zero_warn( + f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval" + f" Trainer(log_every_n_steps={self.log_every_n_steps}). Set a lower value for log_every_n_steps if" + f" you want to see logs for the training epoch." + ) + def _reset_eval_dataloader( self, - model: LightningModule, + model: 'pl.LightningModule', mode: str, ) -> Tuple[List[Union[int, float]], List[DataLoader]]: """Generic method to reset a dataloader for evaluation. @@ -361,6 +372,10 @@ def _reset_eval_dataloader( # add worker_init_fn for correct seeding in worker processes apply_to_collection(dataloaders, dtype=DataLoader, function=self.auto_add_worker_init_fn) + # allow accelerator to modify dataloader + hook_name = f"on_reset_{mode}_dataloader" + dataloaders = getattr(self.accelerator, hook_name)(dataloaders) + loader_num_batches = [] # determine number of batches @@ -397,7 +412,7 @@ def _reset_eval_dataloader( return loader_num_batches, dataloaders - def reset_val_dataloader(self, model: LightningModule) -> None: + def reset_val_dataloader(self, model: 'pl.LightningModule') -> None: """Resets the validation dataloader and determines the number of batches. 
Args: @@ -429,7 +444,20 @@ def reset_predict_dataloader(self, model) -> None: if has_loader: self.num_predict_batches, self.predict_dataloaders = self._reset_eval_dataloader(model, 'predict') - def request_dataloader(self, model: LightningModule, stage: str) -> DataLoader: + def reset_train_val_dataloaders(self, model) -> None: + """ + Resets train and val dataloaders if none are attached to the trainer. + + The val dataloader must be initialized before training loop starts, as the training loop + inspects the val dataloader to determine whether to run the evaluation loop. + """ + if self.train_dataloader is None: + self.reset_train_dataloader(model) + + if self.val_dataloaders is None: + self.reset_val_dataloader(model) + + def request_dataloader(self, model: 'pl.LightningModule', stage: str) -> DataLoader: """Handles downloading data in the GPU or TPU case. Args: diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 7e7817d277dae..a650c6bfe73e8 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -11,13 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +from pytorch_lightning.loops import FitLoop from pytorch_lightning.utilities import rank_zero_deprecation class DeprecatedTrainerAttributes: sanity_checking: bool + fit_loop: FitLoop @property def running_sanity_check(self) -> bool: @@ -25,3 +26,10 @@ def running_sanity_check(self) -> bool: "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5." ) return self.sanity_checking + + @property + def train_loop(self) -> FitLoop: + rank_zero_deprecation( + "`Trainer.train_loop` has been renamed to `Trainer.fit_loop` and will be removed in v1.6." + ) + return self.fit_loop diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py deleted file mode 100644 index 810efef3fa52b..0000000000000 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
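# [Editor's note] A hypothetical usage sketch for the `deprecated_api.py` shim above; it is
# not part of this patch. Until v1.6 both names resolve to the same `FitLoop` instance, with
# the old name warning on access:
from pytorch_lightning import Trainer

trainer = Trainer()
assert trainer.train_loop is trainer.fit_loop  # `train_loop` emits a rank-zero deprecation warning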
-from collections import OrderedDict -from typing import Any, Dict, List, Optional, Tuple, Union - -from torch.utils.data import DataLoader - -import pytorch_lightning as pl -from pytorch_lightning.core.step_result import Result -from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.trainer.supporters import PredictionCollection -from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT -from pytorch_lightning.utilities.warnings import WarningCache - - -class EvaluationLoop(object): - - def __init__(self, trainer: 'pl.Trainer'): - self.trainer: 'pl.Trainer' = trainer - self.outputs: EPOCH_OUTPUT = [] - self.predictions: Optional[PredictionCollection] = None - self.max_batches: Optional[List[Union[int, float]]] = None - self.warning_cache = WarningCache() - self.num_dataloaders: Optional[int] = None - - def on_trainer_init(self) -> None: - self.trainer.num_sanity_val_batches = [] - self.trainer.num_test_batches = [] - self.trainer.num_val_batches = [] - self.trainer.test_dataloaders = None - self.trainer.val_dataloaders = None - - # .validate() and .test() set this when they load a checkpoint - self.trainer.validated_ckpt_path = None - self.trainer.tested_ckpt_path = None - - # when true, print evaluation results in .validate() and .test() - self.trainer.verbose_evaluate = True - - def get_evaluation_dataloaders(self) -> Tuple[Optional[List[DataLoader]], List[Union[int, float]]]: - model = self.trainer.lightning_module - - # select dataloaders - if self.trainer.testing: - self.trainer.reset_test_dataloader(model) - - dataloaders = self.trainer.test_dataloaders - max_batches = self.trainer.num_test_batches - else: - # val - if self.trainer.val_dataloaders is None or self.trainer.reload_dataloaders_every_epoch: - self.trainer.reset_val_dataloader(model) - if self.trainer.sanity_checking: - self.trainer.num_sanity_val_batches = [ - min(self.trainer.num_sanity_val_steps, val_batches) for val_batches in self.trainer.num_val_batches - ] - max_batches = self.trainer.num_sanity_val_batches - else: - max_batches = self.trainer.num_val_batches - dataloaders = self.trainer.val_dataloaders - return dataloaders, max_batches - - def on_evaluation_start(self, *args: Any, **kwargs: Any) -> None: - self.should_track_batch_outputs_for_epoch_end: bool = self._should_track_batch_outputs_for_epoch_end() - if self.trainer.testing: - self.trainer.call_hook('on_test_start', *args, **kwargs) - else: - self.trainer.call_hook('on_validation_start', *args, **kwargs) - - def on_evaluation_model_eval(self) -> None: - model_ref = self.trainer.lightning_module - if self.trainer.testing: - model_ref.on_test_model_eval() - else: - model_ref.on_validation_model_eval() - - def on_evaluation_model_train(self) -> None: - model_ref = self.trainer.lightning_module - if self.trainer.testing: - model_ref.on_test_model_train() - else: - model_ref.on_validation_model_train() - - def on_evaluation_end(self, *args: Any, **kwargs: Any) -> None: - if self.trainer.testing: - self.trainer.call_hook('on_test_end', *args, **kwargs) - else: - self.trainer.call_hook('on_validation_end', *args, **kwargs) - - if self.trainer.state.fn != TrainerFn.FITTING: - # summarize profile results - self.trainer.profiler.describe() - - def reload_evaluation_dataloaders(self) -> None: - model = self.trainer.lightning_module - if self.trainer.testing: - self.trainer.reset_test_dataloader(model) - else: - self.trainer.reset_val_dataloader(model) - - def 
setup(self, max_batches: List[Union[int, float]], dataloaders: List[DataLoader]) -> None: - # bookkeeping - self.outputs = [] - self.predictions = PredictionCollection(self.trainer.global_rank, self.trainer.world_size) - - # convert max_batches to list - if isinstance(max_batches, int): - max_batches = [max_batches] * len(dataloaders) - - self.max_batches = max_batches - self.num_dataloaders = self._get_num_dataloaders(dataloaders) - - def on_evaluation_epoch_start(self, *args: Any, **kwargs: Any) -> None: - self.trainer.call_hook('on_epoch_start', *args, **kwargs) - - if self.trainer.testing: - self.trainer.call_hook('on_test_epoch_start', *args, **kwargs) - else: - self.trainer.call_hook('on_validation_epoch_start', *args, **kwargs) - - def _build_kwargs(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Dict[str, Union[Any, int]]: - # make dataloader_idx arg in validation_step optional - step_kwargs = OrderedDict([('batch', batch), ('batch_idx', batch_idx)]) - - multiple_val_loaders = ( - not self.trainer.testing and self._get_num_dataloaders(self.trainer.val_dataloaders) > 1 - ) - multiple_test_loaders = (self.trainer.testing and self._get_num_dataloaders(self.trainer.test_dataloaders) > 1) - - if multiple_test_loaders or multiple_val_loaders: - step_kwargs['dataloader_idx'] = dataloader_idx - - return step_kwargs - - def _get_num_dataloaders(self, dataloaders: Optional[List[DataLoader]]) -> int: - # case where user does: - # return dl1, dl2 - if dataloaders is not None: - length = len(dataloaders) - if len(dataloaders) > 0 and isinstance(dataloaders[0], (list, tuple)): - length = len(dataloaders[0]) - return length - else: - return 0 - - def evaluation_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Optional[STEP_OUTPUT]: - # configure step_kwargs - step_kwargs = self._build_kwargs(batch, batch_idx, dataloader_idx) - - model_ref = self.trainer.lightning_module - model_ref._results = Result() - - if self.trainer.testing: - model_ref._current_fx_name = "test_step" - with self.trainer.profiler.profile("test_step"): - output = self.trainer.accelerator.test_step(step_kwargs) - else: - model_ref._current_fx_name = "validation_step" - with self.trainer.profiler.profile("validation_step"): - output = self.trainer.accelerator.validation_step(step_kwargs) - - # capture any logged information - self.trainer.logger_connector.cache_logged_metrics() - # track batch size for weighted average - if isinstance(output, Result): - output.track_batch_size(batch) - - return output - - def evaluation_step_end(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: - if self.trainer.testing: - output = self.trainer.call_hook('test_step_end', *args, **kwargs) - else: - output = self.trainer.call_hook('validation_step_end', *args, **kwargs) - return output - - def _should_track_batch_outputs_for_epoch_end(self) -> bool: - model = self.trainer.lightning_module - if self.trainer.testing: - return is_overridden('test_epoch_end', model=model) - else: - return is_overridden('validation_epoch_end', model=model) - - def evaluation_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: - # unset dataloder_idx in model - self.trainer.logger_connector.evaluation_epoch_end() - - # call the model epoch end - model = self.trainer.lightning_module - - if self.trainer.testing: - if is_overridden('test_epoch_end', model=model): - model._current_fx_name = 'test_epoch_end' - model.test_epoch_end(outputs) - - else: - if is_overridden('validation_epoch_end', model=model): - model._current_fx_name = 
'validation_epoch_end' - model.validation_epoch_end(outputs) - - # capture logging - self.trainer.logger_connector.cache_logged_metrics() - - def on_evaluation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: - # set dataloader_idx to model and track batch_size - self.trainer.logger_connector.on_evaluation_batch_start(batch, dataloader_idx, self.num_dataloaders) - - if self.trainer.testing: - self.trainer.call_hook('on_test_batch_start', batch, batch_idx, dataloader_idx) - else: - self.trainer.call_hook('on_validation_batch_start', batch, batch_idx, dataloader_idx) - - def on_evaluation_batch_end( - self, - output: Optional[STEP_OUTPUT], - batch: Any, - batch_idx: int, - dataloader_idx: int, - ) -> None: - if self.trainer.testing: - self.trainer.call_hook('on_test_batch_end', output, batch, batch_idx, dataloader_idx) - else: - self.trainer.call_hook('on_validation_batch_end', output, batch, batch_idx, dataloader_idx) - - # store predicitons if do_write_predictions and track eval loss history - self.store_predictions(output, batch_idx, dataloader_idx) - - def store_predictions(self, output: Optional[STEP_OUTPUT], batch_idx: int, dataloader_idx: int) -> None: - # Add step predictions to prediction collection to write later - if output is not None and self.predictions is not None: - if isinstance(output, Result) and self.trainer.testing: - self.predictions.add(output.pop('predictions', None)) - - # track debug metrics - self.trainer.dev_debugger.track_eval_loss_history(batch_idx, dataloader_idx, output) - - def on_evaluation_epoch_end(self) -> None: - hook_name = "on_test_epoch_end" if self.trainer.testing else "on_validation_epoch_end" - self.trainer.call_hook(hook_name) - self.trainer.call_hook('on_epoch_end') diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index 0a59b9d8d4c36..74603782f3293 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -14,7 +14,7 @@ from abc import ABC -from pytorch_lightning.utilities.distributed import rank_zero_deprecation +from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.metrics import metrics_to_scalars as new_metrics_to_scalars diff --git a/pytorch_lightning/trainer/model_hooks.py b/pytorch_lightning/trainer/model_hooks.py index 86cb1334a7067..2336379fc3d49 100644 --- a/pytorch_lightning/trainer/model_hooks.py +++ b/pytorch_lightning/trainer/model_hooks.py @@ -15,8 +15,8 @@ from abc import ABC from typing import Optional -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities.distributed import rank_zero_deprecation +import pytorch_lightning as pl +from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature @@ -27,9 +27,9 @@ class TrainerModelHooksMixin(ABC): Use the utilities from ``pytorch_lightning.utilities.signature_utils`` instead. """ - lightning_module: LightningModule + lightning_module: 'pl.LightningModule' - def is_function_implemented(self, f_name: str, model: Optional[LightningModule] = None) -> bool: + def is_function_implemented(self, f_name: str, model: Optional['pl.LightningModule'] = None) -> bool: rank_zero_deprecation( "Internal: TrainerModelHooksMixin.is_function_implemented is deprecated in v1.4" " and will be removed in v1.6." 
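Editor's note: the recurring `LightningModule` -> `'pl.LightningModule'` annotation changes
throughout this patch follow the standard pattern for breaking an import cycle: import the
package as a whole and defer name lookup to a string annotation. A minimal sketch of the
pattern (the `describe` helper below is hypothetical, for illustration only):

    import pytorch_lightning as pl  # bind the package, not individual names, at import time


    def describe(model: 'pl.LightningModule') -> str:
        # the string annotation is never evaluated at import time, so no cycle forms
        return type(model).__name__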
diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index b5afe7bf75168..80ec5857de287 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -19,7 +19,7 @@ from torch import optim from torch.optim.optimizer import Optimizer -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -29,7 +29,7 @@ class TrainerOptimizersMixin(ABC): _lightning_optimizers: Optional[List[LightningOptimizer]] - def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]: + def init_optimizers(self, model: 'pl.LightningModule') -> Tuple[List, List, List]: self._lightning_optimizers = None optim_conf = model.configure_optimizers() if optim_conf is None: diff --git a/pytorch_lightning/trainer/predict_loop.py b/pytorch_lightning/trainer/predict_loop.py deleted file mode 100644 index c06ced6662d81..0000000000000 --- a/pytorch_lightning/trainer/predict_loop.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from collections import OrderedDict -from typing import Any, List, Optional - -import torch -from torch.utils.data.dataloader import DataLoader - -from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper -from pytorch_lightning.plugins import DDPSpawnPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.types import _PREDICT_OUTPUT -from pytorch_lightning.utilities.warnings import WarningCache - - -class PredictLoop(object): - - def __init__(self, trainer): - self.trainer = trainer - self.max_batches = None - self.num_dataloaders = None - self.warning_cache = WarningCache() - self.batch_indices: Optional[List[int]] = None - self.epoch_batch_indices: Optional[List[List[int]]] = None - self.predictions: Optional[List[List[Any]]] = None - # `DDPSpawnPlugin` plugins and derivate don't support return predictions. - self._return_predictions: Optional[bool] = None - self._previous_grad_status: Optional[bool] = None - - @property - def return_predictions(self) -> bool: - return self._return_predictions - - @return_predictions.setter - def return_predictions(self, return_predictions: Optional[bool] = None) -> None: - # ``DDPSpawnPlugin`` plugins and derivate don't support return predictions. - is_ddp_spawn = isinstance(self.trainer.training_type_plugin, DDPSpawnPlugin) - if return_predictions and is_ddp_spawn: - raise MisconfigurationException( - "`return_predictions` should be set to `False` when using the `DDPSpawnPlugin` or children class. " - f"Found {return_predictions} with training_type_plugin {type(self.trainer.training_type_plugin)}." 
- ) - # For non ``DDPSpawnPlugin`` plugin, the `return_predictions` is True by default unless user decide otherwise. - self._return_predictions = not is_ddp_spawn if return_predictions is None else return_predictions - - @property - def should_store_predictions(self) -> bool: - any_pred = any(cb.interval.on_epoch for cb in self.trainer.prediction_writer_callbacks) - return self.return_predictions or any_pred - - def on_trainer_init(self): - self.trainer.num_predict_batches = [] - self.trainer.predicted_ckpt_path = None - - def get_predict_dataloaders(self): - self.trainer.reset_predict_dataloader(self.trainer.lightning_module) - - dataloaders = self.trainer.predict_dataloaders - max_batches = self.trainer.num_predict_batches - - return dataloaders, max_batches - - def should_skip_predict(self, max_batches): - return sum(max_batches) == 0 - - def on_predict_model_eval(self): - model_ref = self.trainer.lightning_module - model_ref.on_predict_model_eval() - - def setup(self, max_batches, dataloaders): - # convert max_batches to list - if isinstance(max_batches, int): - max_batches = [max_batches] * len(dataloaders) - - self.max_batches = max_batches - self.num_dataloaders = self._get_num_dataloaders(dataloaders) - self.predictions = [[] for _ in range(self.num_dataloaders)] - self.epoch_batch_indices = [[] for _ in range(self.num_dataloaders)] - - def _get_num_dataloaders(self, dataloaders: List[DataLoader]) -> int: - # case where user does: - # return dl1, dl2 - length = len(dataloaders) - if len(dataloaders) > 0 and isinstance(dataloaders[0], (list, tuple)): - length = len(dataloaders[0]) - return length - - def _build_kwargs(self, batch, batch_idx, dataloader_idx): - step_kwargs = OrderedDict([('batch', batch), ('batch_idx', batch_idx)]) - if self.num_dataloaders: - step_kwargs['dataloader_idx'] = dataloader_idx - return step_kwargs - - def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: - # configure step_kwargs - step_kwargs = self._build_kwargs(batch, batch_idx, dataloader_idx) - - # extract batch_indices and store them - self._store_batch_indices(dataloader_idx) - - model_ref = self.trainer.lightning_module - - self.trainer.call_hook("on_predict_batch_start", batch, batch_idx, dataloader_idx) - - model_ref._current_fx_name = "predict_step" - predictions = self.trainer.accelerator.predict_step(step_kwargs) - - if predictions is None: - self.warning_cache.warn("predict returned None if it was on purpose, ignore this warning...") - - self.trainer.call_hook("on_predict_batch_end", predictions, batch, batch_idx, dataloader_idx) - - if self.should_store_predictions: - self.predictions[dataloader_idx].append(predictions) - - def _store_batch_indices(self, dataloader_idx: int) -> None: - batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler - if isinstance(batch_sampler, IndexBatchSamplerWrapper): - self.batch_indices = batch_sampler.batch_indices - if self.should_store_predictions: - self.epoch_batch_indices[dataloader_idx].append(batch_sampler.batch_indices) - - def on_predict_start(self) -> None: - # enable eval mode + no grads - self.on_predict_model_eval() - self.trainer.lightning_module.zero_grad() - self._previous_grad_status = torch.is_grad_enabled() - torch.set_grad_enabled(False) - - # hook - self.trainer.call_hook("on_predict_start") - self.trainer.call_hook("on_predict_epoch_start") - - def on_predict_epoch_end(self) -> Optional[_PREDICT_OUTPUT]: - self.trainer.profiler.describe() - - results = self.predictions - - 
self.trainer.call_hook("on_predict_epoch_end", results) - - if self.return_predictions: - return results[0] if self.num_dataloaders == 1 else results - - def on_predict_end(self): - # clear memory. the predictions are extracted in `on_predict_epoch_end`. - self.predictions = None - self.batch_indices = None - - # reset grad to its previous status. - torch.set_grad_enabled(self._previous_grad_status) - - # hook - self.trainer.call_hook("on_predict_end") diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index fce4b431b347c..2d7a1d7e8f53a 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -11,12 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass, field +from dataclasses import asdict, dataclass, field from typing import Optional @dataclass -class Tracker: +class _DataclassStateDictMixin: + + def __getstate__(self) -> dict: + return asdict(self) + + def __setstate__(self, state: dict) -> None: + self.__dict__.update(state) + + def state_dict(self) -> dict: + return self.__getstate__() + + @classmethod + def from_state_dict(cls, state_dict: dict) -> "_DataclassStateDictMixin": + obj = cls() + obj.__setstate__(state_dict) + return obj + + +@dataclass +class Tracker(_DataclassStateDictMixin): """ Track an event's progress. @@ -28,6 +47,7 @@ class Tracker: Attributes set to ``None`` are treated as unused and are restricted. """ + ready: Optional[int] = 0 started: Optional[int] = 0 processed: Optional[int] = 0 @@ -55,7 +75,7 @@ def __repr__(self): @dataclass -class Progress: +class Progress(_DataclassStateDictMixin): """ Track aggregated and current progress. @@ -63,6 +83,7 @@ class Progress: total: Intended to track the total progress of an event current: Intended to track the current progress of an event """ + total: Tracker = field(default_factory=Tracker) current: Tracker = field(default_factory=Tracker) @@ -91,35 +112,70 @@ def increment_completed(self) -> None: self.current.completed += 1 @classmethod - def from_defaults(cls, **kwargs: Optional[int]) -> 'Progress': + def from_defaults(cls, **kwargs: Optional[int]) -> "Progress": return cls(total=Tracker(**kwargs), current=Tracker(**kwargs)) + def __setstate__(self, state: dict) -> None: + self.total.__setstate__(state["total"]) + self.current.__setstate__(state["current"]) -@dataclass -class LoopProgress: + +class BatchProgress(Progress): + """ + Tracks the batch progress + + Args: + total: Tracks the total epoch progress + current: Tracks the current epoch progress """ - Track loop progress during execution. + +@dataclass +class EpochProgress(Progress): + """ + Tracks the epoch progress These counters are local to a trainer rank. By default, they are not globally synced across all ranks. Args: - epoch: Tracks epochs progress. + total: Tracks the total epoch progress + current: Tracks the current epoch progress batch: Tracks batch progress. 
""" - epoch: Progress = field(default_factory=Progress) - batch: Progress = field(default_factory=Progress) - def increment_epoch_completed(self) -> None: - self.epoch.increment_completed() - self.reset_on_epoch() + batch: BatchProgress = field(default_factory=BatchProgress) def reset_on_epoch(self) -> None: self.batch.current.reset() - self.epoch.current.reset() + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + self.batch.__setstate__(state["batch"]) @dataclass -class OptimizationProgress: +class OptimizerProgress(_DataclassStateDictMixin): + """ + Track optimizer progress. + + Args: + step: Tracks ``optimizer.step`` calls. + zero_grad: Tracks ``optimizer.zero_grad`` calls. + """ + + step: Progress = field(default_factory=lambda: Progress.from_defaults(processed=None)) + zero_grad: Progress = field(default_factory=lambda: Progress.from_defaults(processed=None)) + + def reset_on_epoch(self) -> None: + self.step.current.reset() + self.zero_grad.current.reset() + + def __setstate__(self, state: dict) -> None: + self.step.__setstate__(state["step"]) + self.zero_grad.__setstate__(state["zero_grad"]) + + +@dataclass +class OptimizationProgress(_DataclassStateDictMixin): """ Track optimization progress. @@ -127,54 +183,86 @@ class OptimizationProgress: optimizer: Tracks optimizer progress. scheduler: Tracks scheduler progress. """ - optimizer: Progress = Progress.from_defaults(processed=None) - scheduler: Progress = Progress.from_defaults(started=None, processed=None) - zero_grad: Progress = Progress.from_defaults(processed=None) + + # TODO: support for multiple optimizers + optimizer: OptimizerProgress = field(default_factory=OptimizerProgress) + scheduler: Progress = field(default_factory=lambda: Progress.from_defaults(started=None, processed=None)) @property def optimizer_steps(self) -> int: - return self.optimizer.total.completed + return self.optimizer.step.total.completed @property def scheduler_steps(self) -> int: return self.scheduler.total.completed + def reset_on_epoch(self) -> None: + self.optimizer.reset_on_epoch() + self.scheduler.current.reset() + + def __setstate__(self, state: dict) -> None: + self.optimizer.__setstate__(state["optimizer"]) + self.scheduler.__setstate__(state["scheduler"]) + @dataclass -class TrainingProgress(Progress): +class EpochLoopProgress(_DataclassStateDictMixin): """ - Extends ``Progress`` with training specific attributes + Tracks epoch loop progress. + These counters are local to a trainer rank. By default, they are not globally synced across all ranks. Args: - optimization: Tracks optimization progress + epoch: Tracks epochs progress. 
""" - optimization: OptimizationProgress = field(default_factory=OptimizationProgress) + epoch: EpochProgress = field(default_factory=EpochProgress) -@dataclass -class TrainingLoopProgress(LoopProgress): - epoch: TrainingProgress = field(default_factory=TrainingProgress) + def increment_epoch_completed(self) -> None: + self.epoch.increment_completed() + self.reset_on_epoch() def reset_on_epoch(self) -> None: - # override to avoid resetting `epoch.current` - self.batch.current.reset() + self.epoch.reset_on_epoch() + self.epoch.current.reset() + + def __setstate__(self, state: dict) -> None: + self.epoch.__setstate__(state["epoch"]) @dataclass -class FitLoopProgress: - train: TrainingLoopProgress = field(default_factory=TrainingLoopProgress) - val: LoopProgress = field(default_factory=LoopProgress) +class TrainingEpochProgress(EpochProgress): + """ + Extends ``EpochProgress`` with training specific attributes + + Args: + total: Tracks the total epoch progress. + current: Tracks the current epoch progress. + batch: Tracks batch progress. + optim: Tracks optimization progress. + val: Tracks val_loop progress. + """ + + optim: OptimizationProgress = field(default_factory=OptimizationProgress) + val: EpochLoopProgress = field(default_factory=EpochLoopProgress) + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + self.optim.__setstate__(state["optim"]) + self.val.__setstate__(state["val"]) @dataclass -class LoopState: +class FitLoopProgress(EpochLoopProgress): """ - Basic dataclass to track loop progress across trainer functions during trainer execution. + Extends ``EpochLoopProgress`` with fit specific attributes - This class will be removed and these attributes will live in each loop. + Args: + epoch: Tracks epochs progress. """ - fit: FitLoopProgress = field(default_factory=FitLoopProgress) - val: LoopProgress = field(default_factory=LoopProgress) - test: LoopProgress = field(default_factory=LoopProgress) - predict: LoopProgress = field(default_factory=LoopProgress) + epoch: TrainingEpochProgress = field(default_factory=TrainingEpochProgress) + + def reset_on_epoch(self) -> None: + # do not reset `epoch.current` as it should track the number of epochs this `fit` call + self.epoch.reset_on_epoch() + self.epoch.optim.reset_on_epoch() diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 440a6693aba43..edf30d7f3f79f 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -15,25 +15,29 @@ import os from abc import ABC from argparse import ArgumentParser, Namespace +from pathlib import Path from typing import cast, List, Optional, Type, TypeVar, Union import torch from torch.optim import Optimizer +import pytorch_lightning as pl from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter -from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.loggers import LightningLoggerBase from pytorch_lightning.loggers.tensorboard import TensorBoardLogger +from pytorch_lightning.loops import PredictionLoop +from pytorch_lightning.loops.dataloader.evaluation_loop import EvaluationLoop +from pytorch_lightning.loops.fit_loop import FitLoop from pytorch_lightning.plugins import 
ParallelPlugin, PrecisionPlugin, TrainingTypePlugin from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector -from pytorch_lightning.trainer.states import RunningStage, TrainerState, TrainerStatus -from pytorch_lightning.trainer.training_loop import TrainLoop +from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection +from pytorch_lightning.trainer.states import RunningStage, TrainerFn, TrainerState, TrainerStatus from pytorch_lightning.utilities import DeviceType, DistributedType, rank_zero_warn from pytorch_lightning.utilities.argparse import ( add_argparse_args, @@ -59,7 +63,13 @@ class TrainerProperties(ABC): logger: LightningLoggerBase logger_connector: LoggerConnector state: TrainerState - train_loop: TrainLoop + fit_loop: FitLoop + validate_loop: EvaluationLoop + test_loop: EvaluationLoop + predict_loop: PredictionLoop + """ + Accelerator properties + """ @property def accelerator(self) -> Accelerator: @@ -125,6 +135,10 @@ def root_gpu(self) -> Optional[int]: def tpu_cores(self) -> int: return self.accelerator_connector.tpu_cores + @property + def ipus(self) -> int: + return self.accelerator_connector.ipus + @property def num_gpus(self) -> int: return self.accelerator_connector.num_gpus @@ -134,46 +148,92 @@ def data_parallel_device_ids(self) -> Optional[List[int]]: return self.accelerator_connector.parallel_device_ids @property - def log_dir(self) -> Optional[str]: - if self.logger is None: - dirpath = self.default_root_dir - else: - dirpath = getattr(self.logger, 'log_dir' if isinstance(self.logger, TensorBoardLogger) else 'save_dir') + def lightning_module(self) -> 'pl.LightningModule': + return self.accelerator.lightning_module - dirpath = self.accelerator.broadcast(dirpath) - return dirpath + @property + def optimizers(self) -> Optional[List[Optimizer]]: + return self.accelerator.optimizers + + @optimizers.setter + def optimizers(self, new_optims: Optional[List[Optimizer]]) -> None: + # Necessary to rewrap optimizers to lightning + # They will be re-created when accessing + # the `lightning_optimizers` trainer property + self._lightning_optimizers = None + + self.accelerator.optimizers = new_optims @property - def use_amp(self) -> bool: - return self.precision == 16 + def lr_schedulers(self) -> Optional[list]: + return self.accelerator.lr_schedulers + + @lr_schedulers.setter + def lr_schedulers(self, new_schedulers: Optional[list]) -> None: + self.accelerator.lr_schedulers = new_schedulers @property - def callback_metrics(self) -> dict: - return self.logger_connector.callback_metrics + def optimizer_frequencies(self) -> list: + return self.accelerator.optimizer_frequencies - @callback_metrics.setter - def callback_metrics(self, x: dict) -> None: - self.logger_connector.callback_metrics = x + @optimizer_frequencies.setter + def optimizer_frequencies(self, new_freqs: list) -> None: + self.accelerator.optimizer_frequencies = new_freqs @property - def logged_metrics(self) -> dict: - return self.logger_connector.logged_metrics + def amp_backend(self) -> Optional[str]: + return self.accelerator.amp_backend - @logged_metrics.setter - def logged_metrics(self, x: dict) -> None: - self.logger_connector.logged_metrics = x + @property + def precision(self) -> Union[str, int]: + return self.accelerator.precision @property - def progress_bar_metrics(self) 
-> dict: - return self.logger_connector.progress_bar_metrics + def scaler(self): + return self.accelerator.scaler - @progress_bar_metrics.setter - def progress_bar_metrics(self, x: dict) -> None: - self.logger_connector.progress_bar_metrics = x + @property + def gpus(self) -> Optional[Union[List[int], str, int]]: + return self.accelerator_connector.gpus @property - def interrupted(self) -> bool: - return self.state.status == TrainerStatus.INTERRUPTED + def model(self) -> torch.nn.Module: + """ + The LightningModule, but possibly wrapped into DataParallel or DistributedDataParallel. + To access the pure LightningModule, use + :meth:`~pytorch_lightning.trainer.trainer.Trainer.lightning_module` instead. + """ + return self.accelerator.model + + @model.setter + def model(self, model: torch.nn.Module) -> None: + """ + Setter for the model, pass-through to accelerator and plugin where the model reference is stored. + Used by the Tuner to reset the state of Trainer and Accelerator. + + Args: + model: The LightningModule, possibly wrapped into DataParallel or DistributedDataParallel, depending + on the backend. + """ + self.accelerator.model = model + + """ + General properties + """ + + @property + def log_dir(self) -> Optional[str]: + if self.logger is None: + dirpath = self.default_root_dir + else: + dirpath = getattr(self.logger, 'log_dir' if isinstance(self.logger, TensorBoardLogger) else 'save_dir') + + dirpath = self.accelerator.broadcast(dirpath) + return dirpath + + @property + def use_amp(self) -> bool: + return self.precision == 16 @property def is_global_zero(self) -> bool: @@ -194,39 +254,16 @@ def slurm_job_id(self) -> Optional[int]: job_id = None return job_id - @classmethod - def default_attributes(cls) -> dict: - init_signature = inspect.signature(cls) - return {k: v.default for k, v in init_signature.parameters.items()} - - @classmethod - def get_deprecated_arg_names(cls) -> List: - """Returns a list with deprecated Trainer arguments.""" - depr_arg_names = [] - for name, val in cls.__dict__.items(): - if name.startswith('DEPRECATED') and isinstance(val, (tuple, list)): - depr_arg_names.extend(val) - return depr_arg_names - - @classmethod - def from_argparse_args(cls: Type['_T'], args: Union[Namespace, ArgumentParser], **kwargs) -> '_T': - return from_argparse_args(cls, args, **kwargs) - - @classmethod - def parse_argparser(cls, arg_parser: Union[ArgumentParser, Namespace]) -> Namespace: - return parse_argparser(cls, arg_parser) - - @classmethod - def match_env_arguments(cls) -> Namespace: - return parse_env_variables(cls) - - @classmethod - def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs) -> ArgumentParser: - return add_argparse_args(cls, parent_parser, **kwargs) + @property + def lightning_optimizers(self) -> List[LightningOptimizer]: + if self._lightning_optimizers is None: + self.convert_to_lightning_optimizers() + return self._lightning_optimizers @property - def gpus(self) -> Optional[Union[List[int], str, int]]: - return self.accelerator_connector.gpus + def distributed_sampler_kwargs(self) -> Optional[dict]: + if isinstance(self.training_type_plugin, ParallelPlugin): + return self.training_type_plugin.distributed_sampler_kwargs @property def data_parallel(self) -> bool: @@ -242,11 +279,11 @@ def progress_bar_callback(self) -> Optional[ProgressBarBase]: def progress_bar_dict(self) -> dict: """ Read-only for progress bar metrics. 
""" ref_model = self.lightning_module - ref_model = cast(LightningModule, ref_model) + ref_model = cast(pl.LightningModule, ref_model) standard_metrics = ref_model.get_progress_bar_dict() - logged_metrics = self.progress_bar_metrics - duplicates = list(standard_metrics.keys() & logged_metrics.keys()) + pbar_metrics = self.progress_bar_metrics + duplicates = list(standard_metrics.keys() & pbar_metrics.keys()) if duplicates: rank_zero_warn( f"The progress bar already tracks a metric with the name(s) '{', '.join(duplicates)}' and" @@ -254,9 +291,7 @@ def progress_bar_dict(self) -> dict: f" If this is undesired, change the name or override `get_progress_bar_dict()`" f" in `LightingModule`.", UserWarning ) - all_metrics = dict(**standard_metrics) - all_metrics.update(**logged_metrics) - return all_metrics + return {**standard_metrics, **pbar_metrics} @property def enable_validation(self) -> bool: @@ -327,94 +362,54 @@ def checkpoint_callbacks(self) -> List[ModelCheckpoint]: """ return [c for c in self.callbacks if isinstance(c, ModelCheckpoint)] - def save_checkpoint(self, filepath, weights_only: bool = False) -> None: - self.checkpoint_connector.save_checkpoint(filepath, weights_only) - @property - def model(self) -> torch.nn.Module: - """ - The LightningModule, but possibly wrapped into DataParallel or DistributedDataParallel. - To access the pure LightningModule, use - :meth:`~pytorch_lightning.trainer.trainer.Trainer.lightning_module` instead. - """ - return self.accelerator.model - - @model.setter - def model(self, model: torch.nn.Module) -> None: - """ - Setter for the model, pass-through to accelerator and plugin where the model reference is stored. - Used by the Tuner to reset the state of Trainer and Accelerator. - - Args: - model: The LightningModule, possibly wrapped into DataParallel or DistributedDataParallel, depending - on the backend. 
- """ - self.accelerator.model = model + def resume_from_checkpoint(self) -> Optional[Union[str, Path]]: + return self.checkpoint_connector.resume_checkpoint_path - @property - def lightning_optimizers(self) -> List[LightningOptimizer]: - if self._lightning_optimizers is None: - self.convert_to_lightning_optimizers() - return self._lightning_optimizers - - @property - def lightning_module(self) -> LightningModule: - return self.accelerator.lightning_module - - @property - def optimizers(self) -> Optional[List[Optimizer]]: - return self.accelerator.optimizers - - @optimizers.setter - def optimizers(self, new_optims: Optional[List[Optimizer]]) -> None: - # Necessary to rewrap optimizers to lightning - # They will be re-created when accessing - # the `lightning_optimizers` trainer property - self._lightning_optimizers = None - - self.accelerator.optimizers = new_optims - - @property - def lr_schedulers(self) -> Optional[list]: - return self.accelerator.lr_schedulers + def save_checkpoint(self, filepath, weights_only: bool = False) -> None: + self.checkpoint_connector.save_checkpoint(filepath, weights_only) - @lr_schedulers.setter - def lr_schedulers(self, new_schedulers: Optional[list]) -> None: - self.accelerator.lr_schedulers = new_schedulers + """ + Parsing properties + """ - @property - def optimizer_frequencies(self) -> list: - return self.accelerator.optimizer_frequencies + @classmethod + def default_attributes(cls) -> dict: + init_signature = inspect.signature(cls) + return {k: v.default for k, v in init_signature.parameters.items()} - @optimizer_frequencies.setter - def optimizer_frequencies(self, new_freqs: list) -> None: - self.accelerator.optimizer_frequencies = new_freqs + @classmethod + def get_deprecated_arg_names(cls) -> List: + """Returns a list with deprecated Trainer arguments.""" + depr_arg_names = [] + for name, val in cls.__dict__.items(): + if name.startswith('DEPRECATED') and isinstance(val, (tuple, list)): + depr_arg_names.extend(val) + return depr_arg_names - @property - def amp_backend(self) -> Optional[str]: - return self.accelerator.amp_backend + @classmethod + def from_argparse_args(cls: Type['_T'], args: Union[Namespace, ArgumentParser], **kwargs) -> '_T': + return from_argparse_args(cls, args, **kwargs) - @property - def precision(self) -> Union[str, int]: - return self.accelerator.precision + @classmethod + def parse_argparser(cls, arg_parser: Union[ArgumentParser, Namespace]) -> Namespace: + return parse_argparser(cls, arg_parser) - @property - def scaler(self): - return self.accelerator.scaler + @classmethod + def match_env_arguments(cls) -> Namespace: + return parse_env_variables(cls) - # TODO: refactor this so that it can be done in LightningOptimizer - def __getstate__(self): - # remove lightning_optimizers - self._lightning_optimizers = None - return self.__dict__ + @classmethod + def add_argparse_args(cls, parent_parser: ArgumentParser, **kwargs) -> ArgumentParser: + return add_argparse_args(cls, parent_parser, **kwargs) - def __setstate__(self, state): - self.__dict__ = state + """ + State properties + """ @property - def distributed_sampler_kwargs(self) -> Optional[dict]: - if isinstance(self.training_type_plugin, ParallelPlugin): - return self.training_type_plugin.distributed_sampler_kwargs + def interrupted(self) -> bool: + return self.state.status == TrainerStatus.INTERRUPTED @property def training(self) -> bool: @@ -486,29 +481,91 @@ def sanity_checking(self, val: bool) -> None: elif self.sanity_checking: self.state.stage = None + """ + Loop 
properties + """ + @property def global_step(self) -> int: - return self.train_loop.global_step + return self.fit_loop.global_step @property def current_epoch(self) -> int: - return self.train_loop.current_epoch + return self.fit_loop.current_epoch @property def max_epochs(self) -> Optional[int]: - return self.train_loop.max_epochs + return self.fit_loop.max_epochs @property def min_epochs(self) -> Optional[int]: - return self.train_loop.min_epochs + return self.fit_loop.min_epochs @property def max_steps(self) -> Optional[int]: - return self.train_loop.max_steps + return self.fit_loop.max_steps @property def min_steps(self) -> Optional[int]: - return self.train_loop.min_steps + return self.fit_loop.min_steps + + @property + def is_last_batch(self) -> bool: + return self.fit_loop.epoch_loop.is_last_batch + + @property + def _evaluation_loop(self) -> EvaluationLoop: + if self.state.fn in (TrainerFn.FITTING, TrainerFn.TUNING): + return self.fit_loop.epoch_loop.val_loop + if self.state.fn == TrainerFn.VALIDATING: + return self.validate_loop + if self.state.fn == TrainerFn.TESTING: + return self.test_loop + raise RuntimeError("The `Trainer._evaluation_loop` property isn't defined. Accessed outside of scope") + + @property + def _active_loop(self) -> Optional[Union[FitLoop, EvaluationLoop, PredictionLoop]]: + if self.training: + return self.fit_loop + if self.sanity_checking or self.evaluating: + return self._evaluation_loop + if self.predicting: + return self.predict_loop + + """ + Logging properties + """ + + @property + def callback_metrics(self) -> dict: + return self.logger_connector.callback_metrics + + @property + def logged_metrics(self) -> dict: + return self.logger_connector.logged_metrics + + @property + def progress_bar_metrics(self) -> dict: + return self.logger_connector.progress_bar_metrics + + @property + def _results(self) -> Optional[ResultCollection]: + active_loop = self._active_loop + if active_loop is not None: + return active_loop._results + + """ + Other + """ + + # TODO: refactor this so that it can be done in LightningOptimizer + def __getstate__(self): + # remove lightning_optimizers + self._lightning_optimizers = None + return self.__dict__ + + def __setstate__(self, state): + self.__dict__ = state # Used to represent the concrete type TrainerProperties class methods are called on. 
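Editor's note: a minimal round trip through the `_DataclassStateDictMixin` API introduced in
`trainer/progress.py` above, which parallels the `__getstate__`/`__setstate__` pair kept on
`TrainerProperties` for pickling. Illustrative sketch only, assuming the module layout this
patch sets up:

    from pytorch_lightning.trainer.progress import Progress

    progress = Progress.from_defaults(processed=None)  # trackers where `processed` is unused
    progress.increment_completed()

    state = progress.state_dict()  # plain nested dicts via `dataclasses.asdict`, checkpoint-friendly
    restored = Progress.from_state_dict(state)  # fresh instance, then `__setstate__` in place
    assert restored.current.completed == progress.current.completed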
diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index df6db1e180c24..e93d87291193d 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -102,8 +102,7 @@ def _agg_memory(self, how: str): if self.last_idx is not None: if self.rotated: return getattr(self.memory, how)() - else: - return getattr(self.memory[:self.current_idx], how)() + return getattr(self.memory[:self.current_idx], how)() class PredictionCollection(object): @@ -158,7 +157,7 @@ def to_disk(self) -> None: # Switch predictions so each entry has its own dict outputs = [] for values in zip(*predictions.values()): - output_element = {k: v for k, v in zip(predictions.keys(), values)} + output_element = dict(zip(predictions.keys(), values)) outputs.append(output_element) # Write predictions for current file to disk @@ -295,10 +294,10 @@ def _get_len_recursive(self, data) -> int: if isinstance(data, Dataset): return len(data) - elif isinstance(data, (float, int)): + if isinstance(data, (float, int)): return data - elif isinstance(data, Mapping): + if isinstance(data, Mapping): if any(isinstance(v, (Mapping, Sequence, Dataset, Iterable)) for v in data.values()): return {k: self._get_len_recursive(v) for k, v in data.items()} elif isinstance(data, Sequence): @@ -417,9 +416,7 @@ def _calc_num_batches(loaders: Any) -> Union[int, float]: if isinstance(all_lengths, (int, float)): return all_lengths - - else: - return _nested_calc_num_data(all_lengths, min) + return _nested_calc_num_data(all_lengths, min) def __len__(self) -> int: return self._calc_num_batches(self.loaders) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b24d6d7b2da48..78aa5b2b73d7f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -13,23 +13,22 @@ # limitations under the License. 
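# [Editor's note - illustrative micro-example, not from this patch] The `supporters.py` cleanup
# above swaps a redundant dict comprehension for the equivalent, more direct constructor call:
keys, values = ("a", "b"), (1, 2)
assert {k: v for k, v in zip(keys, values)} == dict(zip(keys, values)) == {"a": 1, "b": 2}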
"""Trainer to automate the training.""" import logging +import traceback import warnings from datetime import timedelta -from itertools import count from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Union from weakref import proxy import torch -from torch.utils.data import DataLoader +import pytorch_lightning as pl from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule -from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary -from pytorch_lightning.core.step_result import Result from pytorch_lightning.loggers import LightningLoggerBase +from pytorch_lightning.loops import EvaluationLoop, FitLoop, PredictionLoop from pytorch_lightning.plugins import Plugin from pytorch_lightning.plugins.environments import ClusterEnvironment from pytorch_lightning.profiler import ( @@ -38,6 +37,7 @@ PassThroughProfiler, PyTorchProfiler, SimpleProfiler, + XLAProfiler, ) from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import ConfigValidator @@ -54,24 +54,29 @@ from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin from pytorch_lightning.trainer.deprecated_api import DeprecatedTrainerAttributes -from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin -from pytorch_lightning.trainer.predict_loop import PredictLoop from pytorch_lightning.trainer.properties import TrainerProperties from pytorch_lightning.trainer.states import TrainerFn, TrainerState, TrainerStatus -from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.tuner.lr_finder import _LRFinder from pytorch_lightning.tuner.tuning import Tuner -from pytorch_lightning.utilities import DeviceType, parsing, rank_zero_warn +from pytorch_lightning.utilities import ( + _IPU_AVAILABLE, + _TPU_AVAILABLE, + DeviceType, + parsing, + rank_zero_deprecation, + rank_zero_info, + rank_zero_warn, +) from pytorch_lightning.utilities.debugging import InternalDebugger +from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import reset_seed -from pytorch_lightning.utilities.types import _EVALUATE_OUTPUT, _PREDICT_OUTPUT +from pytorch_lightning.utilities.types import _EVALUATE_OUTPUT, _PREDICT_OUTPUT, EVAL_DATALOADERS, TRAIN_DATALOADERS log = logging.getLogger(__name__) # warnings to ignore in trainer @@ -107,6 +112,7 @@ def __init__( gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], str, int]] = None, + ipus: Optional[int] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: Optional[int] = None, overfit_batches: Union[int, float] = 0.0, @@ -283,6 +289,8 @@ def __init__( tpu_cores: How many TPU 
cores to train on (1 or 8) / Single TPU to train on [1] + ipus: How many IPUs to train on. + track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. truncated_bptt_steps: Deprecated in v1.3 to be removed in 1.5. @@ -323,20 +331,26 @@ def __init__( self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = AcceleratorConnector( - num_processes, tpu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, benchmark, - replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins + num_processes, tpu_cores, ipus, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, + benchmark, replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) self.callback_connector = CallbackConnector(self) self.debugging_connector = DebuggingConnector(self) self.training_tricks_connector = TrainingTricksConnector(self) - self.checkpoint_connector = CheckpointConnector(self) + self.checkpoint_connector = CheckpointConnector(self, resume_from_checkpoint) self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) - self.train_loop = TrainLoop(self, max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps) - self.evaluation_loop = EvaluationLoop(self) - self.predict_loop = PredictLoop(self) + + self.fit_loop = FitLoop(min_epochs, max_epochs, min_steps, max_steps) + self.validate_loop = EvaluationLoop() + self.test_loop = EvaluationLoop() + self.predict_loop = PredictionLoop() + self.fit_loop.connect(self) + self.validate_loop.connect(self) + self.test_loop.connect(self) + self.predict_loop.connect(self) # training state if weights_summary is not None and weights_summary not in ModelSummary.MODES: @@ -355,7 +369,6 @@ def __init__( process_position, default_root_dir, weights_save_path, - resume_from_checkpoint, stochastic_weight_avg, max_time, ) @@ -380,8 +393,7 @@ def __init__( truncated_bptt_steps, terminate_on_nan, ) - self.evaluation_loop.on_trainer_init() - self.predict_loop.on_trainer_init() + self._setup_on_init(num_sanity_val_steps) # configure tuner self.tuner.on_trainer_init(auto_lr_find, auto_scale_batch_size) @@ -411,12 +423,45 @@ def __init__( # Callback system self.on_init_end() + def _setup_on_init( + self, + num_sanity_val_steps: int, + ) -> None: + self._log_device_info() + + self.should_stop = False + self.state = TrainerState() + self.num_training_batches = 0 + self.train_dataloader = None + + if num_sanity_val_steps == -1: + self.num_sanity_val_steps = float("inf") + else: + self.num_sanity_val_steps = num_sanity_val_steps + + self.num_sanity_val_batches = [] + self.num_test_batches = [] + self.num_val_batches = [] + self.test_dataloaders = None + self.val_dataloaders = None + + # .validate() and .test() set this when they load a checkpoint + self.validated_ckpt_path = None + self.tested_ckpt_path = None + + # when true, print evaluation results in .validate() and .test() + self.verbose_evaluate = True + + self.num_predict_batches = [] + self.predicted_ckpt_path = None + def fit( self, - model: LightningModule, - train_dataloader: Any = None, - val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + model: 'pl.LightningModule', + train_dataloaders: Optional[Union[TRAIN_DATALOADERS, LightningDataModule]] = None, + val_dataloaders: Optional[EVAL_DATALOADERS] = None, datamodule: Optional[LightningDataModule] = None, + 
train_dataloader=None, # noqa TODO: remove with 1.6 ) -> None: r""" Runs the full optimization routine. @@ -424,12 +469,11 @@ def fit( Args: model: Model to fit. - train_dataloader: Either a single PyTorch DataLoader or a collection of these - (list, dict, nested lists and dicts). In the case of multiple dataloaders, please - see this :ref:`page ` + train_dataloaders: A collection of :class:`torch.utils.data.DataLoader` or a + :class:`~pytorch_lightning.core.datamodule.LightningDataModule` specifying training samples. + In the case of multiple dataloaders, please see this :ref:`page `. - val_dataloaders: Either a single Pytorch Dataloader or a list of them, specifying validation samples. - If the model has a predefined val_dataloaders method this will be skipped + val_dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them specifying validation samples. datamodule: An instance of :class:`~pytorch_lightning.core.datamodule.LightningDataModule`. """ @@ -439,21 +483,29 @@ def fit( self.state.status = TrainerStatus.RUNNING self.training = True + if train_dataloader is not None: + rank_zero_deprecation( + "`trainer.fit(train_dataloader)` is deprecated in v1.4 and will be removed in v1.6." + " Use `trainer.fit(train_dataloaders)` instead. HINT: added 's'" + ) + train_dataloaders = train_dataloader # if a datamodule comes in as the second arg, then fix it for the user - if isinstance(train_dataloader, LightningDataModule): - datamodule = train_dataloader - train_dataloader = None + if isinstance(train_dataloaders, LightningDataModule): + datamodule = train_dataloaders + train_dataloaders = None # If you supply a datamodule you can't supply train_dataloader or val_dataloaders - if (train_dataloader is not None or val_dataloaders is not None) and datamodule is not None: + if (train_dataloaders is not None or val_dataloaders is not None) and datamodule is not None: raise MisconfigurationException( 'You cannot pass `train_dataloader` or `val_dataloaders` to `trainer.fit(datamodule=...)`' ) # links data to the trainer self.data_connector.attach_data( - model, train_dataloader=train_dataloader, val_dataloaders=val_dataloaders, datamodule=datamodule + model, train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders, datamodule=datamodule ) + self.checkpoint_connector.resume_start() + self._run(model) assert self.state.stopped @@ -461,11 +513,12 @@ def fit( def validate( self, - model: Optional[LightningModule] = None, - val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + model: Optional['pl.LightningModule'] = None, + dataloaders: Optional[Union[EVAL_DATALOADERS, LightningDataModule]] = None, ckpt_path: Optional[str] = 'best', verbose: bool = True, datamodule: Optional[LightningDataModule] = None, + val_dataloaders=None, # noqa TODO: remove with 1.6 ) -> _EVALUATE_OUTPUT: r""" Perform one evaluation epoch over the validation set. @@ -473,8 +526,8 @@ def validate( Args: model: The model to validate. - val_dataloaders: Either a single PyTorch DataLoader or a list of them, - specifying validation samples. + dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them, + or a :class:`~pytorch_lightning.core.datamodule.LightningDataModule` specifying validation samples. ckpt_path: Either ``best`` or path to the checkpoint you wish to validate. If ``None``, use the current weights of the model. 
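# [Editor's note - hypothetical call sites, not part of this patch] The renamed arguments above
# keep old spellings working until v1.6 while steering users toward the new names:
#
#     trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)  # new spelling
#     trainer.fit(model, train_dataloader=train_dl)    # still works, emits a deprecation warning
#     trainer.validate(model, dataloaders=val_dl)      # `val_dataloaders=` is likewise deprecated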
@@ -499,11 +552,19 @@ def validate( self.state.status = TrainerStatus.RUNNING self.validating = True - # If you supply a datamodule you can't supply val_dataloaders - if val_dataloaders is not None and datamodule: - raise MisconfigurationException( - 'You cannot pass both `trainer.validate(val_dataloaders=..., datamodule=...)`' + if val_dataloaders is not None: + rank_zero_deprecation( + "`trainer.validate(val_dataloaders)` is deprecated in v1.4 and will be removed in v1.6." + " Use `trainer.validate(dataloaders)` instead." ) + dataloaders = val_dataloaders + # if a datamodule comes in as the second arg, then fix it for the user + if isinstance(dataloaders, LightningDataModule): + datamodule = dataloaders + dataloaders = None + # If you supply a datamodule you can't supply val_dataloaders + if dataloaders is not None and datamodule: + raise MisconfigurationException('You cannot pass both `trainer.validate(dataloaders=..., datamodule=...)`') model_provided = model is not None model = model or self.lightning_module @@ -513,7 +574,7 @@ def validate( ) # links data to the trainer - self.data_connector.attach_data(model, val_dataloaders=val_dataloaders, datamodule=datamodule) + self.data_connector.attach_data(model, val_dataloaders=dataloaders, datamodule=datamodule) if not model_provided: self.validated_ckpt_path = self.__load_ckpt_weights(ckpt_path) @@ -528,11 +589,12 @@ def validate( def test( self, - model: Optional[LightningModule] = None, - test_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + model: Optional['pl.LightningModule'] = None, + dataloaders: Optional[Union[EVAL_DATALOADERS, LightningDataModule]] = None, ckpt_path: Optional[str] = 'best', verbose: bool = True, datamodule: Optional[LightningDataModule] = None, + test_dataloaders=None, # noqa TODO: remove with 1.6 ) -> _EVALUATE_OUTPUT: r""" Perform one evaluation epoch over the test set. It's separated from @@ -541,8 +603,8 @@ def test( Args: model: The model to test. - test_dataloaders: Either a single PyTorch DataLoader or a list of them, - specifying test samples. + dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them, + or a :class:`~pytorch_lightning.core.datamodule.LightningDataModule` specifying test samples. ckpt_path: Either ``best`` or path to the checkpoint you wish to test. If ``None``, use the current weights of the model. @@ -565,9 +627,19 @@ def test( self.state.status = TrainerStatus.RUNNING self.testing = True + if test_dataloaders is not None: + rank_zero_deprecation( + "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6." + " Use `trainer.test(dataloaders)` instead." 
+ ) + dataloaders = test_dataloaders + # if a datamodule comes in as the second arg, then fix it for the user + if isinstance(dataloaders, LightningDataModule): + datamodule = dataloaders + dataloaders = None # If you supply a datamodule you can't supply test_dataloaders - if test_dataloaders is not None and datamodule: - raise MisconfigurationException('You cannot pass both `trainer.test(test_dataloaders=..., datamodule=...)`') + if dataloaders is not None and datamodule: + raise MisconfigurationException('You cannot pass both `trainer.test(dataloaders=..., datamodule=...)`') model_provided = model is not None model = model or self.lightning_module @@ -577,7 +649,7 @@ def test( ) # links data to the trainer - self.data_connector.attach_data(model, test_dataloaders=test_dataloaders, datamodule=datamodule) + self.data_connector.attach_data(model, test_dataloaders=dataloaders, datamodule=datamodule) if not model_provided: self.tested_ckpt_path = self.__load_ckpt_weights(ckpt_path) @@ -592,8 +664,8 @@ def test( def predict( self, - model: Optional[LightningModule] = None, - dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + model: Optional['pl.LightningModule'] = None, + dataloaders: Optional[Union[EVAL_DATALOADERS, LightningDataModule]] = None, datamodule: Optional[LightningDataModule] = None, return_predictions: Optional[bool] = None, ckpt_path: Optional[str] = 'best', @@ -606,7 +678,8 @@ def predict( Args: model: The model to predict with. - dataloaders: Either a single PyTorch DataLoader or a list of them, specifying inference samples. + dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them, + or a :class:`~pytorch_lightning.core.datamodule.LightningDataModule` specifying prediction samples. datamodule: The datamodule with a predict_dataloader method that returns one or more dataloaders. @@ -632,6 +705,10 @@ def predict( self.predict_loop.return_predictions = return_predictions + # if a datamodule comes in as the second arg, then fix it for the user + if isinstance(dataloaders, LightningDataModule): + datamodule = dataloaders + dataloaders = None if dataloaders is not None and datamodule: raise MisconfigurationException('You cannot pass both `trainer.predict(dataloaders=..., datamodule=...)`') @@ -657,12 +734,13 @@ def predict( def tune( self, - model: LightningModule, - train_dataloader: Optional[DataLoader] = None, - val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + model: 'pl.LightningModule', + train_dataloaders: Optional[Union[TRAIN_DATALOADERS, LightningDataModule]] = None, + val_dataloaders: Optional[EVAL_DATALOADERS] = None, datamodule: Optional[LightningDataModule] = None, scale_batch_size_kwargs: Optional[Dict[str, Any]] = None, lr_find_kwargs: Optional[Dict[str, Any]] = None, + train_dataloader=None, # noqa TODO: remove with 1.6 ) -> Dict[str, Optional[Union[int, _LRFinder]]]: r""" Runs routines to tune hyperparameters before training. @@ -670,11 +748,11 @@ def tune( Args: model: Model to tune. - train_dataloader: A Pytorch DataLoader with training samples. If the model has - a predefined train_dataloader method this will be skipped. + train_dataloaders: A collection of :class:`torch.utils.data.DataLoader` or a + :class:`~pytorch_lightning.core.datamodule.LightningDataModule` specifying training samples. + In the case of multiple dataloaders, please see this :ref:`page `. - val_dataloaders: Either a single Pytorch Dataloader or a list of them, specifying validation samples. 
- If the model has a predefined val_dataloaders method this will be skipped + val_dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them specifying validation samples. datamodule: An instance of :class:`~pytorch_lightning.core.datamodule.LightningDataModule`. @@ -688,19 +766,25 @@ def tune( self.state.status = TrainerStatus.RUNNING self.tuning = True + if train_dataloader is not None: + rank_zero_deprecation( + "`trainer.tune(train_dataloader)` is deprecated in v1.4 and will be removed in v1.6." + " Use `trainer.tune(train_dataloaders)` instead. HINT: added 's'" + ) + train_dataloaders = train_dataloader # if a datamodule comes in as the second arg, then fix it for the user - if isinstance(train_dataloader, LightningDataModule): - datamodule = train_dataloader - train_dataloader = None + if isinstance(train_dataloaders, LightningDataModule): + datamodule = train_dataloaders + train_dataloaders = None # If you supply a datamodule you can't supply train_dataloader or val_dataloaders - if (train_dataloader is not None or val_dataloaders is not None) and datamodule is not None: + if (train_dataloaders is not None or val_dataloaders is not None) and datamodule is not None: raise MisconfigurationException( 'You cannot pass `train_dataloader` or `val_dataloaders` to `trainer.tune(datamodule=...)`' ) # links data to the trainer self.data_connector.attach_data( - model, train_dataloader=train_dataloader, val_dataloaders=val_dataloaders, datamodule=datamodule + model, train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders, datamodule=datamodule ) result = self.tuner._tune(model, scale_batch_size_kwargs=scale_batch_size_kwargs, lr_find_kwargs=lr_find_kwargs) @@ -710,7 +794,7 @@ def tune( return result - def _run(self, model: LightningModule) -> Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]]: + def _run(self, model: 'pl.LightningModule') -> Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]]: # clean hparams if hasattr(model, "hparams"): parsing.clean_namespace(model.hparams) @@ -731,6 +815,13 @@ def _run(self, model: LightningModule) -> Optional[Union[_EVALUATE_OUTPUT, _PRED self.accelerator.connect(model) self.accelerator.setup_environment() self._call_setup_hook(model) # allow user to setup lightning_module in accelerator environment + + # restore modules after setup + self.checkpoint_connector.restore_datamodule() + self.checkpoint_connector.restore_model() + # restore callback states + self.checkpoint_connector.restore_callbacks() + self._call_configure_sharded_model(model) # allow user to setup in model sharded environment self.accelerator.setup(self, model) # note: this sets up self.lightning_module @@ -752,7 +843,7 @@ def _run(self, model: LightningModule) -> Optional[Union[_EVALUATE_OUTPUT, _PRED {self.run_stage} || | || DIRECTION {self._run_train} || - or {self._run_evaluation} || + or {self._run_evaluate} || or {self._run_predict} || | || results \/ @@ -772,6 +863,9 @@ def _run(self, model: LightningModule) -> Optional[Union[_EVALUATE_OUTPUT, _PRED # plugin will setup fitting (e.g. ddp will launch child processes) self._pre_dispatch() + # restore optimizers, etc. + self.checkpoint_connector.restore_training_state() + # dispatch `start_training` or `start_evaluating` or `start_predicting` self._dispatch() @@ -806,7 +900,10 @@ def _pre_dispatch(self): def _post_dispatch(self): self.accelerator.post_dispatch(self) + # these `teardown` calls are here instead of in `_call_teardown_hook` since they are internal teardowns + # which need to happen before. 
        self.accelerator.teardown()
+        self._active_loop.teardown()
 
     def _dispatch(self):
         if self.evaluating:
@@ -833,6 +930,8 @@ def _pre_training_routine(self):
         # register auto-resubmit when on SLURM
         self.slurm_connector.register_slurm_signal_handlers()
 
+        self.checkpoint_connector.resume_end()
+
         # --------------------------
         # Pre-train
         # --------------------------
@@ -844,10 +943,8 @@ def _pre_training_routine(self):
 
         # print model summary
         if self.is_global_zero and self.weights_summary is not None and not self.testing:
-            ref_model.summarize(mode=self.weights_summary)
-
-        # restore training and model before hpc is called
-        self.checkpoint_connector.restore_weights()
+            max_depth = ModelSummary.MODES[self.weights_summary]
+            ref_model.summarize(max_depth=max_depth)
 
         # on pretrain routine end
         self.on_pretrain_routine_end()
@@ -861,56 +958,17 @@ def _run_train(self) -> None:
 
         self._run_sanity_check(self.lightning_module)
 
-        self.checkpoint_connector.has_trained = False
-
         # enable train mode
         self.model.train()
         torch.set_grad_enabled(True)
 
         # reload data when needed
         model = self.lightning_module
-        self.train_loop.reset_train_val_dataloaders(model)
 
-        # hook
-        self.train_loop.on_train_start()
+        self.reset_train_val_dataloaders(model)
 
         try:
-            if self.train_loop.should_skip_training():
-                return
-            # run all epochs
-            epochs = range(self.current_epoch, self.max_epochs) if self.max_epochs else count(self.current_epoch)
-            for epoch in epochs:
-
-                # hook
-                self.train_loop.on_train_epoch_start(epoch)
-
-                with self.profiler.profile("run_training_epoch"):
-                    # run train epoch
-                    self.train_loop.run_training_epoch()
-
-                if self.max_steps and self.max_steps <= self.global_step:
-                    self.train_loop.on_train_end()
-                    return
-
-                # early stopping
-                met_min_epochs = (epoch >= self.min_epochs - 1) if self.min_epochs else True
-                met_min_steps = self.global_step >= self.min_steps if self.min_steps else True
-
-                if self.should_stop:
-                    if met_min_epochs and met_min_steps:
-                        self.train_loop.on_train_end()
-                        return
-                    else:
-                        log.info(
-                            'Trainer was signaled to stop but required minimum epochs'
-                            f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
-                            ' not been met. Training will continue...'
-                        )
-                        self.should_stop = False
-
-            # hook
-            self.train_loop.on_train_end()
-
+            self.fit_loop.run()
         except KeyboardInterrupt:
            rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...')
            # user could press Ctrl+c many times... only shutdown once
@@ -919,137 +977,28 @@ def _run_train(self) -> None:
             if not self.interrupted:
                 self.state.status = TrainerStatus.INTERRUPTED
                 self.on_keyboard_interrupt()
                 # same treatment as below
                 self.accelerator.on_train_end()
-                self.state.stage = None
         except BaseException:
             self.state.status = TrainerStatus.INTERRUPTED
+            if distributed_available() and self.world_size > 1:
+                # try syncing remaining processes, kill otherwise
+                self.training_type_plugin.reconciliate_processes(traceback.format_exc())
             # give accelerators a chance to finish
             self.accelerator.on_train_end()
             # reset bookkeeping
             self.state.stage = None
             raise
 
-    def _run_evaluation(self) -> _EVALUATE_OUTPUT:
-        if not (self.evaluating or self.sanity_checking):
-            rank_zero_warn(
-                f"`trainer._run_evaluation()` was called but the running stage is set to {self.state.stage}."
-                " This should not happen normally.
Setting it to `RunningStage.VALIDATING`", RuntimeWarning - ) - self.validating = True - - # prepare dataloaders - dataloaders, max_batches = self.evaluation_loop.get_evaluation_dataloaders() - - # check if we want to skip this evaluation - if sum(max_batches) == 0: - return [], [] - - # enable eval mode + no grads - self.evaluation_loop.on_evaluation_model_eval() - # ref model - model = self.lightning_module - model.zero_grad() - torch.set_grad_enabled(False) - - # hook - self.evaluation_loop.on_evaluation_start() - - # set up the eval loop - self.evaluation_loop.setup(max_batches, dataloaders) - - # hook - self.evaluation_loop.on_evaluation_epoch_start() - - # run validation/testing - for dataloader_idx, dataloader in enumerate(dataloaders): - # bookkeeping - dl_outputs = [] - dataloader = self.accelerator.process_dataloader(dataloader) - dl_max_batches = self.evaluation_loop.max_batches[dataloader_idx] - - for batch_idx, batch in enumerate(dataloader): - if batch is None: - continue - - # stop short when running on limited batches - if batch_idx >= dl_max_batches: - break - - # hook - self.evaluation_loop.on_evaluation_batch_start(batch, batch_idx, dataloader_idx) - - # lightning module methods - with self.profiler.profile("evaluation_step_and_end"): - output = self.evaluation_loop.evaluation_step(batch, batch_idx, dataloader_idx) - output = self.evaluation_loop.evaluation_step_end(output) - - # hook + store predictions - self.evaluation_loop.on_evaluation_batch_end(output, batch, batch_idx, dataloader_idx) - - # log batch metrics - self.logger_connector.log_evaluation_step_metrics() - - # track epoch level outputs - dl_outputs = self._track_output_for_epoch_end(dl_outputs, output) - - # store batch level output per dataloader - if self.evaluation_loop.should_track_batch_outputs_for_epoch_end: - self.evaluation_loop.outputs.append(dl_outputs) - - outputs = self.evaluation_loop.outputs - - # reset outputs - self.evaluation_loop.outputs = [] - - # with a single dataloader don't pass a 2D list - if len(outputs) > 0 and self.evaluation_loop.num_dataloaders == 1: - outputs = outputs[0] - - # lightning module method - self.evaluation_loop.evaluation_epoch_end(outputs) - - # hook - self.evaluation_loop.on_evaluation_epoch_end() - - # log epoch metrics - eval_loop_results = self.logger_connector.get_evaluate_epoch_results() - - # hook - self.evaluation_loop.on_evaluation_end() - - # save predictions to disk - self.evaluation_loop.predictions.to_disk() - - # enable train mode again - self.evaluation_loop.on_evaluation_model_train() - - # reset cached results - self.logger_connector.reset() - - torch.set_grad_enabled(True) - - return eval_loop_results - - def _track_output_for_epoch_end(self, outputs, output): - if output is not None: - if isinstance(output, Result): - output = output.detach() - if self.move_metrics_to_cpu: - output = output.cpu() - elif isinstance(output, dict): - output = recursive_detach(output, to_cpu=self.move_metrics_to_cpu) - elif isinstance(output, torch.Tensor) and output.is_cuda and self.move_metrics_to_cpu: - output = output.cpu() - outputs.append(output) - return outputs - def _run_evaluate(self) -> _EVALUATE_OUTPUT: if not self.is_global_zero and self.progress_bar_callback is not None: self.progress_bar_callback.disable() assert self.evaluating - with self.profiler.profile(f"run_{self.state.stage}_evaluation"): - eval_loop_results = self._run_evaluation() + # reload dataloaders + self._evaluation_loop.reload_evaluation_dataloaders() + + with 
self.profiler.profile(f"run_{self.state.stage}_evaluation"), torch.no_grad():
            eval_loop_results = self._evaluation_loop.run()
 
         # remove the tensors from the eval results
         for i, result in enumerate(eval_loop_results):
@@ -1061,42 +1010,9 @@ def _run_evaluate(self) -> _EVALUATE_OUTPUT:
         return eval_loop_results
 
     def _run_predict(self) -> Optional[_PREDICT_OUTPUT]:
-        # prepare dataloaders
-        dataloaders, max_batches = self.predict_loop.get_predict_dataloaders()
-
-        # check if we want to skip this evaluation
-        if self.predict_loop.should_skip_predict(max_batches):
-            return []
-
-        # set up the eval loop
-        self.predict_loop.setup(max_batches, dataloaders)
-
-        # call hook
-        self.predict_loop.on_predict_start()
-
-        # run validation/testing
-        for dataloader_idx, dataloader in enumerate(dataloaders):
-            dataloader = self.accelerator.process_dataloader(dataloader)
-            dl_max_batches = self.predict_loop.max_batches[dataloader_idx]
-            for batch_idx, batch in enumerate(dataloader):
-                if batch is None:
-                    continue
-
-                # stop short when running on limited batches
-                if batch_idx >= dl_max_batches:
-                    break
-
-                # lightning module methods
-                with self.profiler.profile("predict_step"):
-                    self.predict_loop.predict_step(batch, batch_idx, dataloader_idx)
-
-        # call hook
-        results = self.predict_loop.on_predict_epoch_end()
-
-        # call hook
-        self.predict_loop.on_predict_end()
-
-        return results
+        self.reset_predict_dataloader(self.lightning_module)
+        with torch.no_grad():
+            return self.predict_loop.run()
 
     def _run_sanity_check(self, ref_model):
         using_val_step = ref_model.val_dataloader is not None and is_overridden('validation_step', ref_model)
@@ -1111,17 +1027,25 @@ def _run_sanity_check(self, ref_model):
             # hook and callback
             self.on_sanity_check_start()
 
+            # reload dataloaders
+            self._evaluation_loop.reload_evaluation_dataloaders()
+
             # run eval step
-            self._run_evaluation()
+            with torch.no_grad():
+                self._evaluation_loop.run()
 
             self.on_sanity_check_end()
 
-            self.state.stage = stage
+            # reset validation metrics
+            self.logger_connector.reset()
 
             # reset the seed to what it was before sanity check
             # prevents sanity check to affect random sampling in training
            reset_seed()
 
+            # restore the previous stage when the sanity check is finished
+            self.state.stage = stage
+
     def __load_ckpt_weights(self, ckpt_path: Optional[str]) -> Optional[str]:
         if ckpt_path is None:
             return
@@ -1153,12 +1077,10 @@ def __load_ckpt_weights(self, ckpt_path: Optional[str]) -> Optional[str]:
         if not self._device_type == DeviceType.TPU:
             self.training_type_plugin.barrier()
 
-        self.training_type_plugin.restore_model_state_from_ckpt_path(
-            ckpt_path, map_location=lambda storage, loc: storage
-        )
+        self.checkpoint_connector.restore_model_weights(ckpt_path)
         return ckpt_path
 
-    def _call_setup_hook(self, model: LightningModule) -> None:
+    def _call_setup_hook(self, model: 'pl.LightningModule') -> None:
         fn = self.state.fn._setup_fn
 
         self.accelerator.barrier("pre_setup")
@@ -1170,7 +1092,7 @@ def _call_setup_hook(self, model: LightningModule) -> None:
 
         self.accelerator.barrier("post_setup")
 
-    def _call_configure_sharded_model(self, model: LightningModule) -> None:
+    def _call_configure_sharded_model(self, model: 'pl.LightningModule') -> None:
         # Call configure sharded model hook if accelerator requests. In some cases
         # we will not call the hook; the hook has initialized the sharded model for example.
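# --- illustrative sketch (editorial, not part of this patch) ---------------
# `__load_ckpt_weights` above implements the `ckpt_path` contract documented in
# `validate`/`test`: `'best'` resolves to the best model saved by a checkpoint
# callback, `None` keeps the current in-memory weights, anything else is
# treated as a filesystem path. A reduced, illustrative version:
from typing import Optional


def _resolve_ckpt_path(ckpt_path: Optional[str], best_model_path: Optional[str]) -> Optional[str]:
    if ckpt_path is None:
        return None  # evaluate with the current in-memory weights
    if ckpt_path == "best":
        if not best_model_path:
            raise ValueError('`ckpt_path="best"` requires a checkpoint callback that has saved a best model')
        return best_model_path
    return ckpt_path  # an explicit checkpoint path


assert _resolve_ckpt_path("best", "epoch=4.ckpt") == "epoch=4.ckpt"
assert _resolve_ckpt_path(None, "epoch=4.ckpt") is None
# ---------------------------------------------------------------------------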
@@ -1183,7 +1105,7 @@ def _call_configure_sharded_model(self, model: LightningModule) -> None:
             model.call_configure_sharded_model_hook = True
             self.accelerator.call_configure_sharded_model_hook = False
 
-    def _call_teardown_hook(self, model: LightningModule) -> None:
+    def _call_teardown_hook(self, model: 'pl.LightningModule') -> None:
         fn = self.state.fn._setup_fn
 
         if self.datamodule is not None:
@@ -1194,33 +1116,17 @@ def _call_teardown_hook(self, model: LightningModule) -> None:
 
         model._current_fx_name = None
         model._current_dataloader_idx = None
-
-    def _reset_result_and_set_fx_name(self, hook_name: str) -> bool:
-        # on_before_zero_grad is called within training_step
-        # TODO(@carmocca): Result should handle this logic
-        if "batch_start" in hook_name or hook_name in ("on_before_zero_grad", "on_after_backward"):
-            return True
-        model_ref = self.lightning_module
-        if model_ref is not None:
-            # used to track current hook name called
-            model_ref._results = Result()
-            model_ref._current_fx_name = hook_name
-        return False
-
-    def _cache_logged_metrics(self):
-        model_ref = self.lightning_module
-        if model_ref is not None:
-            # capture logging for this hook
-            self.logger_connector.cache_logged_metrics()
+        # these could have become stale if metrics are defined in `setup`
+        model._metric_attributes = None
 
     def call_hook(self, hook_name: str, *args, **kwargs) -> Any:
-        # Note this implementation is copy/pasted into the TrainLoop class in TrainLoop._on_train_epoch_end_hook
+        # Note this implementation is copy/pasted into the TrainingEpochLoop class in TrainingEpochLoop._on_train_epoch_end_hook
         # This was done to manage the deprecation of the `outputs` argument to on_train_epoch_end
         # If making changes to this function, ensure that those changes are also made to
-        # TrainLoop._on_train_epoch_end_hook
-
-        # set hook_name to model + reset Result obj
-        skip = self._reset_result_and_set_fx_name(hook_name)
+        # TrainingEpochLoop._on_train_epoch_end_hook
+        if self.lightning_module:
+            prev_fx_name = self.lightning_module._current_fx_name
+            self.lightning_module._current_fx_name = hook_name
 
         # always profile hooks
         with self.profiler.profile(hook_name):
@@ -1237,14 +1143,19 @@ def call_hook(self, hook_name: str, *args, **kwargs) -> Any:
                 hook_fx = getattr(model_ref, hook_name)
                 output = hook_fx(*args, **kwargs)
 
-            # if the PL module doesn't have the hook then call the accelerator
-            # used to auto-reduce things for the user with Results obj
-            elif hasattr(self.accelerator, hook_name):
+            # call the accelerator hook
+            if hasattr(self.accelerator, hook_name):
                 accelerator_hook = getattr(self.accelerator, hook_name)
-                output = accelerator_hook(*args, **kwargs)
+                accelerator_output = accelerator_hook(*args, **kwargs)
+                # Rely on the accelerator output if the LightningModule hook returns nothing
+                # Required for cases such as DataParallel where we reduce the output for the user
+                # todo: move this data parallel logic into the data parallel plugin
+                output = accelerator_output if output is None else output
+
+        if self.lightning_module:
+            # restore the previous fx name to support nested hook calls
+            self.lightning_module._current_fx_name = prev_fx_name
 
-        if not skip:
-            self._cache_logged_metrics()
         return output
 
     @staticmethod
@@ -1257,6 +1168,7 @@ def __init_profiler(self, profiler: Optional[Union[BaseProfiler, str]]) -> None:
             "simple": SimpleProfiler,
             "advanced": AdvancedProfiler,
             "pytorch": PyTorchProfiler,
+            "xla": XLAProfiler,
         }
         profiler = profiler.lower()
         if profiler not in PROFILERS:
@@ -1272,3 +1184,30 @@ def __setup_profiler(self) -> None:
         local_rank =
self.local_rank if self.world_size > 1 else None self.profiler._lightning_module = proxy(self.lightning_module) self.profiler.setup(stage=self.state.fn._setup_fn, local_rank=local_rank, log_dir=self.log_dir) + + def _log_device_info(self) -> None: + rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}') + + num_tpu_cores = self.tpu_cores if self.tpu_cores is not None else 0 + rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_tpu_cores} TPU cores') + + num_ipus = self.ipus if self.ipus is not None else 0 + rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs') + + if torch.cuda.is_available() and self._device_type != DeviceType.GPU: + rank_zero_warn( + "GPU available but not used. Set the gpus flag in your trainer" + " `Trainer(gpus=1)` or script `--gpus=1`." + ) + + if _TPU_AVAILABLE and self._device_type != DeviceType.TPU: + rank_zero_warn( + "TPU available but not used. Set the `tpu_cores` flag in your trainer" + " `Trainer(tpu_cores=8)` or script `--tpu_cores=8`." + ) + + if _IPU_AVAILABLE and self._device_type != DeviceType.IPU: + rank_zero_warn( + "IPU available but not used. Set the `ipus` flag in your trainer" + " `Trainer(ipus=8)` or script `--ipus=8`." + ) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py deleted file mode 100644 index ea33241b7a4af..0000000000000 --- a/pytorch_lightning/trainer/training_loop.py +++ /dev/null @@ -1,944 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
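# --- illustrative sketch (editorial, not part of this patch) ---------------
# `_log_device_info` above applies the same report-then-warn pattern once per
# accelerator type (GPU/TPU/IPU): report availability and usage, then warn
# when the hardware is available but unused. The pattern reduced to a single
# helper, with illustrative names rather than the real Trainer API:
import logging

log = logging.getLogger(__name__)


def _log_device(name: str, available: bool, used: int, flag: str) -> None:
    log.info(f"{name} available: {available}, using: {used} {name}s")
    if available and not used:
        log.warning(f"{name} available but not used. Set the `{flag}` flag in your `Trainer`.")


_log_device("GPU", available=True, used=0, flag="gpus")  # would emit the warning
# ---------------------------------------------------------------------------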
- -from collections import OrderedDict -from contextlib import contextmanager, suppress -from copy import copy -from functools import partial, update_wrapper -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from torch.optim import Optimizer - -from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.core.step_result import Result -from pytorch_lightning.plugins import ParallelPlugin -from pytorch_lightning.trainer.supporters import TensorRunningAccum -from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.finite_checks import detect_nan_parameters -from pytorch_lightning.utilities.grads import grad_norm -from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.parsing import AttributeDict -from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature -from pytorch_lightning.utilities.warnings import WarningCache - - -class TrainLoop: - - def __init__( - self, - trainer, - max_epochs: Optional[int], - min_epochs: Optional[int], - max_steps: Optional[int], - min_steps: Optional[int], - num_sanity_val_steps: int, - ): - self.trainer = trainer - self.accumulated_loss = None - self.warning_cache = WarningCache() - self._teardown_already_run = False - self.running_loss = TensorRunningAccum(window_length=20) - self._skip_backward = False - self._optimizer_freq_cumsum = None - self._hiddens = None - - self.global_step = 0 - self.current_epoch = 0 - self.trainer.should_stop = False - - # the total batch index across all epochs - self.total_batch_idx = 0 - # the current batch index in the loop that runs over the dataloader(s) - self.batch_idx = 0 - # the current split index when the batch gets split into chunks in truncated backprop through time - self.split_idx = None - - self.trainer.num_training_batches = 0 - self.trainer.train_dataloader = None - - # If neither max_epochs or max_steps is set, then use existing default of max_epochs = 1000 - self.max_epochs = 1000 if (max_epochs is None and max_steps is None) else max_epochs - # If neither min_epochs or min_steps is set, then use existing default of min_epochs = 1 - self.min_epochs = 1 if (min_epochs is None and min_steps is None) else min_epochs - self.max_steps = max_steps - self.min_steps = min_steps - - if num_sanity_val_steps == -1: - self.trainer.num_sanity_val_steps = float("inf") - else: - self.trainer.num_sanity_val_steps = num_sanity_val_steps - - @property - def num_active_optimizers(self) -> int: - return len(self.get_active_optimizers()) - - @property - def optimizer_freq_cumsum(self): - if self._optimizer_freq_cumsum is None: - self._optimizer_freq_cumsum = np.cumsum(self.trainer.optimizer_frequencies) - return self._optimizer_freq_cumsum - - def should_skip_training(self) -> bool: - should_by_max_steps = self.max_steps is not None and self.global_step >= self.max_steps - should_by_epoch = self.max_epochs is not None and self.current_epoch >= self.max_epochs - return should_by_max_steps or should_by_epoch or self.trainer.num_training_batches == 0 - - def on_train_start(self): - # hook - self.trainer.call_hook("on_train_start") - - def on_train_end(self): - if self._teardown_already_run: - return - self._teardown_already_run = True - - # hook - self.trainer.call_hook("on_train_end") - - # todo: TPU 8 cores hangs in flush with TensorBoard. 
Might do for all loggers. - # It might be related to xla tensors blocked when moving the cpu - # kill loggers - if self.trainer.logger is not None: - self.trainer.logger.finalize("success") - - # summarize profile results - self.trainer.profiler.describe() - - # give accelerators a chance to finish - self.trainer.accelerator.on_train_end() - - # reset bookkeeping - self.trainer.state.stage = None - - def on_train_epoch_start(self, epoch): - - # update training progress in trainer - self.current_epoch = epoch - - model = self.trainer.lightning_module - - # reset train dataloader - if epoch != 0 and self.trainer.reload_dataloaders_every_epoch: - self.trainer.reset_train_dataloader(model) - - # todo: specify the possible exception - with suppress(Exception): - # set seed for distributed sampler (enables shuffling for each epoch) - self.trainer.train_dataloader.sampler.set_epoch(epoch) - - # changing gradient according accumulation_scheduler - self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module) - - # stores accumulated grad fractions per batch - self.accumulated_loss = TensorRunningAccum(window_length=self.trainer.accumulate_grad_batches) - - # hook - self.trainer.call_hook("on_epoch_start") - self.trainer.call_hook("on_train_epoch_start") - - def on_train_batch_end(self, epoch_output, batch_end_outputs, batch, batch_idx, dataloader_idx): - batch_end_outputs = [opt_idx_out for opt_idx_out in batch_end_outputs if len(opt_idx_out)] - - processed_batch_end_outputs = TrainLoop._prepare_outputs(batch_end_outputs, batch_mode=True) - - # hook - self.trainer.call_hook('on_train_batch_end', processed_batch_end_outputs, batch, batch_idx, dataloader_idx) - self.trainer.call_hook('on_batch_end') - - # figure out what to track for epoch end - self.track_epoch_end_reduce_metrics(epoch_output, batch_end_outputs) - - # reset batch logger internals - self.trainer.logger_connector.on_train_batch_end() - - def reset_train_val_dataloaders(self, model) -> None: - """ - Resets train and val dataloaders if none are attached to the trainer. - - The val dataloader must be initialized before training loop starts, as the training loop - inspects the val dataloader to determine whether to run the evaluation loop. - """ - if self.trainer.train_dataloader is None: - self.trainer.reset_train_dataloader(model) - - if self.trainer.val_dataloaders is None: - self.trainer.reset_val_dataloader(model) - - def track_epoch_end_reduce_metrics(self, epoch_output, batch_end_outputs): - - hook_overridden = self._should_add_batch_output_to_epoch_output() - - # track the outputs to reduce at the end of the epoch - for opt_idx, opt_outputs in enumerate(batch_end_outputs): - sample_output = opt_outputs[-1] - - # decide if we need to reduce at the end of the epoch automatically - auto_reduce_tng_result = isinstance(sample_output, Result) and sample_output.should_reduce_on_epoch_end - - # only track when a) it needs to be autoreduced OR b) the user wants to manually reduce on epoch end - if not (hook_overridden or auto_reduce_tng_result): - continue - - # with 1 step (no tbptt) don't use a sequence at epoch end - if isinstance(opt_outputs, list) and len(opt_outputs) == 1 and not isinstance(opt_outputs[0], Result): - opt_outputs = opt_outputs[0] - - epoch_output[opt_idx].append(opt_outputs) - - def _should_add_batch_output_to_epoch_output(self) -> bool: - # We add to the epoch outputs if - # 1. The model defines training_epoch_end OR - # 2. 
The model overrides on_train_epoch_end which has `outputs` in the signature - # TODO: in v1.5 this only needs to check if training_epoch_end is overridden - lightning_module = self.trainer.lightning_module - if is_overridden("training_epoch_end", model=lightning_module): - return True - - if is_overridden("on_train_epoch_end", model=lightning_module): - model_hook_fx = getattr(lightning_module, "on_train_epoch_end") - if is_param_in_hook_signature(model_hook_fx, "outputs"): - return True - - return False - - def get_active_optimizers(self, batch_idx: Optional[int] = None) -> List[Tuple[int, Optimizer]]: - """ - Returns the currently active optimizers. When multiple optimizers are used with different frequencies, - only one of the optimizers is active at a time. - - Returns: - A list of tuples (opt_idx, optimizer) of currently active optimizers. - """ - if not self.trainer.optimizer_frequencies: - # call training_step once per optimizer - return list(enumerate(self.trainer.optimizers)) - - batch_idx = self.total_batch_idx if batch_idx is None else batch_idx - optimizers_loop_length = self.optimizer_freq_cumsum[-1] - current_place_in_loop = batch_idx % optimizers_loop_length - - # find optimzier index by looking for the first {item > current_place} in the cumsum list - opt_idx = int(np.argmax(self.optimizer_freq_cumsum > current_place_in_loop)) - return [(opt_idx, self.trainer.optimizers[opt_idx])] - - def on_after_backward(self, training_step_output, batch_idx, untouched_loss): - training_step_output.detach() - - # insert after step hook - self.trainer.call_hook("on_after_backward") - - # when in dev debugging track the losses - self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach()) - - def _check_training_step_output(self, training_step_output): - if isinstance(training_step_output, torch.Tensor) and not self.trainer.lightning_module.automatic_optimization: - if training_step_output.grad_fn is None: - # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... - raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") - - def training_step(self, split_batch, batch_idx, opt_idx, hiddens): - # give the PL module a result for logging - model_ref = self.trainer.lightning_module - - with self.trainer.profiler.profile("model_forward"): - step_kwargs = self._build_kwargs(split_batch, batch_idx, opt_idx, hiddens) - - # manually capture logged metrics - model_ref._current_fx_name = 'training_step' - model_ref._results = Result() - with self.trainer.profiler.profile("training_step"): - training_step_output = self.trainer.accelerator.training_step(step_kwargs) - self.trainer.accelerator.post_training_step() - - self.trainer.logger_connector.cache_logged_metrics() - - self._check_training_step_output(training_step_output) - - training_step_output = self.trainer.call_hook("training_step_end", training_step_output) - - training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( - training_step_output, split_batch - ) - if training_step_output_for_epoch_end is None: - return - - # enable empty loss when using manual opt - closure_loss = None - untouched_loss = None - - if self.trainer.lightning_module.automatic_optimization: - # accumulate loss. if accumulate_grad_batches==1, no effect - closure_loss = training_step_output.minimize / self.trainer.accumulate_grad_batches - - # the loss will get scaled for amp. 
avoid any modifications to it - untouched_loss = closure_loss.detach().clone() - - # result - result = AttributeDict( - closure_loss=closure_loss, - loss=untouched_loss, - training_step_output=training_step_output, - training_step_output_for_epoch_end=training_step_output_for_epoch_end, - ) - return result - - def _process_training_step_output(self, training_step_output, split_batch): - training_step_output_for_epoch_end = training_step_output - - # enable validation_step return None - if training_step_output_for_epoch_end is None: - return None, None - - result = self.trainer.lightning_module._results - - loss = None - hiddens = None - result["extra"] = {} - - # handle dict return - if isinstance(training_step_output, dict): - loss = training_step_output.pop("loss", None) - hiddens = training_step_output.pop("hiddens", None) - if hiddens is not None: - hiddens = hiddens.detach() - result["extra"] = training_step_output - - # handle scalar return - elif isinstance(training_step_output, torch.Tensor): - loss = training_step_output - - # map to results under the hood - result.minimize = loss - self._hiddens = hiddens - - # track batch for manual reduction with result - result.track_batch_size(len(split_batch)) - - # track metrics without grads for epoch reduction - training_step_output_for_epoch_end = copy(result) - training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach() - if self.trainer.move_metrics_to_cpu: - training_step_output_for_epoch_end = training_step_output_for_epoch_end.cpu() - - return training_step_output_for_epoch_end, result - - @staticmethod - def _prepare_outputs( - outputs: List[List[List[Result]]], - batch_mode: bool, - ) -> Union[List[List[List[Dict]]], List[List[Dict]], List[Dict], Dict]: - """ - Extract required information from batch or epoch end results. - - Args: - outputs: A 3-dimensional list of ``Result`` objects with dimensions: - [optimizer outs][batch outs][tbptt steps]. - - batch_mode: If True, ignore the batch output dimension. - - Returns: - The cleaned outputs with ``Result`` objects converted to dictionaries. All list dimensions of size one will - be collapsed. 
- """ - processed_outputs = [] - for opt_outputs in outputs: - # handle an edge case where an optimizer output is the empty list - if len(opt_outputs) == 0: - continue - - processed_batch_outputs = [] - - if batch_mode: - opt_outputs = [opt_outputs] - - for batch_outputs in opt_outputs: - processed_tbptt_outputs = [] - - for tbptt_output in batch_outputs: - out = tbptt_output.extra - out['loss'] = tbptt_output.minimize - processed_tbptt_outputs.append(out) - - # if there was only one tbptt step then we can collapse that dimension - if len(processed_tbptt_outputs) == 1: - processed_tbptt_outputs = processed_tbptt_outputs[0] - processed_batch_outputs.append(processed_tbptt_outputs) - - # batch_outputs should be just one dict (or a list of dicts if using tbptt) per optimizer - if batch_mode: - processed_batch_outputs = processed_batch_outputs[0] - processed_outputs.append(processed_batch_outputs) - - # if there is only one optimiser then we collapse that dimension - if len(processed_outputs) == 1: - processed_outputs = processed_outputs[0] - return processed_outputs - - def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure): - model_ref = self.trainer.lightning_module - - is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - using_native_amp = self.trainer.amp_backend == AMPType.NATIVE - - # native amp + lbfgs is a no go right now - if using_native_amp and is_lbfgs: - raise MisconfigurationException( - 'native PyTorch amp and lbfgs are not compatible.' - ' To request, please file a Github issue in PyTorch and tag @mcarilli' - ) - - # wraps into LightningOptimizer only for running step - optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx) - - # model hook - model_ref.optimizer_step( - self.trainer.current_epoch, - batch_idx, - optimizer, - opt_idx, - train_step_and_backward_closure, - on_tpu=self.trainer._device_type == DeviceType.TPU and _TPU_AVAILABLE, - using_native_amp=using_native_amp, - using_lbfgs=is_lbfgs, - ) - - def on_before_zero_grad(self, optimizer): - self.trainer.call_hook('on_before_zero_grad', optimizer) - - def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): - self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) - - def track_and_norm_grad(self, optimizer) -> dict: - # track gradient norms - grad_norm_dict = self._track_gradient_norm() - - # clip gradients - self.trainer.accelerator.clip_gradients( - optimizer, self.trainer.gradient_clip_val, gradient_clip_algorithm=self.trainer.gradient_clip_algorithm - ) - return grad_norm_dict - - def _track_gradient_norm(self): - grad_norm_dict = {} - if (self.global_step + 1) % self.trainer.log_every_n_steps == 0: - if float(self.trainer.track_grad_norm) > 0: - model = self.trainer.lightning_module - grad_norm_dict = grad_norm(model, self.trainer.track_grad_norm) - return grad_norm_dict - - def _tbptt_split_batch(self, batch: Any) -> List[Any]: - splits = [batch] - truncated_bptt_enabled = self._truncated_bptt_enabled() - if truncated_bptt_enabled: - model_ref = self.trainer.lightning_module - with self.trainer.profiler.profile("tbptt_split_batch"): - splits = model_ref.tbptt_split_batch(batch, self._truncated_bptt_steps()) - return splits - - def run_training_epoch(self): - # modify dataloader if needed (ddp, etc...) 
- train_dataloader = self.trainer.accelerator.process_dataloader(self.trainer.train_dataloader) - - # track epoch output - epoch_output = [[] for _ in range(self.num_active_optimizers)] - - train_dataloader = self.trainer.data_connector.get_profiled_train_dataloader(train_dataloader) - dataloader_idx = 0 - batch_idx = None - - for batch_idx, (batch, is_last_batch) in train_dataloader: - self.batch_idx = batch_idx - - # ------------------------------------ - # TRAINING_STEP + TRAINING_STEP_END - # ------------------------------------ - with self.trainer.profiler.profile("run_training_batch"): - batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx) - - # when returning -1 from train_step, we end epoch early - if batch_output.signal == -1: - break - - # hook - # TODO: add outputs to batches - self.on_train_batch_end( - epoch_output, - batch_output.training_step_output_for_epoch_end, - batch, - batch_idx, - dataloader_idx, - ) - - # ----------------------------------------- - # SAVE METRICS TO LOGGERS - # ----------------------------------------- - self.trainer.logger_connector.log_train_step_metrics(batch_output) - - # ----------------------------------------- - # VALIDATE IF NEEDED - # ----------------------------------------- - should_check_val = self._should_check_val_fx(batch_idx, is_last_batch) - if should_check_val: - self.trainer.validating = True - self.trainer._run_evaluation() - self.trainer.training = True - - # ----------------------------------------- - # SAVE LOGGERS (ie: Tensorboard, etc...) - # ----------------------------------------- - self.save_loggers_on_train_batch_end() - - # update LR schedulers - self.update_lr_schedulers('step') - self.trainer.checkpoint_connector.has_trained = True - - self.total_batch_idx += 1 - - # progress global step according to grads progress - self.increment_accumulated_grad_global_step() - - max_steps_reached = (self.max_steps is not None and self.max_steps <= self.global_step) - if max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(is_last_batch): - break - - if batch_idx is None: - # dataloader/iterator did not produce a batch - return - - # handle epoch_output on epoch end - # TODO: this can log so ModelCheckpoint won't have access to them since the logger conector is updated after. - self.on_train_epoch_end(epoch_output) - - # the global step is manually decreased here due to backwards compatibility with existing loggers - # as they expect that the same step is used when logging epoch end metrics even when the batch loop has - # finished. this means the attribute does not exactly track the number of optimizer steps applied. 
- # TODO(@carmocca): deprecate and rename so users don't get confused - self.global_step -= 1 - # log epoch metrics - self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output) - self.global_step += 1 - - self.update_lr_schedulers('epoch') - - def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None: - # inform logger the batch loop has finished - self.trainer.logger_connector.on_train_epoch_end() - - # prepare epoch output - processed_epoch_output = TrainLoop._prepare_outputs(epoch_output, batch_mode=False) - - # get the model and call model.training_epoch_end - model = self.trainer.lightning_module - - if is_overridden('training_epoch_end', model=model): - # run training_epoch_end - # refresh the result for custom logging at the epoch level - model._current_fx_name = 'training_epoch_end' - training_epoch_end_output = model.training_epoch_end(processed_epoch_output) - - if training_epoch_end_output is not None: - raise MisconfigurationException( - 'training_epoch_end expects a return of None. ' - 'HINT: remove the return statement in training_epoch_end' - ) - - # capture logging - self.trainer.logger_connector.cache_logged_metrics() - - # call train epoch end hooks - self._on_train_epoch_end_hook(processed_epoch_output) - self.trainer.call_hook('on_epoch_end') - - def _on_train_epoch_end_hook(self, processed_epoch_output) -> None: - # We cannot rely on Trainer.call_hook because the signatures might be different across - # lightning module and callback - # As a result, we need to inspect if the module accepts `outputs` in `on_train_epoch_end` - - # This implementation is copied from Trainer.call_hook - hook_name = "on_train_epoch_end" - - # set hook_name to model + reset Result obj - skip = self.trainer._reset_result_and_set_fx_name(hook_name) - - # always profile hooks - with self.trainer.profiler.profile(hook_name): - - # first call trainer hook - if hasattr(self.trainer, hook_name): - trainer_hook = getattr(self.trainer, hook_name) - trainer_hook(processed_epoch_output) - - # next call hook in lightningModule - model_ref = self.trainer.lightning_module - if is_overridden(hook_name, model_ref): - hook_fx = getattr(model_ref, hook_name) - if is_param_in_hook_signature(hook_fx, "outputs"): - self.warning_cache.warn( - "The signature of `ModelHooks.on_train_epoch_end` has changed in v1.3." - " `outputs` parameter has been deprecated." - " Support for the old signature will be removed in v1.5", DeprecationWarning - ) - model_ref.on_train_epoch_end(processed_epoch_output) - else: - model_ref.on_train_epoch_end() - - # if the PL module doesn't have the hook then call the accelerator - # used to auto-reduce things for the user with Results obj - elif hasattr(self.trainer.accelerator, hook_name): - accelerator_hook = getattr(self.trainer.accelerator, hook_name) - accelerator_hook() - - if not skip: - self.trainer._cache_logged_metrics() - - def run_training_batch(self, batch, batch_idx, dataloader_idx): - # track grad norms - grad_norm_dict = {} - - # bookkeeping - self._hiddens = None - - optimizers = list(enumerate(self.trainer.optimizers)) - - # track all outputs across time and num of optimizers - batch_outputs = [[] for _ in range(len(optimizers))] - - if batch is None: - self.warning_cache.warn("train_dataloader yielded None. 
If this was on purpose, ignore this warning...") - return AttributeDict( - signal=0, - grad_norm_dict={}, - training_step_output_for_epoch_end=batch_outputs, - ) - - # hook - response = self.trainer.call_hook("on_batch_start") - if response == -1: - return AttributeDict(signal=-1, grad_norm_dict={}) - - # hook - response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx) - if response == -1: - return AttributeDict(signal=-1, grad_norm_dict={}) - - # lightning module hook - splits = self._tbptt_split_batch(batch) - - for split_idx, split_batch in enumerate(splits): - self.split_idx = split_idx - - if self.trainer.lightning_module.automatic_optimization: - for opt_idx, optimizer in self.get_active_optimizers(batch_idx): - result = self._run_optimization(batch_idx, split_idx, split_batch, opt_idx, optimizer) - if result: - batch_outputs[opt_idx].append(result.training_step_output_for_epoch_end) - grad_norm_dict = result.get("grad_norm_dict", {}) - else: - # in manual optimization, there is no looping over optimizers - result = self._run_optimization(batch_idx, split_idx, split_batch) - if result: - batch_outputs[0].append(result.training_step_output_for_epoch_end) - - output = AttributeDict( - signal=0, - # todo: Properly aggregate grad_norm accros opt_idx and split_idx - grad_norm_dict=grad_norm_dict, - training_step_output_for_epoch_end=batch_outputs, - ) - return output - - def _run_optimization(self, batch_idx, split_idx, split_batch, opt_idx=0, optimizer=None): - # TODO: In v1.5, when optimizer_idx gets removed from training_step in manual_optimization, change - # opt_idx=0 to opt_idx=None in the signature here - - # toggle model params + set info to logger_connector - self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer) - - result = AttributeDict() - closure = self.make_closure(split_batch, batch_idx, opt_idx, optimizer, self._hiddens, result) - - if self.should_accumulate(): - # For gradient accumulation - - # ------------------- - # calculate loss (train step + train step end) - # ------------------- - # automatic_optimization=True: perform ddp sync only when performing optimizer_step - # automatic_optimization=False: don't block synchronization here - with self.block_ddp_sync_behaviour(): - closure() - - # ------------------------------ - # BACKWARD PASS - # ------------------------------ - # gradient update with accumulated gradients - else: - if self.trainer.lightning_module.automatic_optimization: - self.optimizer_step(optimizer, opt_idx, batch_idx, closure) - if len(self.trainer.optimizers) > 1: - # revert back to previous state - self.trainer.lightning_module.untoggle_optimizer(opt_idx) - else: - result = self.training_step(split_batch, batch_idx, opt_idx, self._hiddens) - - if not result: - # user decided to skip optimization - return result - - # update running loss + reset accumulated loss - self.update_running_loss(result.loss) - - self._process_closure_result(result) - return result - - def training_step_and_backward_closure( - self, - split_batch: Any, - batch_idx: int, - opt_idx: int, - optimizer: Optimizer, - hiddens, - return_result: AttributeDict, - ) -> Optional[torch.Tensor]: - - step_result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) - if step_result is not None: - return_result.update(step_result) - return return_result.loss - - def make_closure(self, *closure_args, **closure_kwargs: Any) -> Callable: - """ Wraps the training step closure into a partial object which will be 
called within ``optimizer.step``. """ - partial_func = partial(self.training_step_and_backward_closure, *closure_args, **closure_kwargs) - return update_wrapper(partial_func, self.training_step_and_backward_closure) - - @contextmanager - def block_ddp_sync_behaviour(self, should_block_sync: bool = False): - """ - automatic_optimization = True - Blocks ddp sync gradients behaviour on backwards pass. - This is useful for skipping sync when accumulating gradients, reducing communication overhead - - automatic_optimization = False - do not block ddp gradient sync when using manual optimization - as gradients are needed within the training step - - Returns: - context manager with sync behaviour off - - """ - if ( - isinstance(self.trainer.training_type_plugin, ParallelPlugin) - and (self.trainer.lightning_module.automatic_optimization or should_block_sync) - ): - with self.trainer.training_type_plugin.block_backward_sync(): - yield None - else: - yield None - - def _process_closure_result(self, opt_closure_result: Optional[AttributeDict]) -> None: - if not opt_closure_result: - return - - # cache metrics - self.trainer.logger_connector.cache_training_step_metrics(opt_closure_result) - - # check if loss or model weights are nan - if self.trainer.terminate_on_nan: - self._check_finite(opt_closure_result.loss) - - def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens): - """Wrap forward, zero_grad and backward in a closure so second order methods work""" - with self.trainer.profiler.profile("training_step_and_backward"): - # lightning module hook - result = self.training_step(split_batch, batch_idx, opt_idx, hiddens) - - if not self._skip_backward and self.trainer.lightning_module.automatic_optimization: - is_first_batch_to_accumulate = batch_idx % self.trainer.accumulate_grad_batches == 0 - - if is_first_batch_to_accumulate: - self.on_before_zero_grad(optimizer) - self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) - - # backward pass - if result is not None: - with self.trainer.profiler.profile("backward"): - self.backward(result, optimizer, opt_idx) - - # hook - call this hook only - # when gradients have finished to accumulate - if not self.should_accumulate(): - self.on_after_backward(result.training_step_output, batch_idx, result.loss) - - # check if loss or model weights are nan - if self.trainer.terminate_on_nan: - self._check_finite(result.loss) - - else: - self.warning_cache.warn( - "training_step returned None. If this was on purpose, ignore this warning..." 
- ) - - return result - - def _check_finite(self, loss: torch.Tensor) -> None: - if not torch.isfinite(loss).all(): - raise ValueError(f'The loss returned in `training_step` is {loss}.') - model = self.trainer.lightning_module - detect_nan_parameters(model) - - def backward(self, result, optimizer, opt_idx, *args, **kwargs): - self.trainer.dev_debugger.track_event("backward_call") - - should_accumulate = self.should_accumulate() - - # backward can be called manually in the training loop - if isinstance(result, torch.Tensor): - self.trainer.accelerator.backward(result, optimizer, opt_idx, should_accumulate, *args, **kwargs) - else: - result.closure_loss = self.trainer.accelerator.backward( - result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs - ) - - if not self.should_accumulate(): - # track gradients - result.grad_norm_dict = self.track_and_norm_grad(optimizer=optimizer) - - def update_lr_schedulers(self, interval: str) -> None: - if interval == "step": - finished_accumulation = self._accumulated_batches_reached() - finished_epoch = self._num_training_batches_reached() - if not finished_accumulation and not finished_epoch: - return - self.trainer.optimizer_connector.update_learning_rates( - interval=interval, - opt_indices=[opt_idx for opt_idx, _ in self.get_active_optimizers()], - ) - - def increment_accumulated_grad_global_step(self): - num_accumulated_batches_reached = self._accumulated_batches_reached() - num_training_batches_reached = self._num_training_batches_reached() - - # progress global step according to grads progress - if num_accumulated_batches_reached or num_training_batches_reached: - self.global_step = self.trainer.accelerator.update_global_step(self.total_batch_idx, self.global_step) - - def _accumulated_batches_reached(self): - return (self.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0 - - def _num_training_batches_reached(self, is_last_batch=False): - return (self.batch_idx + 1) == self.trainer.num_training_batches or is_last_batch - - def should_accumulate(self): - # checks if backward or backward + optimizer step (via closure) - accumulation_done = self._accumulated_batches_reached() - is_final_batch = self._num_training_batches_reached() - return not (accumulation_done or is_final_batch) - - def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: - """ Decide if we should run validation. 
""" - if not self.trainer.enable_validation: - return False - - is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch == 0 - if not is_val_check_epoch: - return False - - # val_check_batch is inf for iterable datasets with no length defined - is_infinite_dataset = self.trainer.val_check_batch == float('inf') - if is_last_batch and is_infinite_dataset: - return True - - if self.trainer.should_stop: - return True - - # TODO: let training/eval loop handle logic around limit_*_batches and val_check_batch - is_val_check_batch = is_last_batch - if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset: - is_val_check_batch = (batch_idx + 1) % self.trainer.limit_train_batches == 0 - elif self.trainer.val_check_batch != float('inf'): - is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0 - return is_val_check_batch - - def _build_kwargs(self, batch, batch_idx, opt_idx, hiddens): - # enable not needing to add opt_idx to training_step - step_kwargs = OrderedDict([('batch', batch), ('batch_idx', batch_idx)]) - - lightning_module = self.trainer.lightning_module - - if len(self.trainer.optimizers) > 1: - training_step_fx = getattr(lightning_module, "training_step") - has_opt_idx_in_train_step = is_param_in_hook_signature(training_step_fx, "optimizer_idx") - if has_opt_idx_in_train_step: - if not lightning_module.automatic_optimization: - self.warning_cache.warn( - "`training_step` hook signature has changed in v1.3." - " `optimizer_idx` argument has been removed in case of manual optimization. Support for" - " the old signature will be removed in v1.5", DeprecationWarning - ) - step_kwargs['optimizer_idx'] = opt_idx - elif not has_opt_idx_in_train_step and self.trainer.lightning_module.automatic_optimization: - raise ValueError( - f"Your LightningModule defines {len(self.trainer.optimizers)} optimizers but" - ' `training_step` is missing the `optimizer_idx` argument.' - ) - - # pass hiddens if using tbptt - if self._truncated_bptt_enabled(): - step_kwargs['hiddens'] = hiddens - - return step_kwargs - - def _truncated_bptt_enabled(self) -> bool: - """ Temporary tbptt utilities until this flag is fully migrated to the lightning module. """ - return self._truncated_bptt_steps() > 0 - - def _truncated_bptt_steps(self) -> int: - lightning_module = self.trainer.lightning_module - # Give precedence to the LightningModule as the Trainer flag will be removed in v1.5 - if lightning_module.truncated_bptt_steps > 0: - return lightning_module.truncated_bptt_steps - return self.trainer.truncated_bptt_steps or 0 - - def save_loggers_on_train_batch_end(self): - # when loggers should save to disk - should_flush_logs = self.trainer.logger_connector.should_flush_logs - if should_flush_logs and self.trainer.is_global_zero and self.trainer.logger is not None: - self.trainer.logger.save() - - def run_train_split_start(self, split_idx, split_batch, opt_idx, optimizer): - # make sure only the gradients of the current optimizer's parameters are calculated - # in the training step to prevent dangling gradients in multiple-optimizer setup. 
- if self.trainer.lightning_module.automatic_optimization and len(self.trainer.optimizers) > 1: - model = self.trainer.lightning_module - model.toggle_optimizer(optimizer, opt_idx) - - # use to track metrics internally - self.trainer.logger_connector.on_train_split_start(split_idx, opt_idx, split_batch) - - def update_running_loss(self, current_loss: torch.Tensor) -> None: - if self.trainer.lightning_module.automatic_optimization: - # track total loss for logging (avoid mem leaks) - self.accumulated_loss.append(current_loss) - - accumulated_loss = self.accumulated_loss.mean() - - if accumulated_loss is not None: - # calculate running loss for display - self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches) - - # reset for next set of accumulated grads - self.accumulated_loss.reset() diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py index a45c9436dbdb7..beecc5e2a764d 100644 --- a/pytorch_lightning/trainer/training_tricks.py +++ b/pytorch_lightning/trainer/training_tricks.py @@ -18,7 +18,7 @@ import torch from torch import Tensor -from pytorch_lightning.core.lightning import LightningModule +import pytorch_lightning as pl from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.finite_checks import detect_nan_parameters, print_nan_gradients @@ -34,7 +34,7 @@ class TrainerTrainingTricksMixin(ABC): # this is just a summary on variables used in this abstract class, # the proper values/initialisation should be done in child class - lightning_module: LightningModule + lightning_module: 'pl.LightningModule' def print_nan_gradients(self) -> None: rank_zero_deprecation( diff --git a/pytorch_lightning/tuner/batch_size_scaling.py b/pytorch_lightning/tuner/batch_size_scaling.py index d114c36a60104..f23a7f883c5a2 100644 --- a/pytorch_lightning/tuner/batch_size_scaling.py +++ b/pytorch_lightning/tuner/batch_size_scaling.py @@ -17,7 +17,7 @@ import pytorch_lightning as pl from pytorch_lightning.loggers.base import DummyLogger -from pytorch_lightning.utilities import DeviceType, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.data import has_len from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -83,7 +83,7 @@ def scale_batch_size( # Restore initial state of model if trainer.is_global_zero: - trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer._device_type == DeviceType.GPU) + trainer.checkpoint_connector.restore(str(save_path)) fs = get_filesystem(str(save_path)) if fs.exists(save_path): fs.rm(save_path) @@ -115,8 +115,8 @@ def __scale_batch_dump_params(trainer: 'pl.Trainer') -> None: def __scale_batch_reset_params(trainer: 'pl.Trainer', model: 'pl.LightningModule', steps_per_trial: int) -> None: trainer.auto_scale_batch_size = None # prevent recursion trainer.auto_lr_find = False # avoid lr find being called multiple times - trainer.train_loop.current_epoch = 0 - trainer.train_loop.max_steps = steps_per_trial # take few steps + trainer.fit_loop.current_epoch = 0 + trainer.fit_loop.max_steps = steps_per_trial # take few steps trainer.weights_summary = None # not needed before full run trainer.logger = DummyLogger() trainer.callbacks = [] # not needed before full run @@ -127,8 +127,8 @@ def __scale_batch_reset_params(trainer: 'pl.Trainer', model: 'pl.LightningModule def __scale_batch_restore_params(trainer: 
'pl.Trainer') -> None: trainer.auto_lr_find = trainer.__dumped_params['auto_lr_find'] - trainer.train_loop.current_epoch = trainer.__dumped_params['current_epoch'] - trainer.train_loop.max_steps = trainer.__dumped_params['max_steps'] + trainer.fit_loop.current_epoch = trainer.__dumped_params['current_epoch'] + trainer.fit_loop.max_steps = trainer.__dumped_params['max_steps'] trainer.weights_summary = trainer.__dumped_params['weights_summary'] trainer.logger = trainer.__dumped_params['logger'] trainer.callbacks = trainer.__dumped_params['callbacks'] @@ -144,7 +144,7 @@ def _run_power_scaling( """ Batch scaling mode where the size is doubled at each iteration until an OOM error is encountered. """ for _ in range(max_trials): garbage_collection_cuda() - trainer.train_loop.global_step = 0 # reset after each try + trainer.fit_loop.global_step = 0 # reset after each try try: # Try fit trainer.tuner._run(model) @@ -178,7 +178,7 @@ def _run_binsearch_scaling( count = 0 while True: garbage_collection_cuda() - trainer.train_loop.global_step = 0 # reset after each try + trainer.fit_loop.global_step = 0 # reset after each try try: # Try fit trainer.tuner._run(model) diff --git a/pytorch_lightning/tuner/lr_finder.py b/pytorch_lightning/tuner/lr_finder.py index 71d145d921ff4..29a93d3916aea 100644 --- a/pytorch_lightning/tuner/lr_finder.py +++ b/pytorch_lightning/tuner/lr_finder.py @@ -25,7 +25,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks import Callback from pytorch_lightning.loggers.base import DummyLogger -from pytorch_lightning.utilities import DeviceType, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import lightning_hasattr, lightning_setattr @@ -230,7 +230,7 @@ def lr_find( trainer.logger = DummyLogger() # Max step set to number of iterations - trainer.train_loop.max_steps = num_training + trainer.fit_loop.max_steps = num_training # Disable standard progress bar for fit if trainer.progress_bar_callback: @@ -255,11 +255,11 @@ def lr_find( # Transfer results from callback to lr finder object lr_finder.results.update({'lr': trainer.callbacks[0].lrs, 'loss': trainer.callbacks[0].losses}) - lr_finder._total_batch_idx = trainer.train_loop.total_batch_idx # for debug purpose + lr_finder._total_batch_idx = trainer.fit_loop.total_batch_idx # for debug purpose # Reset model state if trainer.is_global_zero: - trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer._device_type == DeviceType.GPU) + trainer.checkpoint_connector.restore(str(save_path)) fs = get_filesystem(str(save_path)) if fs.exists(save_path): fs.rm(save_path) @@ -297,8 +297,8 @@ def __lr_finder_restore_params(trainer, model): trainer.auto_lr_find = trainer.__dumped_params['auto_lr_find'] trainer.logger = trainer.__dumped_params['logger'] trainer.callbacks = trainer.__dumped_params['callbacks'] - trainer.train_loop.max_steps = trainer.__dumped_params['max_steps'] - trainer.train_loop.current_epoch = trainer.__dumped_params['current_epoch'] + trainer.fit_loop.max_steps = trainer.__dumped_params['max_steps'] + trainer.fit_loop.current_epoch = trainer.__dumped_params['current_epoch'] model.configure_optimizers = trainer.__dumped_params['configure_optimizers'] del trainer.__dumped_params @@ -340,7 +340,7 @@ def __init__( def on_batch_start(self, trainer, pl_module): """ Called before each training batch, 
logs the lr that will be used """ - if (trainer.train_loop.batch_idx + 1) % trainer.accumulate_grad_batches != 0: + if (trainer.fit_loop.batch_idx + 1) % trainer.accumulate_grad_batches != 0: return if self.progress_bar_refresh_rate and self.progress_bar is None: @@ -350,13 +350,13 @@ def on_batch_start(self, trainer, pl_module): def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): """ Called when the training batch ends, logs the calculated loss """ - if (trainer.train_loop.batch_idx + 1) % trainer.accumulate_grad_batches != 0: + if (trainer.fit_loop.batch_idx + 1) % trainer.accumulate_grad_batches != 0: return if self.progress_bar: self.progress_bar.update() - current_loss = trainer.train_loop.running_loss.last().item() + current_loss = trainer.fit_loop.running_loss.last().item() current_step = trainer.global_step # Avg loss (loss with momentum) + smoothing @@ -366,7 +366,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, data # Check if we diverging if self.early_stop_threshold is not None: if current_step > 1 and smoothed_loss > self.early_stop_threshold * self.best_loss: - trainer.train_loop.max_steps = current_step # stop signal + trainer.fit_loop.max_steps = current_step # stop signal if self.progress_bar: self.progress_bar.close() diff --git a/pytorch_lightning/tuner/tuning.py b/pytorch_lightning/tuner/tuning.py index a25b950ee3fca..449f9d862ecef 100644 --- a/pytorch_lightning/tuner/tuning.py +++ b/pytorch_lightning/tuner/tuning.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict, List, Optional, Union - -from torch.utils.data import DataLoader +from typing import Any, Dict, Optional, Union import pytorch_lightning as pl from pytorch_lightning.trainer.states import TrainerStatus from pytorch_lightning.tuner.batch_size_scaling import scale_batch_size from pytorch_lightning.tuner.lr_finder import _LRFinder, lr_find +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS class Tuner: @@ -67,14 +66,15 @@ def _run(self, *args: Any, **kwargs: Any) -> None: def scale_batch_size( self, model: 'pl.LightningModule', - train_dataloader: Optional[DataLoader] = None, - val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + train_dataloaders: Optional[Union[TRAIN_DATALOADERS, 'pl.LightningDataModule']] = None, + val_dataloaders: Optional[EVAL_DATALOADERS] = None, datamodule: Optional['pl.LightningDataModule'] = None, mode: str = 'power', steps_per_trial: int = 3, init_val: int = 2, max_trials: int = 25, batch_arg_name: str = 'batch_size', + train_dataloader=None, # noqa TODO: remove with 1.6 ) -> Optional[int]: """ Iteratively try to find the largest batch size for a given model @@ -83,11 +83,11 @@ def scale_batch_size( Args: model: Model to tune. - train_dataloader: A Pytorch DataLoader with training samples. If the model has - a predefined train_dataloader method this will be skipped. + train_dataloaders: A collection of :class:`torch.utils.data.DataLoader` or a + :class:`~pytorch_lightning.core.datamodule.LightningDataModule` specifying training samples. + In the case of multiple dataloaders, please see this :ref:`page `. - val_dataloaders: Either a single Pytorch Dataloader or a list of them, specifying validation samples. 
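# Editorial sketch, not part of the diff: the LR-finder callback above keeps an
# exponential running average of the loss with Adam-style bias correction, and stops
# the search when the smoothed loss diverges past a multiple of the best loss seen.
# Standalone and illustrative only; `beta` mirrors the callback's smoothing factor.
def _smoothed_losses(losses, beta=0.98, early_stop_threshold=4.0):
    avg_loss, best_loss, out = 0.0, float('inf'), []
    for step, loss in enumerate(losses, start=1):
        avg_loss = beta * avg_loss + (1 - beta) * loss
        smoothed = avg_loss / (1 - beta ** step)  # bias-corrected running average
        if step > 1 and smoothed > early_stop_threshold * best_loss:
            break  # diverging: the callback signals this by capping `fit_loop.max_steps`
        best_loss = min(best_loss, smoothed)
        out.append(smoothed)
    return out

# the first smoothed value equals the first loss thanks to the bias correction
assert _smoothed_losses([2.0, 1.0])[0] == 2.0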
- If the model has a predefined val_dataloaders method this will be skipped + val_dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them specifying validation samples. datamodule: An instance of :class:`~pytorch_lightning.core.datamodule.LightningDataModule`. @@ -118,7 +118,8 @@ def scale_batch_size( self.trainer.auto_scale_batch_size = True result = self.trainer.tune( model, - train_dataloader=train_dataloader, + train_dataloaders=train_dataloaders, + train_dataloader=train_dataloader, # TODO: deprecated - remove with 1.6 val_dataloaders=val_dataloaders, datamodule=datamodule, scale_batch_size_kwargs={ @@ -135,8 +136,8 @@ def scale_batch_size( def lr_find( self, model: 'pl.LightningModule', - train_dataloader: Optional[DataLoader] = None, - val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, + train_dataloaders: Optional[Union[TRAIN_DATALOADERS, 'pl.LightningDataModule']] = None, + val_dataloaders: Optional[EVAL_DATALOADERS] = None, datamodule: Optional['pl.LightningDataModule'] = None, min_lr: float = 1e-8, max_lr: float = 1, @@ -144,6 +145,7 @@ def lr_find( mode: str = 'exponential', early_stop_threshold: float = 4.0, update_attr: bool = False, + train_dataloader=None, # noqa TODO: remove with 1.6 ) -> Optional[_LRFinder]: """ Enables the user to do a range test of good initial learning rates, @@ -152,11 +154,11 @@ def lr_find( Args: model: Model to tune. - train_dataloader: A Pytorch DataLoader with training samples. If the model has - a predefined train_dataloader method this will be skipped. + train_dataloaders: A collection of :class:`torch.utils.data.DataLoader` or a + :class:`~pytorch_lightning.core.datamodule.LightningDataModule` specifying training samples. + In the case of multiple dataloaders, please see this :ref:`page `. - val_dataloaders: Either a single Pytorch Dataloader or a list of them, specifying validation samples. - If the model has a predefined val_dataloaders method this will be skipped + val_dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them specifying validation samples. datamodule: An instance of :class:`~pytorch_lightning.core.datamodule.LightningDataModule`. 
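# Editorial sketch, not part of the diff: the tuner entry points above keep the
# deprecated `train_dataloader` kwarg alongside the new `train_dataloaders` and
# forward both to `Trainer.tune` until removal in 1.6. A generic rendering of that
# keep-and-forward deprecation pattern; the names and the exact merging performed
# inside `Trainer.tune` are assumptions for illustration.
import warnings

def tune(train_dataloaders=None, train_dataloader=None):
    if train_dataloader is not None:
        warnings.warn(
            '`train_dataloader` is deprecated, use `train_dataloaders` instead', DeprecationWarning
        )
        train_dataloaders = train_dataloaders if train_dataloaders is not None else train_dataloader
    return train_dataloaders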
@@ -185,7 +187,8 @@ def lr_find( self.trainer.auto_lr_find = True result = self.trainer.tune( model, - train_dataloader=train_dataloader, + train_dataloaders=train_dataloaders, + train_dataloader=train_dataloader, # TODO: deprecated - remove with 1.6 val_dataloaders=val_dataloaders, datamodule=datamodule, lr_find_kwargs={ diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 6664be43bef88..536b36ceb81b0 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -16,13 +16,7 @@ import numpy from pytorch_lightning.utilities.apply_func import move_data_to_device # noqa: F401 -from pytorch_lightning.utilities.distributed import ( # noqa: F401 - AllGatherGrad, - rank_zero_deprecation, - rank_zero_info, - rank_zero_only, - rank_zero_warn, -) +from pytorch_lightning.utilities.distributed import AllGatherGrad, rank_zero_info, rank_zero_only # noqa: F401 from pytorch_lightning.utilities.enums import ( # noqa: F401 AMPType, DeviceType, @@ -38,16 +32,16 @@ _FAIRSCALE_AVAILABLE, _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE, - _FAIRSCALE_PIPE_AVAILABLE, _GROUP_AVAILABLE, _HOROVOD_AVAILABLE, _HYDRA_AVAILABLE, _HYDRA_EXPERIMENTAL_AVAILABLE, + _IPU_AVAILABLE, _IS_INTERACTIVE, _module_available, _NATIVE_AMP_AVAILABLE, _OMEGACONF_AVAILABLE, - _RPC_AVAILABLE, + _POPTORCH_AVAILABLE, _TORCH_GREATER_EQUAL_1_5, _TORCH_GREATER_EQUAL_1_6, _TORCH_GREATER_EQUAL_1_7, @@ -61,6 +55,7 @@ _XLA_AVAILABLE, ) from pytorch_lightning.utilities.parsing import AttributeDict, flatten_dict, is_picklable # noqa: F401 +from pytorch_lightning.utilities.warnings import rank_zero_deprecation, rank_zero_warn # noqa: F401 FLOAT16_EPSILON = numpy.finfo(numpy.float16).eps FLOAT32_EPSILON = numpy.finfo(numpy.float32).eps diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 1cbab2fb8dee9..606eb37dd9730 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
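# Editorial note on the `utilities/__init__.py` change above: the warning helpers now
# live in `utilities.warnings` and are merely re-exported from the package root, so
# both import paths below should resolve to the same object. A quick sanity check,
# not code from the patch:
from pytorch_lightning.utilities import rank_zero_warn as from_root
from pytorch_lightning.utilities.warnings import rank_zero_warn as from_module
assert from_root is from_module  # re-export, not a copy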
+import dataclasses import operator from abc import ABC +from collections import OrderedDict from collections.abc import Mapping, Sequence from copy import copy from functools import partial @@ -54,12 +56,23 @@ def from_numpy(value, device: torch.device = None): ] +def _is_namedtuple(obj: object) -> bool: + # https://github.com/pytorch/pytorch/blob/v1.8.1/torch/nn/parallel/scatter_gather.py#L4-L8 + return isinstance(obj, tuple) and hasattr(obj, "_asdict") and hasattr(obj, "_fields") + + +def _is_dataclass_instance(obj): + # https://docs.python.org/3/library/dataclasses.html#module-level-decorators-classes-and-functions + return dataclasses.is_dataclass(obj) and not isinstance(obj, type) + + def apply_to_collection( data: Any, dtype: Union[type, tuple], function: Callable, *args, wrong_dtype: Optional[Union[type, tuple]] = None, + include_none: bool = True, **kwargs ) -> Any: """ @@ -70,38 +83,108 @@ def apply_to_collection( dtype: the given function will be applied to all elements of this dtype function: the function to apply *args: positional arguments (will be forwarded to calls of ``function``) - wrong_dtype: the given function won't be applied if this type is specified and the given collections is of - the :attr:`wrong_type` even if it is of type :attr`dtype` + wrong_dtype: the given function won't be applied if this type is specified and the given collections + is of the ``wrong_dtype`` even if it is of type ``dtype`` + include_none: Whether to include an element if the output of ``function`` is ``None``. **kwargs: keyword arguments (will be forwarded to calls of ``function``) Returns: - the resulting collection + The resulting collection """ - elem_type = type(data) - # Breaking condition if isinstance(data, dtype) and (wrong_dtype is None or not isinstance(data, wrong_dtype)): return function(data, *args, **kwargs) + elem_type = type(data) + # Recursively apply to collection items if isinstance(data, Mapping): + out = [] + for k, v in data.items(): + v = apply_to_collection(v, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) + if include_none or v is not None: + out.append((k, v)) + return elem_type(OrderedDict(out)) + + is_namedtuple = _is_namedtuple(data) + is_sequence = isinstance(data, Sequence) and not isinstance(data, str) + if is_namedtuple or is_sequence: + out = [] + for d in data: + v = apply_to_collection(d, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) + if include_none or v is not None: + out.append(v) + return elem_type(*out) if is_namedtuple else elem_type(out) + + if _is_dataclass_instance(data): + out = dict() + for field in data.__dataclass_fields__: + v = apply_to_collection(getattr(data, field), dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) + if include_none or v is not None: + out[field] = v + return elem_type(**out) + + # data is neither of dtype, nor a collection + return data + + +def apply_to_collections( + data1: Optional[Any], + data2: Optional[Any], + dtype: Union[type, tuple], + function: Callable, + *args, + wrong_dtype: Optional[Union[type, tuple]] = None, + **kwargs +) -> Any: + """ + Zips two collections and applies a function to their items of a certain dtype. 
+ + Args: + data1: The first collection + data2: The second collection + dtype: the given function will be applied to all elements of this dtype + function: the function to apply + *args: positional arguments (will be forwarded to calls of ``function``) + wrong_dtype: the given function won't be applied if this type is specified and the given collections + is of the ``wrong_dtype`` even if it is of type ``dtype`` + **kwargs: keyword arguments (will be forwarded to calls of ``function``) + + Returns: + The resulting collection + + Raises: + AssertionError: + If sequence collections have different data sizes. + """ + if data1 is None and data2 is not None: + # in case they were passed reversed + data1, data2 = data2, None + + elem_type = type(data1) + + if isinstance(data1, dtype) and data2 is not None and (wrong_dtype is None or not isinstance(data1, wrong_dtype)): + return function(data1, data2, *args, **kwargs) + + if isinstance(data1, Mapping) and data2 is not None: + # use union because we want to fail if a key does not exist in both + zipped = {k: (data1[k], data2[k]) for k in data1.keys() | data2.keys()} return elem_type({ - k: apply_to_collection(v, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) - for k, v in data.items() + k: apply_to_collections(*v, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) + for k, v in zipped.items() }) - if isinstance(data, tuple) and hasattr(data, '_fields'): # named tuple - return elem_type( - *(apply_to_collection(d, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) for d in data) - ) + is_namedtuple = _is_namedtuple(data1) + is_sequence = isinstance(data1, Sequence) and not isinstance(data1, str) + if (is_namedtuple or is_sequence) and data2 is not None: + assert len(data1) == len(data2), 'Sequence collections have different sizes' + out = [ + apply_to_collections(v1, v2, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) + for v1, v2 in zip(data1, data2) + ] + return elem_type(*out) if is_namedtuple else elem_type(out) - if isinstance(data, Sequence) and not isinstance(data, str): - return elem_type([ - apply_to_collection(d, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) for d in data - ]) - - # data is neither of dtype, nor a collection - return data + return apply_to_collection(data1, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) class TransferableDataType(ABC): @@ -168,15 +251,15 @@ def batch_to(data): return apply_to_collection(batch, dtype=dtype, function=batch_to) -def convert_to_tensors(data, device: torch.device = None): +def convert_to_tensors(data: Any, device: torch.device) -> Any: if device is None: - raise MisconfigurationException("device (torch.device) should be provided.") + raise MisconfigurationException("`torch.device` should be provided.") for src_dtype, conversion_func in CONVERSION_DTYPES: - data = apply_to_collection(data, src_dtype, partial(conversion_func, device=device)) + data = apply_to_collection(data, src_dtype, conversion_func, device=device) - def _move_to_device_and_make_contiguous(t: torch.Tensor, device: torch.device): + def _move_to_device_and_make_contiguous(t: torch.Tensor, device: torch.device) -> torch.Tensor: return t.to(device).contiguous() - data = apply_to_collection(data, torch.Tensor, partial(_move_to_device_and_make_contiguous, device=device)) + data = apply_to_collection(data, torch.Tensor, _move_to_device_and_make_contiguous, device=device) return data diff --git a/pytorch_lightning/utilities/argparse.py 
b/pytorch_lightning/utilities/argparse.py index 6f91397bd0306..aebbcb41ac34f 100644 --- a/pytorch_lightning/utilities/argparse.py +++ b/pytorch_lightning/utilities/argparse.py @@ -46,7 +46,7 @@ def from_argparse_args(cls, args: Union[Namespace, ArgumentParser], **kwargs): # we only want to pass in valid Trainer args, the rest may be user specific valid_kwargs = inspect.signature(cls.__init__).parameters - trainer_kwargs = dict((name, params[name]) for name in valid_kwargs if name in params) + trainer_kwargs = {name: params[name] for name in valid_kwargs if name in params} trainer_kwargs.update(**kwargs) return cls(**trainer_kwargs) @@ -139,9 +139,8 @@ def _get_abbrev_qualified_cls_name(cls): if cls.__module__.startswith("pytorch_lightning."): # Abbreviate. return f"pl.{cls.__name__}" - else: - # Fully qualified. - return f"{cls.__module__}.{cls.__qualname__}" + # Fully qualified. + return f"{cls.__module__}.{cls.__qualname__}" def add_argparse_args( @@ -169,6 +168,10 @@ def add_argparse_args( Only arguments of the allowed types (str, float, int, bool) will extend the ``parent_parser``. + Raises: + RuntimeError: + If ``parent_parser`` is not an ``ArgumentParser`` instance + Examples: # Option 1: Default usage. @@ -254,8 +257,7 @@ def add_argparse_args( if use_argument_group: return parent_parser - else: - return parser + return parser def _parse_args_from_docstring(docstring: str) -> Dict[str, str]: @@ -284,8 +286,7 @@ def _parse_args_from_docstring(docstring: str) -> Dict[str, str]: def _gpus_allowed_type(x) -> Union[int, str]: if ',' in x: return str(x) - else: - return int(x) + return int(x) def _gpus_arg_default(x) -> Union[int, str]: # pragma: no-cover @@ -298,5 +299,4 @@ def _gpus_arg_default(x) -> Union[int, str]: # pragma: no-cover def _int_or_float_type(x) -> Union[int, float]: if '.' in str(x): return float(x) - else: - return int(x) + return int(x) diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py index 5dccad4ab9135..0edc50c14e8c4 100644 --- a/pytorch_lightning/utilities/cli.py +++ b/pytorch_lightning/utilities/cli.py @@ -12,15 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
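# Editorial sketch, not part of the diff: the `apply_to_collections` helper added to
# apply_func.py above zips two collections of identical structure and applies a binary
# function at the leaves. A small illustration of the intended semantics, assuming
# matching structures (mismatched sequence lengths raise an AssertionError):
import torch
from pytorch_lightning.utilities.apply_func import apply_to_collections

a = {'x': torch.tensor(1.), 'y': [torch.tensor(2.), torch.tensor(3.)]}
b = {'x': torch.tensor(10.), 'y': [torch.tensor(20.), torch.tensor(30.)]}
summed = apply_to_collections(a, b, torch.Tensor, torch.add)
# summed == {'x': tensor(11.), 'y': [tensor(22.), tensor(33.)]}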
import os +import warnings from argparse import Namespace -from typing import Any, Dict, Optional, Type, Union +from types import MethodType +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union + +from torch.optim import Optimizer from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.utilities import _module_available +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import seed_everything +from pytorch_lightning.utilities.types import LRSchedulerType, LRSchedulerTypeTuple _JSONARGPARSE_AVAILABLE = _module_available("jsonargparse") if _JSONARGPARSE_AVAILABLE: @@ -33,7 +40,7 @@ class LightningArgumentParser(ArgumentParser): """Extension of jsonargparse's ArgumentParser for pytorch-lightning""" - def __init__(self, *args, parse_as_dict: bool = True, **kwargs) -> None: + def __init__(self, *args: Any, parse_as_dict: bool = True, **kwargs: Any) -> None: """Initialize argument parser that supports configuration file input For full details of accepted arguments see `ArgumentParser.__init__ @@ -48,44 +55,124 @@ def __init__(self, *args, parse_as_dict: bool = True, **kwargs) -> None: self.add_argument( '--config', action=ActionConfigFile, help='Path to a configuration file in json or yaml format.' ) + self.callback_keys: List[str] = [] + self.optimizers_and_lr_schedulers: Dict[str, Tuple[Union[Type, Tuple[Type, ...]], str]] = {} def add_lightning_class_args( self, - lightning_class: Union[Type[Trainer], Type[LightningModule], Type[LightningDataModule]], + lightning_class: Union[Type[Trainer], Type[LightningModule], Type[LightningDataModule], Type[Callback]], nested_key: str, subclass_mode: bool = False - ) -> None: + ) -> List[str]: """ Adds arguments from a lightning class to a nested key of the parser Args: - lightning_class: Any subclass of {Trainer,LightningModule,LightningDataModule}. + lightning_class: Any subclass of {Trainer, LightningModule, LightningDataModule, Callback}. nested_key: Name of the nested namespace to store arguments. subclass_mode: Whether allow any subclass of the given class. """ - assert issubclass(lightning_class, (Trainer, LightningModule, LightningDataModule)) + assert issubclass(lightning_class, (Trainer, LightningModule, LightningDataModule, Callback)) + if issubclass(lightning_class, Callback): + self.callback_keys.append(nested_key) if subclass_mode: return self.add_subclass_arguments(lightning_class, nested_key, required=True) - return self.add_class_arguments(lightning_class, nested_key, fail_untyped=False) + return self.add_class_arguments( + lightning_class, + nested_key, + fail_untyped=False, + instantiate=not issubclass(lightning_class, Trainer), + ) + + def add_optimizer_args( + self, + optimizer_class: Union[Type[Optimizer], Tuple[Type[Optimizer], ...]], + nested_key: str = 'optimizer', + link_to: str = 'AUTOMATIC', + ) -> None: + """ + Adds arguments from an optimizer class to a nested key of the parser + + Args: + optimizer_class: Any subclass of torch.optim.Optimizer. + nested_key: Name of the nested namespace to store arguments. + link_to: Dot notation of a parser key to set arguments or AUTOMATIC. 
+ """ + if isinstance(optimizer_class, tuple): + assert all(issubclass(o, Optimizer) for o in optimizer_class) + else: + assert issubclass(optimizer_class, Optimizer) + kwargs = { + 'instantiate': False, + 'fail_untyped': False, + 'skip': {'params'}, + } + if isinstance(optimizer_class, tuple): + self.add_subclass_arguments(optimizer_class, nested_key, required=True, **kwargs) + else: + self.add_class_arguments(optimizer_class, nested_key, **kwargs) + self.optimizers_and_lr_schedulers[nested_key] = (optimizer_class, link_to) + + def add_lr_scheduler_args( + self, + lr_scheduler_class: Union[LRSchedulerType, Tuple[LRSchedulerType, ...]], + nested_key: str = 'lr_scheduler', + link_to: str = 'AUTOMATIC', + ) -> None: + """ + Adds arguments from a learning rate scheduler class to a nested key of the parser + + Args: + lr_scheduler_class: Any subclass of ``torch.optim.lr_scheduler.{_LRScheduler, ReduceLROnPlateau}``. + nested_key: Name of the nested namespace to store arguments. + link_to: Dot notation of a parser key to set arguments or AUTOMATIC. + """ + if isinstance(lr_scheduler_class, tuple): + assert all(issubclass(o, LRSchedulerTypeTuple) for o in lr_scheduler_class) + else: + assert issubclass(lr_scheduler_class, LRSchedulerTypeTuple) + kwargs = { + 'instantiate': False, + 'fail_untyped': False, + 'skip': {'optimizer'}, + } + if isinstance(lr_scheduler_class, tuple): + self.add_subclass_arguments(lr_scheduler_class, nested_key, required=True, **kwargs) + else: + self.add_class_arguments(lr_scheduler_class, nested_key, **kwargs) + self.optimizers_and_lr_schedulers[nested_key] = (lr_scheduler_class, link_to) class SaveConfigCallback(Callback): - """Saves a LightningCLI config to the log_dir when training starts""" + """Saves a LightningCLI config to the log_dir when training starts + + Raises: + RuntimeError: If the config file already exists in the directory to avoid overwriting a previous run + """ def __init__( self, parser: LightningArgumentParser, config: Union[Namespace, Dict[str, Any]], - config_filename: str = 'config.yaml' + config_filename: str, + overwrite: bool = False, ) -> None: self.parser = parser self.config = config self.config_filename = config_filename + self.overwrite = overwrite def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: log_dir = trainer.log_dir or trainer.default_root_dir config_path = os.path.join(log_dir, self.config_filename) - self.parser.save(self.config, config_path, skip_none=False) + if not self.overwrite and os.path.isfile(config_path): + raise RuntimeError( + f'{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting' + ' results of a previous run. You can delete the previous config file,' + ' set `LightningCLI(save_config_callback=None)` to disable config saving,' + ' or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file.' 
+ ) + self.parser.save(self.config, config_path, skip_none=False, overwrite=self.overwrite) class LightningCLI: @@ -95,7 +182,9 @@ def __init__( self, model_class: Type[LightningModule], datamodule_class: Type[LightningDataModule] = None, - save_config_callback: Type[SaveConfigCallback] = SaveConfigCallback, + save_config_callback: Optional[Type[SaveConfigCallback]] = SaveConfigCallback, + save_config_filename: str = 'config.yaml', + save_config_overwrite: bool = False, trainer_class: Type[Trainer] = Trainer, trainer_defaults: Dict[str, Any] = None, seed_everything_default: int = None, @@ -132,6 +221,8 @@ def __init__( model_class: :class:`~pytorch_lightning.core.lightning.LightningModule` class to train on. datamodule_class: An optional :class:`~pytorch_lightning.core.datamodule.LightningDataModule` class. save_config_callback: A callback class to save the training config. + save_config_filename: Filename for the config file. + save_config_overwrite: Whether to overwrite an existing config file. trainer_class: An optional subclass of the :class:`~pytorch_lightning.trainer.trainer.Trainer` class. trainer_defaults: Set to override Trainer defaults or add persistent callbacks. seed_everything_default: Default value for the :func:`~pytorch_lightning.utilities.seed.seed_everything` @@ -154,6 +245,8 @@ def __init__( self.model_class = model_class self.datamodule_class = datamodule_class self.save_config_callback = save_config_callback + self.save_config_filename = save_config_filename + self.save_config_overwrite = save_config_overwrite self.trainer_class = trainer_class self.trainer_defaults = {} if trainer_defaults is None else trainer_defaults self.seed_everything_default = seed_everything_default @@ -165,11 +258,13 @@ def __init__( self.init_parser() self.add_core_arguments_to_parser() self.add_arguments_to_parser(self.parser) + self.link_optimizers_and_lr_schedulers() self.parse_arguments() if self.config['seed_everything'] is not None: seed_everything(self.config['seed_everything'], workers=True) self.before_instantiate_classes() self.instantiate_classes() + self.add_configure_optimizers_method_to_model() self.prepare_fit_kwargs() self.before_fit() self.fit() @@ -201,6 +296,17 @@ def add_arguments_to_parser(self, parser: LightningArgumentParser) -> None: parser: The argument parser object to which arguments can be added """ + def link_optimizers_and_lr_schedulers(self) -> None: + """Creates argument links for optimizers and lr_schedulers that specified a link_to""" + for key, (class_type, link_to) in self.parser.optimizers_and_lr_schedulers.items(): + if link_to == 'AUTOMATIC': + continue + if isinstance(class_type, tuple): + self.parser.link_arguments(key, link_to) + else: + add_class_path = _add_class_path_generator(class_type) + self.parser.link_arguments(key, link_to, compute_fn=add_class_path) + def parse_arguments(self) -> None: """Parses command line arguments and stores it in self.config""" self.config = self.parser.parse_args() @@ -210,40 +316,89 @@ def before_instantiate_classes(self) -> None: def instantiate_classes(self) -> None: """Instantiates the classes using settings from self.config""" - self.config_init = self.parser.instantiate_subclasses(self.config) - self.instantiate_datamodule() - self.instantiate_model() + self.config_init = self.parser.instantiate_classes(self.config) + self.datamodule = self.config_init.get('data') + self.model = self.config_init['model'] self.instantiate_trainer() - def instantiate_datamodule(self) -> None: - """Instantiates the datamodule 
using self.config_init['data'] if given""" - if self.datamodule_class is None: - self.datamodule = None - elif self.subclass_mode_data: - self.datamodule = self.config_init['data'] - else: - self.datamodule = self.datamodule_class(**self.config_init.get('data', {})) - - def instantiate_model(self) -> None: - """Instantiates the model using self.config_init['model']""" - if self.subclass_mode_model: - self.model = self.config_init['model'] - else: - self.model = self.model_class(**self.config_init.get('model', {})) - def instantiate_trainer(self) -> None: """Instantiates the trainer using self.config_init['trainer']""" if self.config_init['trainer'].get('callbacks') is None: self.config_init['trainer']['callbacks'] = [] + callbacks = [self.config_init[c] for c in self.parser.callback_keys] + self.config_init['trainer']['callbacks'].extend(callbacks) if 'callbacks' in self.trainer_defaults: if isinstance(self.trainer_defaults['callbacks'], list): self.config_init['trainer']['callbacks'].extend(self.trainer_defaults['callbacks']) else: self.config_init['trainer']['callbacks'].append(self.trainer_defaults['callbacks']) - if self.save_config_callback is not None: - self.config_init['trainer']['callbacks'].append(self.save_config_callback(self.parser, self.config)) + if self.save_config_callback and not self.config_init['trainer']['fast_dev_run']: + config_callback = self.save_config_callback( + self.parser, self.config, self.save_config_filename, overwrite=self.save_config_overwrite + ) + self.config_init['trainer']['callbacks'].append(config_callback) self.trainer = self.trainer_class(**self.config_init['trainer']) + def add_configure_optimizers_method_to_model(self) -> None: + """ + Adds to the model an automatically generated configure_optimizers method + + If a single optimizer and optionally a scheduler argument groups are added to the parser as 'AUTOMATIC', + then a `configure_optimizers` method is automatically implemented in the model class. + """ + + def get_automatic(class_type: Union[Type, Tuple[Type, ...]]) -> List[str]: + automatic = [] + for key, (base_class, link_to) in self.parser.optimizers_and_lr_schedulers.items(): + if not isinstance(base_class, tuple): + base_class = (base_class, ) + if link_to == 'AUTOMATIC' and any(issubclass(c, class_type) for c in base_class): + automatic.append(key) + return automatic + + optimizers = get_automatic(Optimizer) + lr_schedulers = get_automatic(LRSchedulerTypeTuple) + + if len(optimizers) == 0: + return + + if len(optimizers) > 1 or len(lr_schedulers) > 1: + raise MisconfigurationException( + f"`{self.__class__.__name__}.add_configure_optimizers_method_to_model` expects at most one optimizer " + f"and one lr_scheduler to be 'AUTOMATIC', but found {optimizers+lr_schedulers}. In this case the user " + "is expected to link the argument groups and implement `configure_optimizers`, see " + "https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_cli.html" + "#optimizers-and-learning-rate-schedulers" + ) + + if is_overridden('configure_optimizers', self.model): + warnings.warn( + f"`{self.model.__class__.__name__}.configure_optimizers` will be overridden by " + f"`{self.__class__.__name__}.add_configure_optimizers_method_to_model`." 
+ ) + + optimizer_class = self.parser.optimizers_and_lr_schedulers[optimizers[0]][0] + optimizer_init = self.config_init.get(optimizers[0], {}) + if not isinstance(optimizer_class, tuple): + optimizer_init = _global_add_class_path(optimizer_class, optimizer_init) + lr_scheduler_init = None + if lr_schedulers: + lr_scheduler_class = self.parser.optimizers_and_lr_schedulers[lr_schedulers[0]][0] + lr_scheduler_init = self.config_init.get(lr_schedulers[0], {}) + if not isinstance(lr_scheduler_class, tuple): + lr_scheduler_init = _global_add_class_path(lr_scheduler_class, lr_scheduler_init) + + def configure_optimizers( + self: LightningModule + ) -> Union[Optimizer, Tuple[List[Optimizer], List[LRSchedulerType]]]: + optimizer = instantiate_class(self.parameters(), optimizer_init) + if not lr_scheduler_init: + return optimizer + lr_scheduler = instantiate_class(optimizer, lr_scheduler_init) + return [optimizer], [lr_scheduler] + + self.model.configure_optimizers = MethodType(configure_optimizers, self.model) + def prepare_fit_kwargs(self) -> None: """Prepares fit_kwargs including datamodule using self.config_init['data'] if given""" self.fit_kwargs = {'model': self.model} @@ -259,3 +414,37 @@ def fit(self) -> None: def after_fit(self) -> None: """Implement to run some code after fit has finished""" + + +def _global_add_class_path(class_type: Type, init_args: Dict[str, Any]) -> Dict[str, Any]: + return { + 'class_path': class_type.__module__ + '.' + class_type.__name__, + 'init_args': init_args, + } + + +def _add_class_path_generator(class_type: Type) -> Callable[[Dict[str, Any]], Dict[str, Any]]: + + def add_class_path(init_args: Dict[str, Any]) -> Dict[str, Any]: + return _global_add_class_path(class_type, init_args) + + return add_class_path + + +def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) -> Any: + """Instantiates a class with the given args and init. + + Args: + args: Positional arguments required for instantiation. + init: Dict of the form {"class_path":...,"init_args":...}. + + Returns: + The instantiated class object. + """ + kwargs = init.get('init_args', {}) + if not isinstance(args, tuple): + args = (args, ) + class_module, class_name = init['class_path'].rsplit('.', 1) + module = __import__(class_module, fromlist=[class_name]) + args_class = getattr(module, class_name) + return args_class(*args, **kwargs) diff --git a/pytorch_lightning/utilities/cloud_io.py b/pytorch_lightning/utilities/cloud_io.py index 9e8240981feda..6bd6a172a7a41 100644 --- a/pytorch_lightning/utilities/cloud_io.py +++ b/pytorch_lightning/utilities/cloud_io.py @@ -38,9 +38,8 @@ def get_filesystem(path: Union[str, Path]): if "://" in path: # use the fileystem from the protocol specified return fsspec.filesystem(path.split(":", 1)[0]) - else: - # use local filesystem - return LocalFileSystem() + # use local filesystem + return LocalFileSystem() def atomic_save(checkpoint, filepath: str): diff --git a/pytorch_lightning/utilities/data.py b/pytorch_lightning/utilities/data.py index 27345fda3b110..9d36206748197 100644 --- a/pytorch_lightning/utilities/data.py +++ b/pytorch_lightning/utilities/data.py @@ -24,8 +24,14 @@ def has_iterable_dataset(dataloader: DataLoader): def has_len(dataloader: DataLoader) -> bool: - """ Checks if a given Dataloader has __len__ method implemented i.e. if - it is a finite dataloader or infinite dataloader. """ + """ + Checks if a given Dataloader has ``__len__`` method implemented i.e. if + it is a finite dataloader or infinite dataloader. 
+ + Raises: + ValueError: + If the length of Dataloader is 0, as it requires at least one batch + """ try: # try getting the length diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py index 56833fd03735a..b4388bf89c195 100644 --- a/pytorch_lightning/utilities/debugging.py +++ b/pytorch_lightning/utilities/debugging.py @@ -39,8 +39,6 @@ class InternalDebugger(object): def __init__(self, trainer): self.enabled = os.environ.get('PL_DEV_DEBUG', '0') == '1' self.trainer = trainer - self.logged_metrics = [] - self.pbar_added_metrics = [] self.saved_train_losses = [] self.saved_val_losses = [] self.saved_test_losses = [] @@ -53,6 +51,7 @@ def __init__(self, trainer): self.test_dataloader_calls = [] self.dataloader_sequence_calls = [] + @enabled_only def track_event( self, evt_type: str, @@ -110,11 +109,6 @@ def track_load_dataloader_call(self, name, dataloaders): elif 'test' in name: self.test_dataloader_calls.append(values) - @enabled_only - def track_logged_metrics_history(self, scalar_metrics): - scalar_metrics['global_step'] = self.trainer.global_step - self.logged_metrics.append(scalar_metrics) - @enabled_only def track_train_loss_history(self, batch_idx, loss): loss_dict = {'batch_idx': batch_idx, 'epoch': self.trainer.current_epoch, 'loss': loss.detach()} @@ -151,11 +145,6 @@ def track_eval_loss_history(self, batch_idx, dataloader_idx, output): else: self.saved_val_losses.append(loss_dict) - @enabled_only - def track_pbar_metrics_history(self, metrics): - metrics['debug_epoch'] = self.trainer.current_epoch - self.pbar_added_metrics.append(metrics) - @enabled_only def track_early_stopping_history(self, callback, current): debug_dict = { diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 511a91326953d..ffa11c053f83a 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -16,7 +16,8 @@ import torch -from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn +from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_deprecation from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _compare_version @@ -28,6 +29,12 @@ def determine_root_gpu_device(gpus: List[int]) -> Optional[int]: Returns: designated root GPU device id + + Raises: + TypeError: + If ``gpus`` is not a list + AssertionError: + If GPU list is empty """ if gpus is None: return None @@ -78,6 +85,11 @@ def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[i gpus = _normalize_parse_gpu_input_to_list(gpus) if not gpus: raise MisconfigurationException("GPUs requested but none are available.") + + if TorchElasticEnvironment.is_using_torchelastic() and len(gpus) != 1 and len(_get_all_available_gpus()) == 1: + # omit sanity check on torchelastic as by default shows one visible GPU per process + return gpus + gpus = _sanitize_gpu_ids(gpus) return gpus @@ -96,6 +108,10 @@ def parse_tpu_cores(tpu_cores: Union[int, str, List]) -> Optional[Union[List[int Returns: a list of tpu_cores to be used or ``None`` if no TPU cores were requested + + Raises: + MisconfigurationException: + If TPU cores aren't 1 or 8 cores, or no TPU devices are found """ _check_data_type(tpu_cores) @@ -116,20 +132,18 @@ def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[in return s if s == '-1': 
return -1 - elif ',' in s: + if ',' in s: return [int(x.strip()) for x in s.split(',') if len(x) > 0] - else: - num_gpus = int(s.strip()) - if _compare_version("pytorch_lightning", operator.lt, "1.5"): - rank_zero_warn( - f"Parsing of the Trainer argument gpus='{s}' (string) will change in the future." - " In the current version of Lightning, this will select" - f" CUDA device with index {num_gpus}, but from v1.5 it will select gpus" - f" {list(range(num_gpus))} (same as gpus={s} (int)).", - DeprecationWarning, - ) - return [num_gpus] - return num_gpus + num_gpus = int(s.strip()) + if _compare_version("pytorch_lightning", operator.lt, "1.5"): + rank_zero_deprecation( + f"Parsing of the Trainer argument gpus='{s}' (string) will change in the future." + " In the current version of Lightning, this will select" + f" CUDA device with index {num_gpus}, but from v1.5 it will select gpus" + f" {list(range(num_gpus))} (same as gpus={s} (int)).", + ) + return [num_gpus] + return num_gpus def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: @@ -142,6 +156,10 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: Returns: unmodified gpus variable + + Raises: + MisconfigurationException: + If machine has fewer available GPUs than requested. """ all_available_gpus = _get_all_available_gpus() for gpu in gpus: @@ -181,6 +199,10 @@ def _check_data_type(device_ids: Any) -> None: Args: device_ids: gpus/tpu_cores parameter as passed to the Trainer + + Raises: + MisconfigurationException: + If ``device_ids`` of GPU/TPUs aren't ``int``, ``str``, sequence of ``int`` or ``None`` """ if device_ids is not None and \ (not isinstance(device_ids, (int, str, MutableSequence, tuple)) or isinstance(device_ids, bool)): diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index a54d00a983d9e..6ca2de7eb2ca2 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -14,8 +14,8 @@ import logging import os -import warnings -from functools import partial, wraps +from functools import wraps +from platform import python_version from typing import Any, Optional, Union import torch @@ -65,22 +65,44 @@ def _get_rank() -> int: rank_zero_only.rank = getattr(rank_zero_only, 'rank', _get_rank()) -def _warn(*args, **kwargs): - warnings.warn(*args, **kwargs) +def rank_zero_warn(*args, stacklevel: int = 5, **kwargs): + from pytorch_lightning.utilities.warnings import rank_zero_deprecation, rank_zero_warn + rank_zero_deprecation( + '`pytorch_lightning.utilities.distributed.rank_zero_warn` has been moved to' + ' `pytorch_lightning.utilities.rank_zero_warn` in v1.3.7 and will be removed in v1.6' + ) + return rank_zero_warn(*args, stacklevel=stacklevel, **kwargs) + +def rank_zero_deprecation(*args, stacklevel: int = 5, **kwargs): + from pytorch_lightning.utilities.warnings import rank_zero_deprecation + rank_zero_deprecation( + '`pytorch_lightning.utilities.distributed.rank_zero_deprecation` has been moved to' + ' `pytorch_lightning.utilities.rank_zero_deprecation` in v1.3.7 and will be removed in v1.6' + ) + return rank_zero_deprecation(*args, stacklevel=stacklevel, **kwargs) -def _info(*args, **kwargs): + +def _info(*args, stacklevel: int = 2, **kwargs): + if python_version() >= "3.8.0": + kwargs['stacklevel'] = stacklevel log.info(*args, **kwargs) -def _debug(*args, **kwargs): +def _debug(*args, stacklevel: int = 2, **kwargs): + if python_version() >= "3.8.0": + kwargs['stacklevel'] = stacklevel log.debug(*args, **kwargs) -rank_zero_debug = 
rank_zero_only(_debug) -rank_zero_info = rank_zero_only(_info) -rank_zero_warn = rank_zero_only(_warn) -rank_zero_deprecation = partial(rank_zero_warn, category=DeprecationWarning) +@rank_zero_only +def rank_zero_debug(*args, stacklevel: int = 4, **kwargs): + _debug(*args, stacklevel=stacklevel, **kwargs) + + +@rank_zero_only +def rank_zero_info(*args, stacklevel: int = 4, **kwargs): + _info(*args, stacklevel=stacklevel, **kwargs) def gather_all_tensors(result: Union[torch.Tensor], group: Optional[Any] = None): @@ -113,6 +135,10 @@ def gather_all_tensors(result: Union[torch.Tensor], group: Optional[Any] = None) return gathered_result +def distributed_available() -> bool: + return torch.distributed.is_available() and torch.distributed.is_initialized() or tpu_distributed() + + def sync_ddp_if_available( result: Union[torch.Tensor], group: Optional[Any] = None, @@ -129,7 +155,7 @@ def sync_ddp_if_available( Return: reduced value """ - if torch.distributed.is_available() and torch.distributed.is_initialized(): + if distributed_available(): return sync_ddp(result, group=group, reduce_op=reduce_op) return result @@ -208,12 +234,11 @@ def all_gather_ddp_if_available( A tensor of shape (world_size, batch, ...) """ group = group if group is not None else torch.distributed.group.WORLD - if torch.distributed.is_available() and torch.distributed.is_initialized(): + if distributed_available(): if sync_grads: return AllGatherGrad.apply(tensor, group) - else: - with torch.no_grad(): - return AllGatherGrad.apply(tensor, group) + with torch.no_grad(): + return AllGatherGrad.apply(tensor, group) return tensor @@ -294,6 +319,7 @@ def register_ddp_comm_hook( ddp_comm_wrapper=default.fp16_compress_wrapper, ) """ + from pytorch_lightning.utilities import rank_zero_warn if not _TORCH_GREATER_EQUAL_1_8: rank_zero_warn("Not registering DDP comm hook. To use communication hooks, please use pytorch>=1.8.0.") return diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index 98e10a9126a44..98f2770d03cf9 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -79,7 +79,6 @@ def is_interactive_compatible(self) -> bool: HOROVOD = 'horovod' DDP_SHARDED = 'ddp_sharded' DDP_SHARDED_SPAWN = 'ddp_sharded_spawn' - RPC_SEQUENTIAL_PLUGIN = 'rpc_sequential' DDP_FULLY_SHARDED = "ddp_fully_sharded" @@ -97,6 +96,7 @@ class DeviceType(LightningEnum): """ CPU = 'CPU' GPU = 'GPU' + IPU = 'IPU' TPU = 'TPU' diff --git a/pytorch_lightning/utilities/exceptions.py b/pytorch_lightning/utilities/exceptions.py index 01b1e8c053950..bf5258f4f5f36 100644 --- a/pytorch_lightning/utilities/exceptions.py +++ b/pytorch_lightning/utilities/exceptions.py @@ -14,4 +14,12 @@ class MisconfigurationException(Exception): - pass + """ + Exception used to inform users of mis-use with PyTorch Lightning + """ + + +class DeadlockDetectedException(Exception): + """ + Exception used when a deadlock has been detected and processes are being killed + """ diff --git a/pytorch_lightning/utilities/finite_checks.py b/pytorch_lightning/utilities/finite_checks.py index 770ea7a2276f0..b40e97c9b45e9 100644 --- a/pytorch_lightning/utilities/finite_checks.py +++ b/pytorch_lightning/utilities/finite_checks.py @@ -29,7 +29,13 @@ def print_nan_gradients(model: nn.Module) -> None: def detect_nan_parameters(model: nn.Module) -> None: - """ Iterates over model parameters and prints gradients if any parameter is not finite. 
""" + """ + Iterates over model parameters and prints gradients if any parameter is not finite. + + Raises: + ValueError: + If ``NaN`` or ``inf`` values are found + """ for name, param in model.named_parameters(): if not torch.isfinite(param).all(): print_nan_gradients(model) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index f40d092f68e9f..3125a2d38f15e 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -75,7 +75,6 @@ def _compare_version(package: str, op, version) -> bool: _BOLTS_AVAILABLE = _module_available('pl_bolts') _DEEPSPEED_AVAILABLE = not _IS_WINDOWS and _module_available('deepspeed') _FAIRSCALE_AVAILABLE = _TORCH_GREATER_EQUAL_1_6 and not _IS_WINDOWS and _module_available('fairscale.nn') -_FAIRSCALE_PIPE_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.le, "0.1.3") _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") _FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") _GROUP_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.group') @@ -85,7 +84,7 @@ def _compare_version(package: str, op, version) -> bool: _KINETO_AVAILABLE = _TORCH_GREATER_EQUAL_1_8_1 and torch.profiler.kineto_available() _NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast") _OMEGACONF_AVAILABLE = _module_available("omegaconf") -_RPC_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.rpc') +_POPTORCH_AVAILABLE = _module_available('poptorch') _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != 'none']) _TORCHTEXT_AVAILABLE = _module_available("torchtext") _TORCHVISION_AVAILABLE = _module_available('torchvision') @@ -96,3 +95,9 @@ def _compare_version(package: str, op, version) -> bool: from pytorch_lightning.utilities.xla_device import XLADeviceUtils # noqa: E402 _TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists() + +if _POPTORCH_AVAILABLE: + import poptorch + _IPU_AVAILABLE = poptorch.ipuHardwareIsAvailable() +else: + _IPU_AVAILABLE = False diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 6c01390a8c81e..0ae88e8995614 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -76,11 +76,10 @@ def is_out_of_cpu_memory(exception): def garbage_collection_cuda(): """Garbage collection Torch (CUDA) memory.""" gc.collect() - if torch.cuda.is_available(): - try: - # This is the last thing that should cause an OOM error, but seemingly it can. - torch.cuda.empty_cache() - except RuntimeError as exception: - if not is_oom_error(exception): - # Only handle OOM errors - raise + try: + # This is the last thing that should cause an OOM error, but seemingly it can. + torch.cuda.empty_cache() + except RuntimeError as exception: + if not is_oom_error(exception): + # Only handle OOM errors + raise diff --git a/pytorch_lightning/utilities/metrics.py b/pytorch_lightning/utilities/metrics.py index bd57470dc270e..5db2ff5d83360 100644 --- a/pytorch_lightning/utilities/metrics.py +++ b/pytorch_lightning/utilities/metrics.py @@ -12,29 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. """Helper functions to operate on metric values. 
""" +import numbers +from typing import Any import torch +from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.exceptions import MisconfigurationException -def metrics_to_scalars(metrics: dict) -> dict: - """ Recursively walk through a dictionary of metrics and convert single-item tensors to scalar values. """ +def metrics_to_scalars(metrics: Any) -> Any: + """ + Recursively walk through a collection and convert single-item tensors to scalar values - # TODO: this is duplicated in MetricsHolder. should be unified - new_metrics = {} - for k, v in metrics.items(): - if isinstance(v, torch.Tensor): - if v.numel() != 1: - raise MisconfigurationException( - f"The metric `{k}` does not contain a single element" - f" thus it cannot be converted to float. Found `{v}`" - ) - v = v.item() + Raises: + MisconfigurationException: + If ``value`` contains multiple elements, hence preventing conversion to ``float`` + """ - if isinstance(v, dict): - v = metrics_to_scalars(v) + def to_item(value: torch.Tensor) -> numbers.Number: + if value.numel() != 1: + raise MisconfigurationException( + f"The metric `{value}` does not contain a single element" + f" thus it cannot be converted to float." + ) + return value.item() - new_metrics[k] = v - - return new_metrics + return apply_to_collection(metrics, torch.Tensor, to_item) diff --git a/pytorch_lightning/utilities/model_helpers.py b/pytorch_lightning/utilities/model_helpers.py index 87bd9e6c4545d..e52f8efa2689f 100644 --- a/pytorch_lightning/utilities/model_helpers.py +++ b/pytorch_lightning/utilities/model_helpers.py @@ -11,33 +11,58 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from functools import partial +from typing import Optional, Type, Union +from unittest.mock import Mock -from typing import Union +import pytorch_lightning as pl +from pytorch_lightning.utilities import rank_zero_deprecation -from pytorch_lightning.core.datamodule import LightningDataModule -from pytorch_lightning.core.lightning import LightningModule +def is_overridden( + method_name: str, + instance: Optional[object] = None, + parent: Optional[Type[object]] = None, + model: Optional[Union['pl.LightningModule', 'pl.LightningDataModule']] = None, +) -> bool: + if model is not None and instance is None: + rank_zero_deprecation( + '`is_overriden(model=...)` has been deprecated and will be removed in v1.6.' 
+ 'Please use `is_overriden(instance=...)`' + ) + instance = model -def is_overridden(method_name: str, model: Union[LightningModule, LightningDataModule]) -> bool: - # if you pass DataModule instead of None or a LightningModule, we use LightningDataModule as super - # TODO - refector this function to accept model_name, instance, parent so it makes more sense - super_object = LightningModule if not isinstance(model, LightningDataModule) else LightningDataModule - - if not hasattr(model, method_name) or not hasattr(super_object, method_name): - # in case of calling deprecated method + if instance is None: + # if `self.lightning_module` was passed as instance, it can be `None` return False - instance_attr = getattr(model, method_name) - if not instance_attr: + if parent is None: + if isinstance(instance, pl.LightningModule): + parent = pl.LightningModule + elif isinstance(instance, pl.LightningDataModule): + parent = pl.LightningDataModule + if parent is None: + raise ValueError("Expected a parent") + + instance_attr = getattr(instance, method_name, None) + # `Mock(wraps=...)` support + if isinstance(instance_attr, Mock): + # access the wrapped function + instance_attr = instance_attr._mock_wraps + # `partial` support + elif isinstance(instance_attr, partial): + instance_attr = instance_attr.func + if instance_attr is None: return False - super_attr = getattr(super_object, method_name) - - # when code pointers are different, it was implemented - if hasattr(instance_attr, 'patch_loader_code'): - # cannot pickle __code__ so cannot verify if PatchDataloader - # exists which shows dataloader methods have been overwritten. - # so, we hack it by using the string representation - is_overridden = instance_attr.patch_loader_code != str(super_attr.__code__) - else: - is_overridden = instance_attr.__code__ is not super_attr.__code__ - return is_overridden + + parent_attr = getattr(parent, method_name, None) + if parent_attr is None: + raise ValueError("The parent should define the method") + + # cannot pickle `__code__` so cannot verify if `PatchDataloader` + # exists which shows dataloader methods have been overwritten. + # so, we hack it by using the string representation + instance_code = getattr(instance_attr, 'patch_loader_code', None) or instance_attr.__code__ + parent_code = parent_attr.__code__ + + return instance_code != parent_code diff --git a/pytorch_lightning/utilities/parsing.py b/pytorch_lightning/utilities/parsing.py index 6141a80b5f97c..c7b57fe3fd4e9 100644 --- a/pytorch_lightning/utilities/parsing.py +++ b/pytorch_lightning/utilities/parsing.py @@ -16,9 +16,10 @@ import pickle import types from argparse import Namespace +from dataclasses import fields, is_dataclass from typing import Any, Dict, Optional, Sequence, Tuple, Union -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.warnings import rank_zero_warn def str_to_bool_or_str(val: str) -> Union[str, bool]: @@ -32,18 +33,20 @@ def str_to_bool_or_str(val: str) -> Union[str, bool]: lower = val.lower() if lower in ('y', 'yes', 't', 'true', 'on', '1'): return True - elif lower in ('n', 'no', 'f', 'false', 'off', '0'): + if lower in ('n', 'no', 'f', 'false', 'off', '0'): return False - else: - return val + return val def str_to_bool(val: str) -> bool: """Convert a string representation of truth to bool. True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values - are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if - 'val' is anything else. 
+ are 'n', 'no', 'f', 'false', 'off', and '0'. + + Raises: + ValueError: + If ``val`` isn't in one of the aforementioned true or false values. >>> str_to_bool('YES') True @@ -97,7 +100,7 @@ def clean_namespace(hparams): del_attrs = [k for k, v in hparams_dict.items() if not is_picklable(v)] for k in del_attrs: - rank_zero_warn(f"attribute '{k}' removed from hparams because it cannot be pickled", UserWarning) + rank_zero_warn(f"attribute '{k}' removed from hparams because it cannot be pickled") del hparams_dict[k] @@ -164,10 +167,9 @@ def collect_init_args(frame, path_args: list, inside: bool = False) -> list: # recursive update path_args.append(local_args) return collect_init_args(frame.f_back, path_args, inside=True) - elif not inside: + if not inside: return collect_init_args(frame.f_back, path_args, inside) - else: - return path_args + return path_args def flatten_dict(source, result=None): @@ -197,7 +199,11 @@ def save_hyperparameters( if not frame: frame = inspect.currentframe().f_back - init_args = get_init_args(frame) + + if is_dataclass(obj): + init_args = {f.name: getattr(obj, f.name) for f in fields(obj)} + else: + init_args = get_init_args(frame) assert init_args, "failed to inspect the obj init" if ignore is not None: diff --git a/pytorch_lightning/utilities/seed.py b/pytorch_lightning/utilities/seed.py index 51547d5576e74..d5e712b8385bc 100644 --- a/pytorch_lightning/utilities/seed.py +++ b/pytorch_lightning/utilities/seed.py @@ -34,7 +34,7 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False) -> int: In addition, sets the following environment variables: - `PL_GLOBAL_SEED`: will be passed to spawned subprocesses (e.g. ddp_spawn backend). - - `PL_SEED_WORKERS`: (optional) is set to 1 if ```workers=True``. + - `PL_SEED_WORKERS`: (optional) is set to 1 if ``workers=True``. Args: seed: the integer value seed for global random state in Lightning. @@ -84,8 +84,9 @@ def reset_seed() -> None: If :func:`pytorch_lightning.utilities.seed.seed_everything` is unused, this function will do nothing. 
""" seed = os.environ.get("PL_GLOBAL_SEED", None) + workers = os.environ.get("PL_SEED_WORKERS", False) if seed is not None: - seed_everything(int(seed)) + seed_everything(int(seed), workers=bool(workers)) def pl_worker_init_function(worker_id: int, rank: Optional = None) -> None: # pragma: no cover @@ -100,6 +101,9 @@ def pl_worker_init_function(worker_id: int, rank: Optional = None) -> None: # p process_seed = torch.initial_seed() # back out the base seed so we can use all the bits base_seed = process_seed - worker_id + log.debug( + f'Initializing random number generators of process {global_rank} worker {worker_id} with base seed {base_seed}' + ) ss = np.random.SeedSequence([base_seed, worker_id, global_rank]) # use 128 bits (4 x 32-bit words) np.random.seed(ss.generate_state(4)) diff --git a/pytorch_lightning/utilities/types.py b/pytorch_lightning/utilities/types.py index 8a81040af07db..ecbfa4c84f523 100644 --- a/pytorch_lightning/utilities/types.py +++ b/pytorch_lightning/utilities/types.py @@ -17,14 +17,31 @@ - Types used in public hooks (as those in the `LightningModule` and `Callback`) should be public (no trailing `_`) """ from numbers import Number -from typing import Any, Dict, Iterator, List, Union +from typing import Any, Dict, Iterator, List, Mapping, Sequence, Type, Union import torch +from torch.optim.lr_scheduler import _LRScheduler, ReduceLROnPlateau +from torch.utils.data import DataLoader from torchmetrics import Metric _METRIC = Union[Metric, torch.Tensor, Number] +_METRIC_COLLECTION = Union[_METRIC, Mapping[str, _METRIC]] STEP_OUTPUT = Union[torch.Tensor, Dict[str, Any]] EPOCH_OUTPUT = List[STEP_OUTPUT] _EVALUATE_OUTPUT = List[Dict[str, float]] # 1 dict per DataLoader _PREDICT_OUTPUT = Union[List[Any], List[List[Any]]] _PARAMETERS = Iterator[torch.nn.Parameter] +# yapf: disable +TRAIN_DATALOADERS = Union[ + DataLoader, + Sequence[DataLoader], + Sequence[Sequence[DataLoader]], + Sequence[Dict[str, DataLoader]], + Dict[str, DataLoader], + Dict[str, Dict[str, DataLoader]], + Dict[str, Sequence[DataLoader]], +] +# yapf: enable +EVAL_DATALOADERS = Union[DataLoader, Sequence[DataLoader]] +LRSchedulerTypeTuple = (_LRScheduler, ReduceLROnPlateau) +LRSchedulerType = Union[Type[_LRScheduler], Type[ReduceLROnPlateau]] diff --git a/pytorch_lightning/utilities/warnings.py b/pytorch_lightning/utilities/warnings.py index a3dde95fa928f..0595a41ea5aa0 100644 --- a/pytorch_lightning/utilities/warnings.py +++ b/pytorch_lightning/utilities/warnings.py @@ -11,18 +11,40 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.utilities.distributed import rank_zero_warn +"""Warning-related utilities""" +import warnings +from functools import partial +from pytorch_lightning.utilities.distributed import rank_zero_only -class WarningCache: - def __init__(self): - self.warnings = set() +def _warn(*args, stacklevel: int = 2, **kwargs): + warnings.warn(*args, stacklevel=stacklevel, **kwargs) - def warn(self, m, *args, **kwargs): - if m not in self.warnings: - self.warnings.add(m) - rank_zero_warn(m, *args, **kwargs) - def clear(self): - self.warnings.clear() +@rank_zero_only +def rank_zero_warn(*args, stacklevel: int = 4, **kwargs): + _warn(*args, stacklevel=stacklevel, **kwargs) + + +class LightningDeprecationWarning(DeprecationWarning): + ... 
+ + +# enable our warnings +warnings.simplefilter('default', LightningDeprecationWarning) + +rank_zero_deprecation = partial(rank_zero_warn, category=LightningDeprecationWarning) + + +class WarningCache(set): + + def warn(self, m, *args, stacklevel: int = 5, **kwargs): + if m not in self: + self.add(m) + rank_zero_warn(m, *args, stacklevel=stacklevel, **kwargs) + + def deprecation(self, m, *args, stacklevel: int = 5, **kwargs): + if m not in self: + self.add(m) + rank_zero_deprecation(m, *args, stacklevel=stacklevel, **kwargs) diff --git a/requirements.txt b/requirements.txt index 964bb493a2637..15c0fcbbab8cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,10 @@ torch>=1.4 future>=0.17.1 # required for builtins in setup.py tqdm>=4.41.0 PyYAML>=5.1,<=5.4.1 -fsspec[http]>=2021.4.0 +fsspec[http]>=2021.05.0, !=2021.06.0 tensorboard>=2.2.0, !=2.5.0 # 2.5.0 GPU CI error: 'Couldn't build proto file into descriptor pool!' -torchmetrics>=0.2.0 -pyDeprecate==0.3.0 -packaging +torchmetrics>=0.4.0 +pyDeprecate==0.3.1 +packaging>=17.0 typing-extensions # TypedDict support for python<3.8 +pillow!=8.3.0 # TODO: delete line after https://github.com/python-pillow/Pillow/issues/5571 diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index 3d9da2a2f1a22..84879b4e48a34 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -4,7 +4,8 @@ from typing import Dict, Optional VERSIONS = [ - dict(torch="1.9.0", torchvision="", torchtext=""), # nightly + dict(torch="1.10.0", torchvision="", torchtext=""), # nightly + dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"), dict(torch="1.8.1", torchvision="0.9.1", torchtext="0.9.1"), dict(torch="1.8.0", torchvision="0.9.0", torchtext="0.9.0"), dict(torch="1.7.1", torchvision="0.8.2", torchtext="0.8.1"), @@ -40,6 +41,8 @@ def main(path_req: str, torch_version: Optional[str] = None) -> None: with open(path_req, "r") as fp: req = fp.read() + # remove comments + req = re.sub(rf"\s*#.*{os.linesep}", os.linesep, req) latest = find_latest(torch_version) for lib, version in latest.items(): diff --git a/requirements/docs.txt b/requirements/docs.txt index b53549e087e4f..5328c679d1f6c 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,7 +1,7 @@ sphinx>=4.0 -recommonmark # fails with badges -m2r # fails with multi-line text -nbsphinx>=0.8 +myst-parser +nbsphinx>=0.8.5 +ipython[notebook] pandoc>=1.0 docutils>=0.16 sphinxcontrib-fulltoc>=1.0 @@ -11,3 +11,5 @@ sphinx-autodoc-typehints>=1.0 sphinx-paramlinks>=0.5.1 sphinx-togglebutton>=0.2 sphinx-copybutton>=0.3 + +-r ../_notebooks/.actions/requirements.txt diff --git a/requirements/extra.txt b/requirements/extra.txt index c41f464ef383b..291813e05edcd 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -7,4 +7,4 @@ torchtext>=0.5 # onnx>=1.7.0 onnxruntime>=1.3.0 hydra-core>=1.0 -jsonargparse[signatures]>=3.12.0 +jsonargparse[signatures]>=3.15.0 diff --git a/setup.cfg b/setup.cfg index 5a68adb27b443..74e02d932dc3c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -129,6 +129,10 @@ ignore_errors = True [mypy-pytorch_lightning.loggers.*] ignore_errors = True +# todo: add proper typing to this module... +[mypy-pytorch_lightning.loops.*] +ignore_errors = True + # todo: add proper typing to this module... 
[mypy-pytorch_lightning.metrics.*] ignore_errors = True @@ -163,6 +167,8 @@ ignore_errors = True # whitelist [mypy-pytorch_lightning.trainer.evaluation_loop] ignore_errors = False +[mypy-pytorch_lightning.trainer.connectors.logger_connector] +ignore_errors = False # todo: add proper typing to this module... [mypy-pytorch_lightning.distributed.*] @@ -175,6 +181,8 @@ ignore_errors = True # todo: add proper typing to this module... [mypy-pytorch_lightning.utilities.*] ignore_errors = True +[mypy-pytorch_lightning.utilities.cli] +ignore_errors = False # todo: add proper typing to this module... [mypy-pl_examples.*] diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index e60b86513e5ff..4a9b01281f784 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -18,6 +18,7 @@ import pytest import torch +import torch.distributed from pytorch_lightning import Trainer from pytorch_lightning.accelerators.accelerator import Accelerator @@ -385,6 +386,35 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) +@RunIf(special=True) +def test_accelerator_choice_ddp_cpu_and_plugin(tmpdir): + """ Test that accelerator="ddp_cpu" can work together with an instance of DDPPlugin. """ + _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class=DDPPlugin) + + +@RunIf(special=True) +def test_accelerator_choice_ddp_cpu_and_plugin_spawn(tmpdir): + """ Test that accelerator="ddp_cpu" can work together with an instance of DDPPSpawnPlugin. """ + _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class=DDPSpawnPlugin) + + +def _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class): + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + plugins=[ddp_plugin_class(find_unused_parameters=True)], + fast_dev_run=True, + accelerator='ddp_cpu', + num_processes=2, + ) + assert isinstance(trainer.training_type_plugin, ddp_plugin_class) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert trainer.training_type_plugin.num_processes == 2 + assert trainer.training_type_plugin.parallel_devices == [torch.device("cpu")] * 2 + trainer.fit(model) + + @mock.patch.dict( os.environ, { "SLURM_NTASKS": "2", @@ -396,11 +426,8 @@ def on_fit_start(self, trainer, pl_module): } ) @mock.patch('torch.cuda.device_count', return_value=0) -@mock.patch('pytorch_lightning.plugins.DDPPlugin.setup_distributed', autospec=True) -def test_accelerator_choice_ddp_cpu_custom_cluster(device_count_mock, setup_distributed_mock): - """ - Test that we choose the custom cluster even when SLURM or TE flags are around - """ +def test_accelerator_choice_ddp_cpu_custom_cluster(_, tmpdir): + """ Test that we choose the custom cluster even when SLURM or TE flags are around """ class CustomCluster(LightningEnvironment): @@ -410,25 +437,16 @@ def master_address(self): def creates_children(self) -> bool: return True - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, CustomCluster) - raise SystemExit() - - model = BoringModel() trainer = Trainer( + default_root_dir=tmpdir, plugins=[CustomCluster()], fast_dev_run=True, accelerator='ddp_cpu', num_processes=2, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator, 
CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, CustomCluster) @mock.patch.dict( @@ -453,8 +471,9 @@ class Prec(PrecisionPlugin): class TrainTypePlugin(SingleDevicePlugin): pass + ttp = TrainTypePlugin(device=torch.device("cpu")) accelerator = Accel( - training_type_plugin=TrainTypePlugin(device=torch.device("cpu")), + training_type_plugin=ttp, precision_plugin=Prec(), ) trainer = Trainer( @@ -465,6 +484,25 @@ class TrainTypePlugin(SingleDevicePlugin): assert isinstance(trainer.accelerator, Accel) assert isinstance(trainer.training_type_plugin, TrainTypePlugin) assert isinstance(trainer.precision_plugin, Prec) + assert trainer.accelerator_connector.training_type_plugin is ttp + + class DistributedPlugin(DDPPlugin): + pass + + ttp = DistributedPlugin() + accelerator = Accel( + training_type_plugin=ttp, + precision_plugin=Prec(), + ) + trainer = Trainer( + accelerator=accelerator, + fast_dev_run=True, + num_processes=2, + ) + assert isinstance(trainer.accelerator, Accel) + assert isinstance(trainer.training_type_plugin, DistributedPlugin) + assert isinstance(trainer.precision_plugin, Prec) + assert trainer.accelerator_connector.training_type_plugin is ttp @mock.patch.dict( diff --git a/tests/accelerators/test_cpu.py b/tests/accelerators/test_cpu.py index c7d7f98ae995d..7be1c6b9d1b65 100644 --- a/tests/accelerators/test_cpu.py +++ b/tests/accelerators/test_cpu.py @@ -7,6 +7,7 @@ from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.plugins import SingleDevicePlugin from pytorch_lightning.plugins.precision import MixedPrecisionPlugin +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel @@ -50,3 +51,112 @@ def setup_optimizers_in_pre_dispatch(self) -> bool: model = TestModel() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=CustomPlugin(device=torch.device("cpu"))) trainer.fit(model) + + +def test_accelerator_on_reset_dataloader_hooks(tmpdir): + """ + Ensure data-loader hooks are called using an Accelerator. 
+ """ + + class CustomAccelerator(CPUAccelerator): + train_count: int = 0 + val_count: int = 0 + test_count: int = 0 + predict_count: int = 0 + + def on_reset_train_dataloader(self, dataloader): + self.train_count += 1 + assert self.lightning_module.trainer.training + return super().on_reset_train_dataloader(dataloader) + + def on_reset_val_dataloader(self, dataloader): + self.val_count += 1 + assert self.lightning_module.trainer.training or self.lightning_module.trainer.validating + return super().on_reset_val_dataloader(dataloader) + + def on_reset_test_dataloader(self, dataloader): + self.test_count += 1 + assert self.lightning_module.trainer.testing + return super().on_reset_test_dataloader(dataloader) + + def on_reset_predict_dataloader(self, dataloader): + self.predict_count += 1 + assert self.lightning_module.trainer.predicting + return super().on_reset_predict_dataloader(dataloader) + + model = BoringModel() + accelerator = CustomAccelerator(PrecisionPlugin(), SingleDevicePlugin(device=torch.device('cpu'))) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator=accelerator) + trainer.fit(model) + trainer.validate(model) + trainer.test(model) + trainer.predict(model, dataloaders=model.test_dataloader()) + # assert that all loader hooks were called + assert accelerator.train_count == 1 + assert accelerator.val_count == 1 # only called once during the entire session + assert accelerator.test_count == 1 + assert accelerator.predict_count == 1 + + accelerator = CustomAccelerator(PrecisionPlugin(), SingleDevicePlugin(device=torch.device('cpu'))) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator=accelerator) + trainer.validate(model) + trainer.test(model) + trainer.predict(model) + # assert val/test/predict loader hooks were called + assert accelerator.val_count == 1 + assert accelerator.test_count == 1 + assert accelerator.predict_count == 1 + + +def test_plugin_on_reset_dataloader_hooks(tmpdir): + """ + Ensure data-loader hooks are called using a Plugin. 
+ """ + + class CustomPlugin(SingleDevicePlugin): + train_count: int = 0 + val_count: int = 0 + test_count: int = 0 + predict_count: int = 0 + + def on_reset_train_dataloader(self, dataloader): + self.train_count += 1 + assert self.lightning_module.trainer.training + return super().on_reset_train_dataloader(dataloader) + + def on_reset_val_dataloader(self, dataloader): + self.val_count += 1 + assert self.lightning_module.trainer.training or self.lightning_module.trainer.validating + return super().on_reset_val_dataloader(dataloader) + + def on_reset_test_dataloader(self, dataloader): + self.test_count += 1 + assert self.lightning_module.trainer.testing + return super().on_reset_test_dataloader(dataloader) + + def on_reset_predict_dataloader(self, dataloader): + self.predict_count += 1 + assert self.lightning_module.trainer.predicting + return super().on_reset_predict_dataloader(dataloader) + + plugin = CustomPlugin(device=torch.device('cpu')) + model = BoringModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin) + trainer.fit(model) + trainer.validate(model) + trainer.test(model) + trainer.predict(model, dataloaders=model.test_dataloader()) + # assert that all loader hooks were called + assert plugin.train_count == 1 + assert plugin.val_count == 1 # only called once during the entire session + assert plugin.test_count == 1 + assert plugin.predict_count == 1 + plugin = CustomPlugin(device=torch.device('cpu')) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin) + trainer.validate(model) + trainer.test(model) + trainer.predict(model) + # assert val/test/predict loader hooks were called + assert plugin.val_count == 1 + assert plugin.test_count == 1 + assert plugin.predict_count == 1 diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 80580b63bb6e7..f38d08df3daf9 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -32,9 +32,10 @@ @RunIf(min_gpus=2) -def test_multi_gpu_model_ddp_fit_only(tmpdir): +@pytest.mark.parametrize("as_module", [True, False]) +def test_multi_gpu_model_ddp_fit_only(tmpdir, as_module): # call the script - call_training_script(ddp_model, CLI_ARGS, 'fit', tmpdir, timeout=120) + call_training_script(ddp_model, CLI_ARGS, 'fit', tmpdir, timeout=120, as_module=as_module) # load the results of the script result_path = os.path.join(tmpdir, 'ddp.result') @@ -45,9 +46,10 @@ def test_multi_gpu_model_ddp_fit_only(tmpdir): @RunIf(min_gpus=2) -def test_multi_gpu_model_ddp_test_only(tmpdir): +@pytest.mark.parametrize("as_module", [True, False]) +def test_multi_gpu_model_ddp_test_only(tmpdir, as_module): # call the script - call_training_script(ddp_model, CLI_ARGS, 'test', tmpdir) + call_training_script(ddp_model, CLI_ARGS, 'test', tmpdir, as_module=as_module) # load the results of the script result_path = os.path.join(tmpdir, 'ddp.result') @@ -58,9 +60,10 @@ def test_multi_gpu_model_ddp_test_only(tmpdir): @RunIf(min_gpus=2) -def test_multi_gpu_model_ddp_fit_test(tmpdir): +@pytest.mark.parametrize("as_module", [True, False]) +def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module): # call the script - call_training_script(ddp_model, CLI_ARGS, 'fit_test', tmpdir, timeout=20) + call_training_script(ddp_model, CLI_ARGS, 'fit_test', tmpdir, timeout=20, as_module=as_module) # load the results of the script result_path = os.path.join(tmpdir, 'ddp.result') @@ -123,7 +126,16 @@ def setup(self, stage: Optional[str] = None) -> None: @RunIf(min_gpus=2, min_torch="1.8.1", 
special=True) -def test_ddp_wrapper(tmpdir): +def test_ddp_wrapper_16(tmpdir): + _test_ddp_wrapper(tmpdir, precision=16) + + +@RunIf(min_gpus=2, min_torch="1.8.1", special=True) +def test_ddp_wrapper_32(tmpdir): + _test_ddp_wrapper(tmpdir, precision=32) + + +def _test_ddp_wrapper(tmpdir, precision): """ Test parameters to ignore are carried over for DDP. """ @@ -150,5 +162,12 @@ def on_train_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') assert trainer.training_type_plugin.model.module._ddp_params_and_buffers_to_ignore == ('something') model = CustomModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=CustomCallback()) + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + precision=precision, + accelerator="ddp", + gpus=2, + callbacks=CustomCallback(), + ) trainer.fit(model) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py new file mode 100644 index 0000000000000..363648c9f681d --- /dev/null +++ b/tests/accelerators/test_ipu.py @@ -0,0 +1,547 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Optional + +import pytest +import torch +import torch.nn.functional as F + +from pytorch_lightning import Callback, seed_everything, Trainer +from pytorch_lightning.accelerators import IPUAccelerator +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin +from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.utilities import _IPU_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.helpers.boring_model import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.runif import RunIf +from tests.helpers.simple_models import ClassificationModel + +if _IPU_AVAILABLE: + import poptorch + + +class IPUModel(BoringModel): + + def training_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def validation_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def test_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def training_epoch_end(self, outputs) -> None: + pass + + def validation_epoch_end(self, outputs) -> None: + pass + + def test_epoch_end(self, outputs) -> None: + pass + + +class IPUClassificationModel(ClassificationModel): + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def accuracy(self, logits, y): + # todo (sean): currently IPU poptorch 
doesn't implicit convert bools to tensor + # hence we use an explicit calculation for accuracy here. Once fixed in poptorch + # we can use the accuracy metric. + acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) + return acc + + def validation_epoch_end(self, outputs) -> None: + self.log('val_acc', torch.stack(outputs).mean()) + + def test_epoch_end(self, outputs) -> None: + self.log('test_acc', torch.stack(outputs).mean()) + + +@pytest.mark.skipif(_IPU_AVAILABLE, reason="test requires non-IPU machine") +def test_fail_if_no_ipus(tmpdir): + with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): + Trainer(default_root_dir=tmpdir, ipus=1) + + with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): + Trainer(default_root_dir=tmpdir, ipus=1, accelerator='ipu') + + +@RunIf(ipu=True) +def test_accelerator_selected(tmpdir): + trainer = Trainer(default_root_dir=tmpdir, ipus=1) + assert isinstance(trainer.accelerator, IPUAccelerator) + trainer = Trainer(default_root_dir=tmpdir, ipus=1, accelerator='ipu') + assert isinstance(trainer.accelerator, IPUAccelerator) + + +@RunIf(ipu=True) +@pytest.mark.parametrize('ipus', [1, 4]) +def test_all_stages(tmpdir, ipus): + model = IPUModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=ipus) + trainer.fit(model) + trainer.validate(model) + trainer.test(model) + trainer.predict(model, model.val_dataloader()) + + +@RunIf(ipu=True) +@pytest.mark.parametrize('ipus', [1, 4]) +def test_inference_only(tmpdir, ipus): + model = IPUModel() + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=ipus) + trainer.validate(model) + trainer.test(model) + trainer.predict(model, model.val_dataloader()) + + +@RunIf(ipu=True) +def test_optimization(tmpdir): + seed_everything(42) + + dm = ClassifDataModule(length=1024) + model = IPUClassificationModel() + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + ipus=2, + ) + + # fit model + trainer.fit(model, dm) + assert trainer.state.finished, f"Training failed with {trainer.state}" + assert dm.trainer is not None + + # validate + result = trainer.validate(datamodule=dm) + assert dm.trainer is not None + assert result[0]['val_acc'] > 0.7 + + # test + result = trainer.test(model, datamodule=dm) + assert dm.trainer is not None + test_result = result[0]['test_acc'] + assert test_result > 0.6 + + # test saved model + model_path = os.path.join(tmpdir, 'model.pt') + trainer.save_checkpoint(model_path) + + model = IPUClassificationModel.load_from_checkpoint(model_path) + + trainer = Trainer(default_root_dir=tmpdir, ipus=2) + + result = trainer.test(model, datamodule=dm) + saved_result = result[0]['test_acc'] + assert saved_result == test_result + + +@RunIf(ipu=True) +def test_mixed_precision(tmpdir): + + class TestCallback(Callback): + + def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: + assert trainer.accelerator.model.precision == 16 + raise SystemExit + + model = IPUModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, precision=16, callbacks=TestCallback()) + assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_pure_half_precision(tmpdir): + + class TestCallback(Callback): + + def on_train_start(self, trainer: Trainer, 
pl_module: LightningModule) -> None: + assert trainer.accelerator.model.precision == 16 + for param in trainer.accelerator.model.parameters(): + assert param.dtype == torch.float16 + raise SystemExit + + model = IPUModel() + model = model.half() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, precision=16, callbacks=TestCallback()) + + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 + + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_device_iterations_ipu_plugin(tmpdir): + + class TestCallback(Callback): + + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: + assert trainer.accelerator.training_type_plugin.device_iterations == 2 + # assert device iterations has been set correctly within the poptorch options + poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING] + assert poptorch_model._options.toDict()['device_iterations'] == 2 + raise SystemExit + + model = IPUModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + ipus=1, + plugins=IPUPlugin(device_iterations=2), + callbacks=TestCallback() + ) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_accumulated_batches(tmpdir): + + class TestCallback(Callback): + + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: + # ensure the accumulation_scheduler is overridden to accumulate every batch + # since ipu handle accumulation + assert trainer.accumulation_scheduler.scheduling == {0: 1} + # assert poptorch option have been set correctly + poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING] + assert poptorch_model._options.Training.toDict()['gradient_accumulation'] == 2 + raise SystemExit + + model = IPUModel() + trainer = Trainer( + default_root_dir=tmpdir, fast_dev_run=True, ipus=1, accumulate_grad_batches=2, callbacks=TestCallback() + ) + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_stages_correct(tmpdir): + """Ensure all stages correctly are traced correctly by asserting the output for each stage""" + + class StageModel(IPUModel): + + def training_step(self, batch, batch_idx): + loss = super().training_step(batch, batch_idx) + # tracing requires a loss value that depends on the model. + # force it to be a value but ensure we use the loss. 
+ return (loss - loss) + torch.tensor(1) + + def validation_step(self, batch, batch_idx): + loss = super().validation_step(batch, batch_idx) + return (loss - loss) + torch.tensor(2) + + def test_step(self, batch, batch_idx): + loss = super().validation_step(batch, batch_idx) + return (loss - loss) + torch.tensor(3) + + def predict_step(self, batch, batch_idx, dataloader_idx=None): + output = super().predict_step(batch, batch_idx) + return (output - output) + torch.tensor(4) + + class TestCallback(Callback): + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs['loss'].item() == 1 + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs.item() == 2 + + def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs.item() == 3 + + def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert torch.all(outputs == 4).item() + + model = StageModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, callbacks=TestCallback()) + trainer.fit(model) + trainer.test(model) + trainer.validate(model) + trainer.predict(model, model.test_dataloader()) + + +@RunIf(ipu=True) +def test_accumulate_grad_batches_dict_fails(tmpdir): + model = IPUModel() + trainer = Trainer(default_root_dir=tmpdir, ipus=1, accumulate_grad_batches={0: 1}) + with pytest.raises( + MisconfigurationException, match="IPUs currently only support accumulate_grad_batches being an integer value." + ): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_clip_gradients_fails(tmpdir): + model = IPUModel() + trainer = Trainer(default_root_dir=tmpdir, ipus=1, gradient_clip_val=10) + with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_autoreport(tmpdir): + """Ensure autoreport dumps to a file.""" + model = IPUModel() + autoreport_path = os.path.join(tmpdir, 'report/') + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(autoreport=True, autoreport_dir=autoreport_path) + ) + trainer.fit(model) + assert os.path.exists(autoreport_path) + assert os.path.isfile(autoreport_path + 'profile.pop') + + +@RunIf(ipu=True) +def test_manual_poptorch_opts(tmpdir): + """Ensure if the user passes manual poptorch Options, we run with the correct object.""" + model = IPUModel() + inference_opts = poptorch.Options() + training_opts = poptorch.Options() + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + trainer.fit(model) + + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.training_opts == training_opts + assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_ipu_count(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and the number of ipus do not match, we warn and we set it for the user. 
+ """ + + manual_ipus = 1 + expected_ipus = 2 + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.replicationFactor(manual_ipus) + + training_opts = poptorch.Options() + training_opts.replicationFactor(manual_ipus) + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=expected_ipus, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} " + f"which differs to the ipus={expected_ipus} flag passed to the Trainer. " + f"Setting to {expected_ipus} in the poptorch.Options." + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2 + assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2 + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_inference_grad_accum(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and grad accumulation is set greater than 1 for inference, we warn and set to 1. + """ + + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.Training.gradientAccumulation(4) + + training_opts = poptorch.Options() + training_opts.Training.gradientAccumulation(1) + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match="Inference poptorch.Options should set gradientAccumulation to 1. " + "Setting gradientAccumulation to 1 for inference options.", + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1 + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_train_grad_accum(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and grad accumulation differs to accumulate_grad_batches, we + """ + + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.Training.gradientAccumulation(1) + + training_opts = poptorch.Options() + training_opts.Training.gradientAccumulation(2) + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + accumulate_grad_batches=1, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match=f"Training poptorch.Options set gradientAccumulation to {2}. " + f"This is different to accumulate_grad_batches which was set to {1}. " + f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. " + f"Setting poptorch.Options gradientAccumulation to {1}", + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1 + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_custom(tmpdir): + """ + Ensure if the user passes manual poptorch Options with custom parameters set, + we respect them in our poptorch options. 
+ """ + + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.deviceIterations(16) + inference_opts.replicationFactor(2) + inference_opts.Training.gradientAccumulation(1) + + training_opts = poptorch.Options() + training_opts.deviceIterations(8) + training_opts.replicationFactor(2) + training_opts.Training.gradientAccumulation(2) + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=2, + fast_dev_run=True, + accumulate_grad_batches=2, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + trainer.fit(model) + plugin = trainer.accelerator.training_type_plugin + assert isinstance(plugin, IPUPlugin) + inference_opts = plugin.inference_opts + training_opts = plugin.training_opts + assert inference_opts.device_iterations == 16 + assert inference_opts.replication_factor == 2 + assert inference_opts.Training.gradient_accumulation == 1 + + assert training_opts.device_iterations == 8 + assert training_opts.replication_factor == 2 + assert training_opts.Training.gradient_accumulation == 2 + + +@RunIf(ipu=True) +def test_default_opts(tmpdir): + """ + Ensure default opts are set correctly in the IPUPlugin. + """ + + model = IPUModel() + + trainer = Trainer(default_root_dir=tmpdir, ipus=1, fast_dev_run=True) + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + inference_opts = trainer.accelerator.training_type_plugin.inference_opts + training_opts = trainer.accelerator.training_type_plugin.training_opts + for opts in (inference_opts, training_opts): + assert isinstance(opts, poptorch.Options) + assert opts.Training.gradient_accumulation == 1 + assert opts.device_iterations == 1 + assert opts.replication_factor == 1 + + +@RunIf(ipu=True) +def test_multi_optimizers_fails(tmpdir): + """ + Ensure if there are multiple optimizers, we throw an exception + """ + + class TestModel(IPUModel): + + def configure_optimizers(self): + return [torch.optim.Adam(self.parameters()), torch.optim.Adam(self.parameters())] + + model = TestModel() + + trainer = Trainer(default_root_dir=tmpdir, ipus=1) + with pytest.raises(MisconfigurationException, match="IPUs currently only support one optimizer."): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_precision_plugin(tmpdir): + """ + Ensure precision plugin value is set correctly. + """ + + plugin = IPUPrecisionPlugin(precision=16) + assert plugin.precision == 16 diff --git a/tests/accelerators/test_multi_nodes_gpu.py b/tests/accelerators/test_multi_nodes_gpu.py index 42a9b1c064199..463307ead8717 100644 --- a/tests/accelerators/test_multi_nodes_gpu.py +++ b/tests/accelerators/test_multi_nodes_gpu.py @@ -13,7 +13,6 @@ # limitations under the License. 
import os import sys -from unittest import mock import pytest import torch @@ -73,7 +72,6 @@ def validation_step(self, batch, batch_idx): # use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` @pytest.mark.skip("Multi-node testing is currently disabled") @RunIf(special=True) -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test__validation_step__log(tmpdir): """ Tests that validation_step can log @@ -117,7 +115,7 @@ def backward(self, loss, optimizer, optimizer_idx): trainer.fit(model) # make sure all the metrics are available for callbacks - expected_logged_metrics = { + assert set(trainer.logged_metrics) == { 'a2', 'a_step', 'a_epoch', @@ -125,12 +123,7 @@ def backward(self, loss, optimizer, optimizer_idx): 'b_epoch', 'epoch', } - logged_metrics = set(trainer.logged_metrics.keys()) - assert expected_logged_metrics == logged_metrics # we don't want to enable val metrics during steps because it is not something that users should do - # on purpose DO NOT allow step_b... it's silly to monitor val step metrics - callback_metrics = set(trainer.callback_metrics.keys()) - callback_metrics.remove('debug_epoch') - expected_cb_metrics = {'a', 'a2', 'b', 'a_epoch', 'b_epoch', 'a_step'} - assert expected_cb_metrics == callback_metrics + # on purpose DO NOT allow b_step... it's silly to monitor val step metrics + assert set(trainer.callback_metrics) == {'a', 'a2', 'b', 'a_epoch', 'b_epoch', 'a_step'} diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index c24cf5ded575a..2ef83ffd5a2de 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. from abc import ABC -from collections import OrderedDict class TrainingStepVariations(ABC): @@ -31,18 +30,7 @@ def training_step(self, batch, batch_idx, optimizer_idx=None): # calculate loss loss_train = self.loss(y, y_hat) - log_train = loss_train - - # alternate between tensors and scalars for "log" and "progress_bar" - if batch_idx % 2 == 0: - log_train = log_train.item() - - output = OrderedDict({ - 'loss': loss_train, - 'progress_bar': dict(some_val=log_train * log_train), - 'log': dict(train_some_val=log_train * log_train), - }) - return output + return {'loss': loss_train} def training_step__multiple_dataloaders(self, batch, batch_idx, optimizer_idx=None): """Training step for multiple train loaders""" @@ -61,19 +49,4 @@ def training_step__multiple_dataloaders(self, batch, batch_idx, optimizer_idx=No # calculate loss loss_val = self.loss(y, y_hat) - log_val = loss_val - - # alternate between tensors and scalars for "log" and "progress_bar" - if batch_idx % 2 == 0: - log_val = log_val.item() - - output = OrderedDict({ - 'loss': loss_val, - 'progress_bar': { - 'some_val': log_val * log_val - }, - 'log': { - 'train_some_val': log_val * log_val - }, - }) - return output + return {'loss': loss_val} diff --git a/tests/callbacks/test_callback_hook_outputs.py b/tests/callbacks/test_callback_hook_outputs.py index 36322482c5eba..eac95e9bf18c6 100644 --- a/tests/callbacks/test_callback_hook_outputs.py +++ b/tests/callbacks/test_callback_hook_outputs.py @@ -70,7 +70,7 @@ def test_free_memory_on_eval_outputs(tmpdir): class CB(Callback): def on_epoch_end(self, trainer, pl_module): - assert len(trainer.evaluation_loop.outputs) == 0 + assert len(trainer._evaluation_loop.outputs) == 0 model = BoringModel() diff --git a/tests/callbacks/test_callbacks.py 
b/tests/callbacks/test_callbacks.py index a22e72ce09184..57fdd1bf66322 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -11,168 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from unittest import mock -from unittest.mock import ANY, call, MagicMock, Mock +from unittest.mock import call, Mock from pytorch_lightning import Trainer from tests.helpers import BoringModel -@mock.patch("torch.save") # need to mock torch.save or we get pickle error -def test_trainer_callback_hook_system_fit(_, tmpdir): - """Test the callback hook system for fit.""" - - model = BoringModel() - callback_mock = MagicMock() - trainer = Trainer( - default_root_dir=tmpdir, - callbacks=[callback_mock], - max_epochs=1, - limit_val_batches=1, - limit_train_batches=3, - progress_bar_refresh_rate=0, - ) - - # check that only the to calls exists - assert trainer.callbacks[0] == callback_mock - assert callback_mock.method_calls == [ - call.on_init_start(trainer), - call.on_init_end(trainer), - ] - - # fit model - trainer.fit(model) - - assert callback_mock.method_calls == [ - call.on_init_start(trainer), - call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), - call.setup(trainer, model, 'fit'), - call.on_configure_sharded_model(trainer, model), - call.on_fit_start(trainer, model), - call.on_pretrain_routine_start(trainer, model), - call.on_pretrain_routine_end(trainer, model), - call.on_sanity_check_start(trainer, model), - call.on_validation_start(trainer, model), - call.on_epoch_start(trainer, model), - call.on_validation_epoch_start(trainer, model), - call.on_validation_batch_start(trainer, model, ANY, 0, 0), - call.on_validation_batch_end(trainer, model, ANY, ANY, 0, 0), - call.on_validation_epoch_end(trainer, model), - call.on_epoch_end(trainer, model), - call.on_validation_end(trainer, model), - call.on_sanity_check_end(trainer, model), - call.on_train_start(trainer, model), - call.on_epoch_start(trainer, model), - call.on_train_epoch_start(trainer, model), - call.on_batch_start(trainer, model), - call.on_train_batch_start(trainer, model, ANY, 0, 0), - call.on_before_zero_grad(trainer, model, trainer.optimizers[0]), - call.on_after_backward(trainer, model), - call.on_train_batch_end(trainer, model, ANY, ANY, 0, 0), - call.on_batch_end(trainer, model), - call.on_batch_start(trainer, model), - call.on_train_batch_start(trainer, model, ANY, 1, 0), - call.on_before_zero_grad(trainer, model, trainer.optimizers[0]), - call.on_after_backward(trainer, model), - call.on_train_batch_end(trainer, model, ANY, ANY, 1, 0), - call.on_batch_end(trainer, model), - call.on_batch_start(trainer, model), - call.on_train_batch_start(trainer, model, ANY, 2, 0), - call.on_before_zero_grad(trainer, model, trainer.optimizers[0]), - call.on_after_backward(trainer, model), - call.on_train_batch_end(trainer, model, ANY, ANY, 2, 0), - call.on_batch_end(trainer, model), - call.on_validation_start(trainer, model), - call.on_epoch_start(trainer, model), - call.on_validation_epoch_start(trainer, model), - call.on_validation_batch_start(trainer, model, ANY, 0, 0), - call.on_validation_batch_end(trainer, model, ANY, ANY, 0, 0), - call.on_validation_epoch_end(trainer, model), - call.on_epoch_end(trainer, model), - call.on_validation_end(trainer, model), - call.on_save_checkpoint(trainer, model), # should take ANY but we are inspecting signature 
for BC - call.on_train_epoch_end(trainer, model, ANY), - call.on_epoch_end(trainer, model), - call.on_train_end(trainer, model), - call.on_fit_end(trainer, model), - call.teardown(trainer, model, 'fit'), - ] - - -def test_trainer_callback_hook_system_test(tmpdir): - """Test the callback hook system for test.""" - - model = BoringModel() - callback_mock = MagicMock() - trainer = Trainer( - default_root_dir=tmpdir, - callbacks=[callback_mock], - max_epochs=1, - limit_test_batches=2, - progress_bar_refresh_rate=0, - ) - - trainer.test(model) - - assert callback_mock.method_calls == [ - call.on_init_start(trainer), - call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), - call.setup(trainer, model, 'test'), - call.on_configure_sharded_model(trainer, model), - call.on_test_start(trainer, model), - call.on_epoch_start(trainer, model), - call.on_test_epoch_start(trainer, model), - call.on_test_batch_start(trainer, model, ANY, 0, 0), - call.on_test_batch_end(trainer, model, ANY, ANY, 0, 0), - call.on_test_batch_start(trainer, model, ANY, 1, 0), - call.on_test_batch_end(trainer, model, ANY, ANY, 1, 0), - call.on_test_epoch_end(trainer, model), - call.on_epoch_end(trainer, model), - call.on_test_end(trainer, model), - call.teardown(trainer, model, 'test'), - ] - - -def test_trainer_callback_hook_system_validate(tmpdir): - """Test the callback hook system for validate.""" - - model = BoringModel() - callback_mock = MagicMock() - trainer = Trainer( - default_root_dir=tmpdir, - callbacks=[callback_mock], - max_epochs=1, - limit_val_batches=2, - progress_bar_refresh_rate=0, - ) - - trainer.validate(model) - - assert callback_mock.method_calls == [ - call.on_init_start(trainer), - call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), - call.setup(trainer, model, 'validate'), - call.on_configure_sharded_model(trainer, model), - call.on_validation_start(trainer, model), - call.on_epoch_start(trainer, model), - call.on_validation_epoch_start(trainer, model), - call.on_validation_batch_start(trainer, model, ANY, 0, 0), - call.on_validation_batch_end(trainer, model, ANY, ANY, 0, 0), - call.on_validation_batch_start(trainer, model, ANY, 1, 0), - call.on_validation_batch_end(trainer, model, ANY, ANY, 1, 0), - call.on_validation_epoch_end(trainer, model), - call.on_epoch_end(trainer, model), - call.on_validation_end(trainer, model), - call.teardown(trainer, model, 'validate'), - ] - - -# TODO: add callback tests for predict and tune - - def test_callbacks_configured_in_model(tmpdir): """ Test the callback system with callbacks added through the model hook. 
""" diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 7d303e6ed00d6..d7a6f15459912 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -86,7 +86,7 @@ def test_resume_early_stopping_from_checkpoint(tmpdir): callbacks=[early_stop_callback], ) - with pytest.raises(MisconfigurationException, match=r'.*you restored a checkpoint with current_epoch*'): + with pytest.raises(MisconfigurationException, match=r'You restored a checkpoint with current_epoch'): new_trainer.fit(model) @@ -123,7 +123,7 @@ def test_early_stopping_patience(tmpdir, loss_values: list, patience: int, expec """Test to ensure that early stopping is not triggered before patience is exhausted.""" class ModelOverrideValidationReturn(BoringModel): - validation_return_values = torch.Tensor(loss_values) + validation_return_values = torch.tensor(loss_values) def validation_epoch_end(self, outputs): loss = self.validation_return_values[self.current_epoch] @@ -137,6 +137,7 @@ def validation_epoch_end(self, outputs): val_check_interval=1.0, num_sanity_val_steps=0, max_epochs=10, + progress_bar_refresh_rate=0, ) trainer.fit(model) assert trainer.current_epoch == expected_stop_epoch @@ -176,6 +177,7 @@ def training_epoch_end(self, outputs): callbacks=[early_stop_callback], num_sanity_val_steps=0, max_epochs=10, + progress_bar_refresh_rate=0, ) trainer.fit(model) assert trainer.current_epoch == expected_stop_epoch diff --git a/tests/callbacks/test_finetuning_callback.py b/tests/callbacks/test_finetuning_callback.py index 53d34c4645bef..7492bcac7804a 100644 --- a/tests/callbacks/test_finetuning_callback.py +++ b/tests/callbacks/test_finetuning_callback.py @@ -27,7 +27,8 @@ class TestBackboneFinetuningCallback(BackboneFinetuning): - def on_train_epoch_end(self, trainer, pl_module): + def on_train_epoch_start(self, trainer, pl_module): + super().on_train_epoch_start(trainer, pl_module) epoch = trainer.current_epoch if self.unfreeze_backbone_at_epoch <= epoch: optimizer = trainer.optimizers[0] @@ -275,7 +276,7 @@ def configure_optimizers(self): model = FreezeModel() cb = OnEpochLayerFinetuning() trainer = Trainer(max_epochs=10, resume_from_checkpoint=chk.last_model_path, callbacks=[cb]) - with pytest.raises(IndexError, match="index 6 is out of range"): + with pytest.raises(ValueError, match="loaded state dict has a different number of parameter groups"): trainer.fit(model) @@ -307,7 +308,11 @@ def configure_optimizers(self): trainer.fit(model) -def test_deep_nested_model(): +def test_complex_nested_model(): + """ + Test flattening, freezing, and thawing of models which contain parent (non-leaf) modules with parameters + directly themselves rather than exclusively their submodules containing parameters. 
+ """ class ConvBlock(nn.Module): @@ -322,23 +327,41 @@ def forward(self, x): x = self.act(x) return self.bn(x) + class ConvBlockParam(nn.Module): + + def __init__(self, in_channels, out_channels): + super().__init__() + self.module_dict = nn.ModuleDict({ + "conv": nn.Conv2d(in_channels, out_channels, 3), + "act": nn.ReLU(), + }) + # add trivial test parameter to convblock to validate parent (non-leaf) module parameter handling + self.parent_param = nn.Parameter(torch.zeros((1), dtype=torch.float)) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x = self.module_dict["conv"](x) + x = self.module_dict["act"](x) + return self.bn(x) + model = nn.Sequential( OrderedDict([ - ("encoder", nn.Sequential(ConvBlock(3, 64), ConvBlock(64, 128))), + ("encoder", nn.Sequential(ConvBlockParam(3, 64), ConvBlock(64, 128))), ("decoder", ConvBlock(128, 10)), ]) ) - # There's 9 leaf layers in that model - assert len(BaseFinetuning.flatten_modules(model)) == 9 + # There are 10 leaf modules or parent modules w/ parameters in the test model + assert len(BaseFinetuning.flatten_modules(model)) == 10 BaseFinetuning.freeze(model.encoder, train_bn=True) - assert not model.encoder[0].conv.weight.requires_grad + assert not model.encoder[0].module_dict["conv"].weight.requires_grad # Validate a leaf module parameter is frozen + assert not model.encoder[0].parent_param.requires_grad # Validate the parent module parameter is frozen assert model.encoder[0].bn.weight.requires_grad BaseFinetuning.make_trainable(model) encoder_params = list(BaseFinetuning.filter_params(model.encoder, train_bn=True)) - # The 8 parameters of the encoder are: - # conv0.weight, conv0.bias, bn0.weight, bn0.bias + # The 9 parameters of the encoder are: + # conv0.weight, conv0.bias, bn0.weight, bn0.bias, parent_param # conv1.weight, conv1.bias, bn1.weight, bn1.bias - assert len(encoder_params) == 8 + assert len(encoder_params) == 9 diff --git a/tests/callbacks/test_lambda_function.py b/tests/callbacks/test_lambda_function.py index 8d9f85fa56e8a..845846dfd1cfc 100644 --- a/tests/callbacks/test_lambda_function.py +++ b/tests/callbacks/test_lambda_function.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import inspect +from functools import partial from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import Callback, LambdaCallback @@ -28,9 +29,13 @@ def on_train_epoch_start(self): raise KeyboardInterrupt checker = set() - hooks = [m for m, _ in inspect.getmembers(Callback, predicate=inspect.isfunction)] - hooks_args = {h: (lambda x: lambda *_: checker.add(x))(h) for h in hooks} - hooks_args["on_save_checkpoint"] = (lambda x: lambda *_: [checker.add(x)])("on_save_checkpoint") + + def call(hook, *_, **__): + checker.add(hook) + + hooks = {m for m, _ in inspect.getmembers(Callback, predicate=inspect.isfunction)} + hooks_args = {h: partial(call, h) for h in hooks} + hooks_args["on_save_checkpoint"] = lambda *_: [checker.add('on_save_checkpoint')] model = CustomModel() @@ -59,4 +64,4 @@ def on_train_epoch_start(self): trainer.test(model) trainer.predict(model) - assert checker == set(hooks) + assert checker == hooks diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index bea6c45e95ced..7956b756dcb3c 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytest +import torch from torch import optim import tests.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import LearningRateMonitor +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.callbacks.finetuning import BackboneFinetuning from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel from tests.helpers.datamodules import ClassifDataModule @@ -278,3 +281,173 @@ def configure_optimizers(self): ) trainer.fit(TestModel()) assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['my_logging_name'] + + +def test_lr_monitor_custom_pg_name(tmpdir): + + class TestModel(BoringModel): + + def configure_optimizers(self): + optimizer = torch.optim.SGD([{'params': list(self.layer.parameters()), 'name': 'linear'}], lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + lr_monitor = LearningRateMonitor() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_val_batches=2, + limit_train_batches=2, + callbacks=[lr_monitor], + progress_bar_refresh_rate=0, + weights_summary=None, + ) + trainer.fit(TestModel()) + assert lr_monitor.lr_sch_names == ['lr-SGD'] + assert list(lr_monitor.lrs) == ['lr-SGD/linear'] + + +def test_lr_monitor_duplicate_custom_pg_names(tmpdir): + tutils.reset_seed() + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.linear_a = torch.nn.Linear(32, 16) + self.linear_b = torch.nn.Linear(16, 2) + + def forward(self, x): + x = self.linear_a(x) + x = self.linear_b(x) + return x + + def configure_optimizers(self): + param_groups = [ + { + 'params': list(self.linear_a.parameters()), + 'name': 'linear' + }, + { + 'params': list(self.linear_b.parameters()), + 'name': 'linear' + }, + ] + optimizer = torch.optim.SGD(param_groups, lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + lr_monitor = LearningRateMonitor() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_val_batches=2, + limit_train_batches=2, + callbacks=[lr_monitor], + progress_bar_refresh_rate=0, + 
weights_summary=None, + ) + + with pytest.raises( + MisconfigurationException, match='A single `Optimizer` cannot have multiple parameter groups with identical' + ): + trainer.fit(TestModel()) + + +def test_multiple_optimizers_basefinetuning(tmpdir): + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.backbone = torch.nn.Sequential( + torch.nn.Linear(32, 32), + torch.nn.Linear(32, 32), + torch.nn.Linear(32, 32), + torch.nn.ReLU(True), + ) + self.layer = torch.nn.Linear(32, 2) + + def training_step(self, batch, batch_idx, optimizer_idx): + return super().training_step(batch, batch_idx) + + def forward(self, x): + return self.layer(self.backbone(x)) + + def configure_optimizers(self): + parameters = list(filter(lambda p: p.requires_grad, self.parameters())) + opt = optim.Adam(parameters, lr=0.1) + opt_2 = optim.Adam(parameters, lr=0.1) + opt_3 = optim.Adam(parameters, lr=0.1) + optimizers = [opt, opt_2, opt_3] + schedulers = [ + optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.5), + optim.lr_scheduler.StepLR(opt_2, step_size=1, gamma=0.5), + ] + return optimizers, schedulers + + class Check(Callback): + + def on_train_epoch_start(self, trainer, pl_module) -> None: + num_param_groups = sum([len(opt.param_groups) for opt in trainer.optimizers]) + assert lr_monitor.lr_sch_names == ['lr-Adam', 'lr-Adam-1'] + if trainer.current_epoch == 0: + assert num_param_groups == 3 + elif trainer.current_epoch == 1: + assert num_param_groups == 4 + assert list(lr_monitor.lrs) == ['lr-Adam-1', 'lr-Adam/pg1', 'lr-Adam/pg2'] + elif trainer.current_epoch == 2: + assert num_param_groups == 5 + assert list(lr_monitor.lrs) == ['lr-Adam/pg1', 'lr-Adam/pg2', 'lr-Adam-1/pg1', 'lr-Adam-1/pg2'] + else: + expected = ['lr-Adam/pg1', 'lr-Adam/pg2', 'lr-Adam-1/pg1', 'lr-Adam-1/pg2', 'lr-Adam-1/pg3'] + assert list(lr_monitor.lrs) == expected + + class TestFinetuning(BackboneFinetuning): + + def freeze_before_training(self, pl_module): + self.freeze(pl_module.backbone[0]) + self.freeze(pl_module.backbone[1]) + self.freeze(pl_module.layer) + + def finetune_function(self, pl_module, epoch: int, optimizer, opt_idx: int): + """Called when the epoch begins.""" + if epoch == 1 and opt_idx == 0: + self.unfreeze_and_add_param_group(pl_module.backbone[0], optimizer, lr=0.1) + if epoch == 2 and opt_idx == 1: + self.unfreeze_and_add_param_group(pl_module.layer, optimizer, lr=0.1) + + if epoch == 3 and opt_idx == 1: + assert len(optimizer.param_groups) == 2 + self.unfreeze_and_add_param_group(pl_module.backbone[1], optimizer, lr=0.1) + assert len(optimizer.param_groups) == 3 + + lr_monitor = LearningRateMonitor() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=5, + limit_val_batches=0, + limit_train_batches=2, + callbacks=[TestFinetuning(), lr_monitor, Check()], + progress_bar_refresh_rate=0, + weights_summary=None, + checkpoint_callback=False + ) + model = TestModel() + model.training_epoch_end = None + trainer.fit(model) + + expected = [0.1, 0.05, 0.025, 0.0125, 0.00625] + assert lr_monitor.lrs['lr-Adam/pg1'] == expected + + expected = [0.1, 0.05, 0.025, 0.0125] + assert lr_monitor.lrs['lr-Adam/pg2'] == expected + + expected = [0.1, 0.05, 0.025, 0.0125, 0.00625] + assert lr_monitor.lrs['lr-Adam-1/pg1'] == expected + + expected = [0.1, 0.05, 0.025] + assert lr_monitor.lrs['lr-Adam-1/pg2'] == expected + + expected = [0.1, 0.05] + assert lr_monitor.lrs['lr-Adam-1/pg3'] == expected diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py index 
f4f8f34c1b4c1..aafb29d51b161 100644 --- a/tests/callbacks/test_progress_bar.py +++ b/tests/callbacks/test_progress_bar.py @@ -20,12 +20,14 @@ import pytest import torch +from torch.utils.data.dataloader import DataLoader from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint, ProgressBar, ProgressBarBase from pytorch_lightning.callbacks.progress import tqdm from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers import BoringModel +from tests.helpers.boring_model import BoringModel, RandomDataset +from tests.helpers.runif import RunIf @pytest.mark.parametrize( @@ -192,11 +194,11 @@ class CurrentProgressBar(ProgressBar): def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): super().on_train_batch_start(trainer, pl_module, batch, batch_idx, dataloader_idx) - assert self.train_batch_idx == trainer.train_loop.batch_idx + assert self.train_batch_idx == trainer.fit_loop.batch_idx def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) - assert self.train_batch_idx == trainer.train_loop.batch_idx + 1 + assert self.train_batch_idx == trainer.fit_loop.batch_idx + 1 if not self.is_disabled and self.train_batch_idx % self.refresh_rate == 0: assert self.main_progress_bar.n == self.train_batch_idx self.train_batches_seen += 1 @@ -350,7 +352,8 @@ def test_main_progress_bar_update_amount( checkpoint_callback=False, ) trainer.fit(model) - progress_bar.main_progress_bar.update.assert_has_calls([call(delta) for delta in train_deltas]) + if train_batches > 0: + progress_bar.main_progress_bar.update.assert_has_calls([call(delta) for delta in train_deltas]) if val_batches > 0: progress_bar.val_progress_bar.update.assert_has_calls([call(delta) for delta in val_deltas]) @@ -384,8 +387,9 @@ def test_tensor_to_float_conversion(tmpdir): class TestModel(BoringModel): def training_step(self, batch, batch_idx): - self.log('foo', torch.tensor(0.123), prog_bar=True) - self.log('bar', {"baz": torch.tensor([1])}, prog_bar=True) + self.log('a', torch.tensor(0.123), prog_bar=True, on_epoch=False) + self.log('b', {"b1": torch.tensor([1])}, prog_bar=True, on_epoch=False) + self.log('c', {"c1": 2}, prog_bar=True, on_epoch=False) return super().training_step(batch, batch_idx) trainer = Trainer( @@ -397,9 +401,12 @@ def training_step(self, batch, batch_idx): ) trainer.fit(TestModel()) + torch.testing.assert_allclose(trainer.progress_bar_metrics['a'], 0.123) + assert trainer.progress_bar_metrics['b'] == {'b1': 1.0} + assert trainer.progress_bar_metrics['c'] == {'c1': 2.0} pbar = trainer.progress_bar_callback.main_progress_bar actual = str(pbar.postfix) - assert actual.endswith("foo=0.123, bar={'baz': tensor([1])}") + assert actual.endswith("a=0.123, b={'b1': 1.0}, c={'c1': 2.0}"), actual @pytest.mark.parametrize( @@ -533,3 +540,58 @@ def test_progress_bar_can_be_pickled(): pickle.dumps(bar) trainer.predict(model) pickle.dumps(bar) + + +@RunIf(min_gpus=2, special=True) +def test_progress_bar_max_val_check_interval_0(tmpdir): + _test_progress_bar_max_val_check_interval( + tmpdir, + total_train_samples=8, + train_batch_size=4, + total_val_samples=2, + val_batch_size=1, + val_check_interval=0.2 + ) + + +@RunIf(min_gpus=2, special=True) +def test_progress_bar_max_val_check_interval_1(tmpdir): + _test_progress_bar_max_val_check_interval( + tmpdir, + total_train_samples=8, + train_batch_size=4, + 
total_val_samples=2, + val_batch_size=1, + val_check_interval=0.5 + ) + + +def _test_progress_bar_max_val_check_interval( + tmpdir, total_train_samples, train_batch_size, total_val_samples, val_batch_size, val_check_interval +): + world_size = 2 + train_data = DataLoader(RandomDataset(32, total_train_samples), batch_size=train_batch_size) + val_data = DataLoader(RandomDataset(32, total_val_samples), batch_size=val_batch_size) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + num_sanity_val_steps=0, + max_epochs=1, + weights_summary=None, + val_check_interval=val_check_interval, + gpus=world_size, + accelerator="ddp", + ) + trainer.fit(model, train_dataloader=train_data, val_dataloaders=val_data) + + total_train_batches = total_train_samples // (train_batch_size * world_size) + val_check_batch = max(1, int(total_train_batches * val_check_interval)) + assert trainer.val_check_batch == val_check_batch + val_checks_per_epoch = total_train_batches / val_check_batch + total_val_batches = total_val_samples // (val_batch_size * world_size) + assert trainer.progress_bar_callback.total_train_batches == total_train_batches + assert trainer.progress_bar_callback.total_val_batches == total_val_batches + total_val_batches = total_val_batches * val_checks_per_epoch + if trainer.is_global_zero: + assert trainer.progress_bar_callback.main_progress_bar.total == total_train_batches + total_val_batches diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index d4957905454d8..1a5ddad64106e 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -161,14 +161,45 @@ def test_pruning_callback( ) -@RunIf(special=True) -@pytest.mark.parametrize("parameters_to_prune", [False, True]) -@pytest.mark.parametrize("use_global_unstructured", [False, True]) -def test_pruning_callback_ddp(tmpdir, use_global_unstructured: bool, parameters_to_prune: bool): +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_0(tmpdir): train_with_pruning_callback( tmpdir, - parameters_to_prune=parameters_to_prune, - use_global_unstructured=use_global_unstructured, + parameters_to_prune=False, + use_global_unstructured=False, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_1(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=False, + use_global_unstructured=True, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_2(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=True, + use_global_unstructured=False, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_3(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=True, + use_global_unstructured=True, accelerator="ddp", gpus=2, ) diff --git a/tests/callbacks/test_stochastic_weight_avg.py b/tests/callbacks/test_stochastic_weight_avg.py index 81efc12b34662..8518fe16f0359 100644 --- a/tests/callbacks/test_stochastic_weight_avg.py +++ b/tests/callbacks/test_stochastic_weight_avg.py @@ -23,7 +23,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6 from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers import BoringModel, RandomDataset +from tests.helpers.boring_model import BoringModel, RandomDataset, RandomIterableDataset from tests.helpers.runif import RunIf if _TORCH_GREATER_EQUAL_1_6: @@ -33,7 
+33,7 @@ class SwaTestModel(BoringModel): - def __init__(self, batchnorm: bool = True, interval: str = "epoch"): + def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False): super().__init__() layers = [nn.Linear(32, 32)] if batchnorm: @@ -41,6 +41,7 @@ def __init__(self, batchnorm: bool = True, interval: str = "epoch"): layers += [nn.ReLU(), nn.Linear(32, 2)] self.layer = nn.Sequential(*layers) self.interval = interval + self.iterable_dataset = iterable_dataset def training_step(self, batch, batch_idx): output = self.forward(batch) @@ -48,7 +49,11 @@ def training_step(self, batch, batch_idx): return {"loss": loss} def train_dataloader(self): - return DataLoader(RandomDataset(32, 64), batch_size=2) + + dset_cls = RandomIterableDataset if self.iterable_dataset else RandomDataset + dset = dset_cls(32, 64) + + return DataLoader(dset, batch_size=2) def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) @@ -74,7 +79,7 @@ def transfer_weights(self, *args, **kwargs): def on_train_epoch_start(self, trainer, *args): super().on_train_epoch_start(trainer, *args) - assert trainer.train_loop._skip_backward == (trainer.current_epoch > self.swa_end) + assert trainer.fit_loop._skip_backward == (trainer.current_epoch > self.swa_end) if self.swa_start <= trainer.current_epoch: assert isinstance(trainer.lr_schedulers[0]["scheduler"], SWALR) assert trainer.lr_schedulers[0]["interval"] == "epoch" @@ -92,7 +97,7 @@ def on_train_end(self, trainer, pl_module): super().on_train_end(trainer, pl_module) # make sure these are correctly set again - assert not trainer.train_loop._skip_backward + assert not trainer.fit_loop._skip_backward assert trainer.accumulate_grad_batches == 2 assert trainer.num_training_batches == 5 @@ -107,8 +112,10 @@ def on_train_end(self, trainer, pl_module): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def train_with_swa(tmpdir, batchnorm=True, accelerator=None, gpus=None, num_processes=1, interval="epoch"): - model = SwaTestModel(batchnorm=batchnorm, interval=interval) +def train_with_swa( + tmpdir, batchnorm=True, accelerator=None, gpus=None, num_processes=1, interval="epoch", iterable_dataset=False +): + model = SwaTestModel(batchnorm=batchnorm, interval=interval, iterable_dataset=iterable_dataset) swa_start = 2 max_epochs = 5 swa_callback = SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1) @@ -155,8 +162,9 @@ def test_swa_callback_1_gpu(tmpdir): @RunIf(min_torch="1.6.0") @pytest.mark.parametrize("batchnorm", (True, False)) -def test_swa_callback(tmpdir, batchnorm: bool): - train_with_swa(tmpdir, batchnorm=batchnorm) +@pytest.mark.parametrize('iterable_dataset', (True, False)) +def test_swa_callback(tmpdir, batchnorm: bool, iterable_dataset: bool): + train_with_swa(tmpdir, batchnorm=batchnorm, iterable_dataset=iterable_dataset) @RunIf(min_torch="1.6.0") diff --git a/tests/callbacks/test_timer.py b/tests/callbacks/test_timer.py index c27eebbeb7805..16e01a6adcaf4 100644 --- a/tests/callbacks/test_timer.py +++ b/tests/callbacks/test_timer.py @@ -95,7 +95,7 @@ def test_timer_time_remaining(time_mock): assert round(timer.time_elapsed()) == 3 -def test_timer_stops_training(tmpdir): +def test_timer_stops_training(tmpdir, caplog): """ Test that the timer stops training before reaching max_epochs """ model = BoringModel() duration = timedelta(milliseconds=100) @@ -106,9 +106,12 @@ def test_timer_stops_training(tmpdir): max_epochs=1000, callbacks=[timer], ) - trainer.fit(model) + with 
caplog.at_level(logging.INFO): + trainer.fit(model) assert trainer.global_step > 1 assert trainer.current_epoch < 999 + assert "Time limit reached." in caplog.text + assert "Signaling Trainer to stop." in caplog.text @pytest.mark.parametrize("interval", ["step", "epoch"]) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 9fdd69dba7a9a..8617a9f8f7050 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -107,8 +107,17 @@ def training_step(self, batch, batch_idx): @mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)]) -def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): +def test_top_k_ddp_0(save_mock, tmpdir): + _top_k_ddp(save_mock, tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) + + +@mock.patch('torch.save') +@RunIf(special=True, min_gpus=2) +def test_top_k_ddp_1(save_mock, tmpdir): + _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) + + +def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): @@ -120,7 +129,7 @@ def training_step(self, batch, batch_idx): def training_epoch_end(self, outputs) -> None: local_rank = int(os.getenv("LOCAL_RANK")) if self.trainer.is_global_zero: - self.log('my_loss_2', (1 + local_rank), on_epoch=True) + self.log('my_loss_2', (1 + local_rank), on_epoch=True, rank_zero_only=True) data = str(self.global_rank) obj = [[data], (data, ), set(data)] out = self.trainer.training_type_plugin.broadcast(obj) diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py index fbcb700e4a3d2..13ae7300375e8 100644 --- a/tests/checkpointing/test_legacy_checkpoints.py +++ b/tests/checkpointing/test_legacy_checkpoints.py @@ -65,6 +65,12 @@ "1.3.0", "1.3.1", "1.3.2", + "1.3.3", + "1.3.4", + "1.3.5", + "1.3.6", + "1.3.7", + "1.3.8", ] ) def test_resume_legacy_checkpoints(tmpdir, pl_version: str): diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 2f867d4e998b4..82432cfc7c601 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -83,6 +83,7 @@ def __init__(self): super().__init__() self.train_log_epochs = torch.randn(max_epochs, limit_train_batches) self.val_logs = torch.randn(max_epochs, limit_val_batches) + self.scores = [] def training_step(self, batch, batch_idx): log_value = self.train_log_epochs[self.current_epoch, batch_idx] @@ -109,6 +110,14 @@ def configure_optimizers(self): return [optimizer], [lr_scheduler] + def on_train_epoch_end(self): + if 'train' in monitor: + self.scores.append(self.trainer.logged_metrics[monitor]) + + def on_validation_epoch_end(self): + if not self.trainer.sanity_checking and 'val' in monitor: + self.scores.append(self.trainer.logged_metrics[monitor]) + filename = '{' + f'{monitor}' + ':.4f}-{epoch}' checkpoint = ModelCheckpoint(dirpath=tmpdir, filename=filename, monitor=monitor, save_top_k=-1) @@ -131,13 +140,12 @@ def configure_optimizers(self): assert trainer.state.finished, f"Training failed with {trainer.state}" ckpt_files = list(Path(tmpdir).glob('*.ckpt')) - scores = [metric[monitor] for metric in trainer.dev_debugger.logged_metrics if monitor in metric] lr_scheduler_debug = 
trainer.dev_debugger.saved_lr_scheduler_updates - assert len(ckpt_files) == len(scores) == max_epochs + assert len(ckpt_files) == len(model.scores) == max_epochs assert len(lr_scheduler_debug) == max_epochs for epoch in range(max_epochs): - score = scores[epoch] + score = model.scores[epoch] expected_score = getattr(model, f'{monitor}s')[epoch].mean().item() expected_filename = f'{monitor}={score:.4f}-epoch={epoch}.ckpt' assert math.isclose(score, expected_score, rel_tol=1e-4) @@ -154,10 +162,9 @@ def configure_optimizers(self): if not reduce_lr_on_plateau: actual_step_count = chk['lr_schedulers'][0]['_step_count'] actual_lr = chk['lr_schedulers'][0]['_last_lr'][0] - # if validation_step_none, the checkpoint gets saved after the learning rate update - # so we need to increase the count by one - assert actual_step_count == epoch + 1 + validation_step_none - assert actual_lr == lr * gamma**(epoch + validation_step_none) + # checkpoint is saved after updating lr_scheduler states + assert actual_step_count == epoch + 2 # step_count starts at 1 + assert actual_lr == lr * gamma**(epoch + 1) assert lr_scheduler_debug[epoch]['monitor_val'] == (score if reduce_lr_on_plateau else None) assert lr_scheduler_debug[epoch]['monitor_key'] == (monitor if reduce_lr_on_plateau else None) @@ -193,6 +200,7 @@ def __init__(self): super().__init__() self.val_logs = torch.randn(per_epoch_val_checks * max_epochs, limit_val_batches) self.val_loop_count = 0 + self.scores = [] def validation_step(self, batch, batch_idx): log_value = self.val_logs[self.val_loop_count, batch_idx] @@ -202,6 +210,7 @@ def validation_step(self, batch, batch_idx): def validation_epoch_end(self, outputs): self.val_loop_count += 1 super().validation_epoch_end(outputs) + self.scores.append(self.trainer.logged_metrics[monitor]) def configure_optimizers(self): optimizer = optim.SGD(self.parameters(), lr=lr) @@ -236,24 +245,21 @@ def configure_optimizers(self): assert trainer.state.finished, f"Training failed with {trainer.state}" ckpt_files = list(Path(tmpdir).glob('*.ckpt')) - scores = [metric[monitor] for metric in trainer.dev_debugger.logged_metrics if monitor in metric] lr_scheduler_debug = trainer.dev_debugger.saved_lr_scheduler_updates - # on_train_end ckpt callback is called which creates an additional ckpt in case no ckpt is created at the - # end of epoch, thus if val_check_interval doesn't align with the training steps we create an additional ckpt - additional_ckpt, additional_ckpt_path = False, None - if not epoch_aligned: - additional_ckpt_path = [f for f in ckpt_files if 'v1' in f.stem][0] - additional_ckpt = True - - assert len(ckpt_files) == len(scores) + additional_ckpt == per_epoch_val_checks * max_epochs + additional_ckpt + assert len(ckpt_files) == len(model.scores) == per_epoch_val_checks * max_epochs assert len(lr_scheduler_debug) == max_epochs def _make_assertions(epoch, ix, version=''): global_ix = ix + per_epoch_val_checks * epoch duplicated = bool(version) - score = scores[global_ix] + # checkpoint saved at the end of training epoch will have updated lr_scheduler states + epoch_end_checkpoint = duplicated + if epoch_aligned: + epoch_end_checkpoint = ix == (per_epoch_val_checks - 1) + + score = model.scores[global_ix] expected_score = getattr(model, f'{monitor}s')[global_ix].mean().item() expected_filename = f'{monitor}={score:.4f}-epoch={epoch}{version}.ckpt' assert math.isclose(score, expected_score, rel_tol=1e-4) @@ -272,8 +278,8 @@ def _make_assertions(epoch, ix, version=''): if not reduce_lr_on_plateau: 
actual_step_count = chk['lr_schedulers'][0]['_step_count'] actual_lr = chk['lr_schedulers'][0]['_last_lr'][0] - assert actual_step_count == epoch + 1 + duplicated - assert actual_lr == lr * gamma**(epoch + duplicated) + assert actual_step_count == epoch + 1 + epoch_end_checkpoint + assert actual_lr == lr * gamma**(epoch + epoch_end_checkpoint) return score @@ -284,10 +290,6 @@ def _make_assertions(epoch, ix, version=''): assert lr_scheduler_debug[epoch]['monitor_val'] == (score if reduce_lr_on_plateau else None) assert lr_scheduler_debug[epoch]['monitor_key'] == (monitor if reduce_lr_on_plateau else None) - # check the ckpt file saved on_train_end - if additional_ckpt_path: - _make_assertions(max_epochs - 1, per_epoch_val_checks - 1, version='-v1') - @pytest.mark.parametrize("save_top_k", [-1, 0, 1, 2]) def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k: int): @@ -327,7 +329,7 @@ def test_model_checkpoint_to_yaml(tmpdir, save_top_k: int): path_yaml = os.path.join(tmpdir, 'best_k_models.yaml') checkpoint.to_yaml(path_yaml) d = yaml.full_load(open(path_yaml, 'r')) - best_k = {k: v for k, v in checkpoint.best_k_models.items()} + best_k = dict(checkpoint.best_k_models.items()) assert d == best_k @@ -810,7 +812,7 @@ def test_model_checkpoint_topk_all(tmpdir): assert checkpoint_callback.best_model_path == tmpdir / "epoch=2.ckpt" assert checkpoint_callback.best_model_score == epochs - 1 assert len(os.listdir(tmpdir)) == len(checkpoint_callback.best_k_models) == epochs - assert set(checkpoint_callback.best_k_models.keys()) == set(str(tmpdir / f"epoch={i}.ckpt") for i in range(epochs)) + assert set(checkpoint_callback.best_k_models.keys()) == {str(tmpdir / f"epoch={i}.ckpt") for i in range(epochs)} assert checkpoint_callback.kth_best_model_path == tmpdir / 'epoch=0.ckpt' @@ -879,6 +881,8 @@ def test_model_checkpoint_save_last_warning( default_root_dir=tmpdir, callbacks=[ckpt], max_epochs=max_epochs, + limit_train_batches=1, + limit_val_batches=1, ) with caplog.at_level(logging.INFO): trainer.fit(model) @@ -897,6 +901,8 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): default_root_dir=tmpdir, callbacks=[model_checkpoint], max_epochs=num_epochs, + limit_train_batches=2, + limit_val_batches=2, ) trainer.fit(model) @@ -907,7 +913,9 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): ckpt_last_epoch = torch.load(path_last_epoch) ckpt_last = torch.load(path_last) - assert all(ckpt_last_epoch[k] == ckpt_last[k] for k in ("epoch", "global_step")) + + assert ckpt_last_epoch["epoch"] == ckpt_last["epoch"] + assert ckpt_last_epoch["global_step"] == ckpt_last["global_step"] ch_type = type(model_checkpoint) assert ckpt_last["callbacks"][ch_type] == ckpt_last_epoch["callbacks"][ch_type] @@ -1008,7 +1016,6 @@ def validation_epoch_end(self, *_): ... 
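The scheduler arithmetic asserted above follows from how torch's `StepLR` does its bookkeeping. A minimal sketch, assuming current PyTorch behavior for the private `_step_count` attribute: construction already counts as the first step, and with `step_size=1` every subsequent `step()` multiplies the learning rate by `gamma`, so a checkpoint written after `epoch + 1` scheduler steps sees `_step_count == epoch + 2` and `lr == lr0 * gamma ** (epoch + 1)`.

    import torch
    from torch import optim

    lr0, gamma = 0.1, 0.5
    opt = optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=lr0)
    sched = optim.lr_scheduler.StepLR(opt, step_size=1, gamma=gamma)
    assert sched._step_count == 1  # construction performs the first "step"

    for epoch in range(3):
        opt.step()   # avoid the "scheduler stepped before optimizer" warning
        sched.step()
        assert sched._step_count == epoch + 2
        assert sched.get_last_lr()[0] == lr0 * gamma ** (epoch + 1)

Note that the float comparison is exact here only because `gamma` is a power of two; a real test should compare with a tolerance.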
def assert_trainer_init(trainer): - assert not trainer.checkpoint_connector.has_trained assert trainer.global_step == 0 assert trainer.current_epoch == 0 @@ -1044,7 +1051,6 @@ def assert_checkpoint_log_dir(idx): model = ExtendedBoringModel() trainer.fit(model) - assert trainer.checkpoint_connector.has_trained assert trainer.global_step == epochs * limit_train_batches assert trainer.current_epoch == epochs - 1 assert_checkpoint_log_dir(0) @@ -1068,19 +1074,16 @@ def assert_checkpoint_log_dir(idx): model = ExtendedBoringModel() trainer.test(model) - assert not trainer.checkpoint_connector.has_trained # resume_from_checkpoint is resumed when calling `.fit` assert trainer.global_step == 0 assert trainer.current_epoch == 0 trainer.fit(model) - assert not trainer.checkpoint_connector.has_trained assert trainer.global_step == epochs * limit_train_batches assert trainer.current_epoch == epochs assert_checkpoint_log_dir(idx) trainer.validate(model) - assert not trainer.checkpoint_connector.has_trained assert trainer.global_step == epochs * limit_train_batches assert trainer.current_epoch == epochs @@ -1259,10 +1262,11 @@ def test_ckpt_version_after_rerun_new_trainer(tmpdir): # check best_k_models state expected = {"epoch=0-v1.ckpt", "epoch=1-v1.ckpt"} if i else {"epoch=0.ckpt", "epoch=1.ckpt"} - assert {Path(f).name for f in mc.best_k_models.keys()} == expected + assert {Path(f).name for f in mc.best_k_models} == expected # check created ckpts - assert set(f.basename for f in tmpdir.listdir()) == { + actual = {f.basename for f in tmpdir.listdir()} + assert actual == { "epoch=0.ckpt", "epoch=1.ckpt", "epoch=0-v1.ckpt", @@ -1288,13 +1292,13 @@ def test_ckpt_version_after_rerun_same_trainer(tmpdir): progress_bar_refresh_rate=0, ) trainer.fit(BoringModel()) - trainer.train_loop.max_epochs = 4 + trainer.fit_loop.max_epochs = 4 trainer.fit(BoringModel()) ckpt_range = range(mc.STARTING_VERSION, trainer.max_epochs + mc.STARTING_VERSION) expected = {'test.ckpt', *[f"test-v{i}.ckpt" for i in ckpt_range]} # check best_k_models state - assert {Path(f).name for f in mc.best_k_models.keys()} == expected + assert {Path(f).name for f in mc.best_k_models} == expected # check created ckpts assert set(os.listdir(tmpdir)) == expected diff --git a/tests/conftest.py b/tests/conftest.py index 7f6407ecfd82b..3f767d8b6fad2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,7 @@ from http.server import SimpleHTTPRequestHandler import pytest +import torch.distributed import torch.multiprocessing as mp @@ -41,6 +42,14 @@ def restore_env_variables(): os.environ.update(env_backup) +@pytest.fixture(scope="function", autouse=True) +def teardown_process_group(): + """ Ensures that the distributed process group gets closed before the next test runs. 
""" + yield + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + def pytest_configure(config): config.addinivalue_line("markers", "spawn: spawn test in a separate process using torch.multiprocessing.spawn") diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index e6500a15eeed1..30131cdcc80d2 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -34,6 +34,7 @@ @mock.patch("pytorch_lightning.trainer.trainer.Trainer.local_rank", new_callable=PropertyMock) def test_can_prepare_data(local_rank, node_rank): + model = BoringModel() dm = BoringDataModule() trainer = Trainer() trainer.datamodule = dm @@ -43,30 +44,54 @@ def test_can_prepare_data(local_rank, node_rank): # local rank = 0 (True) trainer.prepare_data_per_node = True + dm.random_full = None + dm._has_prepared_data = False local_rank.return_value = 0 assert trainer.local_rank == 0 assert trainer.data_connector.can_prepare_data() + trainer.data_connector.prepare_data(model) + assert dm.random_full is not None + # local rank = 1 (False) + dm.random_full = None + dm._has_prepared_data = False local_rank.return_value = 1 assert trainer.local_rank == 1 assert not trainer.data_connector.can_prepare_data() + trainer.data_connector.prepare_data(model) + assert dm.random_full is None + # prepare_data_per_node = False (prepare across all nodes) # global rank = 0 (True) + dm.random_full = None + dm._has_prepared_data = False trainer.prepare_data_per_node = False node_rank.return_value = 0 local_rank.return_value = 0 assert trainer.data_connector.can_prepare_data() + trainer.data_connector.prepare_data(model) + assert dm.random_full is not None + # global rank = 1 (False) + dm.random_full = None + dm._has_prepared_data = False node_rank.return_value = 1 local_rank.return_value = 0 assert not trainer.data_connector.can_prepare_data() + + trainer.data_connector.prepare_data(model) + assert dm.random_full is None + node_rank.return_value = 0 local_rank.return_value = 1 assert not trainer.data_connector.can_prepare_data() + trainer.data_connector.prepare_data(model) + assert dm.random_full is None + # 2 dm # prepar per node = True # local rank = 0 (True) @@ -355,12 +380,12 @@ def test_full_loop(tmpdir): assert dm.trainer is not None # validate - result = trainer.validate(datamodule=dm) + result = trainer.validate(model, dm) assert dm.trainer is not None assert result[0]['val_acc'] > 0.7 # test - result = trainer.test(datamodule=dm) + result = trainer.test(model, dm) assert dm.trainer is not None assert result[0]['test_acc'] > 0.6 @@ -524,46 +549,3 @@ def test_dm_init_from_datasets_dataloaders(iterable): call(test_dss[0], batch_size=4, shuffle=False, num_workers=0, pin_memory=True), call(test_dss[1], batch_size=4, shuffle=False, num_workers=0, pin_memory=True) ]) - - -def test_datamodule_hooks_calls(tmpdir): - """Test that repeated calls to DataHooks' hooks have no effect""" - - class TestDataModule(BoringDataModule): - setup_calls = [] - teardown_calls = [] - prepare_data_calls = 0 - - def setup(self, stage=None): - super().setup(stage=stage) - self.setup_calls.append(stage) - - def teardown(self, stage=None): - super().teardown(stage=stage) - self.teardown_calls.append(stage) - - def prepare_data(self): - super().prepare_data() - self.prepare_data_calls += 1 - - dm = TestDataModule() - dm.prepare_data() - dm.prepare_data() - dm.setup('fit') - dm.setup('fit') - dm.setup() - dm.setup() - dm.teardown('validate') - 
dm.teardown('validate')
-
-    assert dm.prepare_data_calls == 1
-    assert dm.setup_calls == ['fit', None]
-    assert dm.teardown_calls == ['validate']
-
-    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1)
-    trainer.test(BoringModel(), datamodule=dm)
-
-    # same number of calls
-    assert dm.prepare_data_calls == 1
-    assert dm.setup_calls == ['fit', None]
-    assert dm.teardown_calls == ['validate', 'test']
diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py
index 84d206dead22c..f05305c785c7e 100644
--- a/tests/core/test_lightning_module.py
+++ b/tests/core/test_lightning_module.py
@@ -13,14 +13,12 @@
 # limitations under the License.
 from unittest.mock import Mock
 
-import pytest
 import torch
 from torch import nn
 from torch.optim import Adam, SGD
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.loggers import TensorBoardLogger
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
 
@@ -76,27 +74,6 @@ def test_property_logger(tmpdir):
     assert model.logger == logger
 
 
-def test_automatic_optimization_raises(tmpdir):
-
-    class TestModel(BoringModel):
-
-        def optimizer_step(self, *_, **__):
-            pass
-
-    model = TestModel()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        limit_train_batches=2,
-        limit_val_batches=2,
-        accumulate_grad_batches=2,
-    )
-
-    with pytest.raises(
-        MisconfigurationException, match='overriding .* optimizer_step .* `accumulate_grad_batches` .* should be 1'
-    ):
-        trainer.fit(model)
-
-
 def test_params_groups_and_state_are_accessible(tmpdir):
 
     class TestModel(BoringModel):
diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py
index d79cae75956a2..b0e96c6d42fbf 100644
--- a/tests/core/test_lightning_optimizer.py
+++ b/tests/core/test_lightning_optimizer.py
@@ -123,7 +123,6 @@ def configure_optimizers(self):
         limit_val_batches=1,
         max_epochs=1,
         weights_summary=None,
-        accumulate_grad_batches=999,  # does not do anything if manual optimization
    )
 
    with patch.multiple(torch.optim.SGD, zero_grad=DEFAULT, step=DEFAULT) as sgd, \
@@ -243,7 +242,7 @@ def training_epoch_end(self, outputs):
             ...
 
         def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, **_):
-            assert optimizer_closure.__name__ == "training_step_and_backward_closure"
+            assert optimizer_closure.__name__ == "_training_step_and_backward_closure"
             # not passing the closure to the optimizer because step is mocked
             # zero_grad is called inside the closure
             if isinstance(optimizer, SGD) and batch_idx % 2 == 0:
diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py
index 3088743f71488..96e1bfaec14cb 100644
--- a/tests/core/test_memory.py
+++ b/tests/core/test_memory.py
@@ -17,6 +17,7 @@
 
 from pytorch_lightning import LightningModule, Trainer
 from pytorch_lightning.core.memory import ModelSummary, UNKNOWN_SIZE
+from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_9
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.advanced_models import ParityModuleRNN
@@ -101,6 +102,41 @@ def forward(self, x):
         return self.layer2(self.layer1(x))
 
 
+class LazyModel(LightningModule):
+    """ A model which contains lazy layers with uninitialized parameters. 
""" + + def __init__(self): + super().__init__() + self.layer1 = nn.LazyLinear(5) + self.layer2 = nn.LazyLinear(2) + + def forward(self, inp): + return self.layer2(self.layer1(inp)) + + +class DeepNestedModel(LightningModule): + """ A model with deep nested layers. """ + + def __init__(self): + super().__init__() + self.branch1 = nn.Sequential( + nn.Linear(5, 5), + nn.Sequential( + nn.Linear(5, 5), + nn.Sequential( + nn.Linear(5, 5), + nn.Sequential(nn.Linear(5, 5), nn.Sequential(nn.Linear(5, 5), nn.Sequential(nn.Linear(5, 3)))) + ) + ) + ) + self.branch2 = nn.Linear(5, 10) + self.head = UnorderedModel() + self.example_input_array = torch.rand(2, 5) + + def forward(self, inp): + return self.head(self.branch1(inp), self.branch2(inp)) + + def test_invalid_weights_summmary(): """ Test that invalid value for weights_summary raises an error. """ with pytest.raises(MisconfigurationException, match='`mode` can be None, .* got temp'): @@ -110,8 +146,8 @@ def test_invalid_weights_summmary(): Trainer(weights_summary='temp') -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) -def test_empty_model_summary_shapes(mode: ModelSummary): +@pytest.mark.parametrize('mode', ["full", "top"]) +def test_empty_model_summary_shapes(mode: str): """ Test that the summary works for models that have no submodules. """ model = EmptyModule() summary = model.summarize(mode=mode) @@ -121,7 +157,7 @@ def test_empty_model_summary_shapes(mode: ModelSummary): @RunIf(min_gpus=1) -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) +@pytest.mark.parametrize('mode', ["full", "top"]) @pytest.mark.parametrize(['device'], [ pytest.param(torch.device('cpu')), pytest.param(torch.device('cuda', 0)), @@ -164,18 +200,18 @@ def test_mixed_dtype_model_summary(): ] -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) -def test_hooks_removed_after_summarize(mode): +@pytest.mark.parametrize('max_depth', [-1, 0]) +def test_hooks_removed_after_summarize(max_depth): """ Test that all hooks were properly removed after summary, even ones that were not run. """ model = UnorderedModel() - summary = ModelSummary(model, mode=mode) + summary = ModelSummary(model, max_depth=max_depth) # hooks should be removed for _, layer in summary.summarize().items(): handle = layer._hook_handle assert handle.id not in handle.hooks_dict_ref() -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) +@pytest.mark.parametrize('mode', ["full", "top"]) def test_rnn_summary_shapes(mode): """ Test that the model summary works for RNNs. """ model = ParityModuleRNN() @@ -199,7 +235,7 @@ def test_rnn_summary_shapes(mode): ] -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) +@pytest.mark.parametrize('mode', ["full", "top"]) def test_summary_parameter_count(mode): """ Test that the summary counts the number of parameters in every submodule. """ model = UnorderedModel() @@ -213,7 +249,7 @@ def test_summary_parameter_count(mode): ] -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) +@pytest.mark.parametrize('mode', ["full", "top"]) def test_summary_layer_types(mode): """ Test that the summary displays the layer names correctly. 
""" model = UnorderedModel() @@ -227,7 +263,7 @@ def test_summary_layer_types(mode): ] -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) +@pytest.mark.parametrize('mode', ["full", "top"]) def test_summary_with_scripted_modules(mode): model = PartialScriptModel() summary = model.summarize(mode=mode) @@ -236,7 +272,7 @@ def test_summary_with_scripted_modules(mode): assert summary.out_sizes == [UNKNOWN_SIZE, [2, 2]] -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) +@pytest.mark.parametrize('mode', ["full", "top"]) @pytest.mark.parametrize(['example_input', 'expected_size'], [ pytest.param([], UNKNOWN_SIZE), pytest.param((1, 2, 3), [UNKNOWN_SIZE] * 3), @@ -270,7 +306,7 @@ def forward(self, *args, **kwargs): assert summary.in_sizes == [expected_size] -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) +@pytest.mark.parametrize('mode', ["full", "top"]) def test_model_size(mode): """ Test model size is calculated correctly. """ model = PreCalculatedModel() @@ -278,7 +314,7 @@ def test_model_size(mode): assert model.pre_calculated_model_size == summary.model_size -@pytest.mark.parametrize('mode', [ModelSummary.MODE_FULL, ModelSummary.MODE_TOP]) +@pytest.mark.parametrize('mode', ["full", "top"]) def test_empty_model_size(mode): """ Test empty model size is zero. """ model = EmptyModule() @@ -302,3 +338,53 @@ def test_model_size_precision(tmpdir): trainer.fit(model) summary = model.summarize() assert model.pre_calculated_model_size == summary.model_size + + +@RunIf(min_torch="1.8") +def test_lazy_model_summary(): + """ Test that the model summary can work with lazy layers. """ + lazy_model = LazyModel() + summary = ModelSummary(lazy_model) + + with pytest.warns( + UserWarning, + match=r"A layer with UninitializedParameter was found. " + r"Thus, the total number of parameters detected may be inaccurate." + ): + if _TORCH_GREATER_EQUAL_1_9: + assert summary.total_parameters == 0 + assert summary.trainable_parameters == 0 + else: + # bug in 1.8: the bias of a LazyLinear layer is initialized! 
+ # https://github.com/pytorch/pytorch/issues/58350 + assert summary.total_parameters == 7 + assert summary.trainable_parameters == 7 + + +def test_max_depth_equals_mode_interface(): + """Test model.summarize(full/top) interface mapping matches max_depth""" + model = DeepNestedModel() + + summary_top = model.summarize(mode="top") + summary_0 = model.summarize(max_depth=1) + assert str(summary_top) == str(summary_0) + + summary_full = model.summarize(mode="full") + summary_minus1 = model.summarize(max_depth=-1) + assert str(summary_full) == str(summary_minus1) + + +@pytest.mark.parametrize('max_depth', [-1, 0, 1, 3, 999]) +def test_max_depth_param(max_depth): + """Test that only the modules up to the desired depth are shown""" + model = DeepNestedModel() + summary = ModelSummary(model, max_depth=max_depth) + for lname in summary.layer_names: + if max_depth >= 0: + assert lname.count(".") < max_depth + + +@pytest.mark.parametrize('max_depth', [-99, -2, "invalid"]) +def test_raise_invalid_max_depth_value(max_depth): + with pytest.raises(ValueError, match=f"`max_depth` can be -1, 0 or > 0, got {max_depth}"): + DeepNestedModel().summarize(max_depth=max_depth) diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 734b9e7f56152..7471914886a27 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -11,14 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pickle +from copy import deepcopy +import pytest import torch import torch.distributed as dist import torch.multiprocessing as mp from torchmetrics import Metric import tests.helpers.utils as tutils -from pytorch_lightning.core.step_result import Result +from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.trainer.connectors.logger_connector.result import _Sync, MetricSource, ResultCollection +from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -52,12 +58,14 @@ def _ddp_test_fn(rank, worldsize): metric_b = DummyMetric() metric_c = DummyMetric() - # dist_sync_on_step is False by default - result = Result() + metric_a = metric_a.to(f"cuda:{rank}") + metric_b = metric_b.to(f"cuda:{rank}") + metric_c = metric_c.to(f"cuda:{rank}") - for epoch in range(3): - cumulative_sum = 0 + result = ResultCollection(True, torch.device(f"cuda:{rank}")) + for _ in range(3): + cumulative_sum = 0 for i in range(5): metric_a(i) metric_b(i) @@ -65,32 +73,25 @@ def _ddp_test_fn(rank, worldsize): cumulative_sum += i - result.log('a', metric_a, on_step=True, on_epoch=True) - result.log('b', metric_b, on_step=False, on_epoch=True) - result.log('c', metric_c, on_step=True, on_epoch=False) + result.log('h', 'a', metric_a, on_step=True, on_epoch=True) + result.log('h', 'b', metric_b, on_step=False, on_epoch=True) + result.log('h', 'c', metric_c, on_step=True, on_epoch=False) - batch_log = result.get_batch_log_metrics() - batch_expected = {"a_step": i, "a": i, "c": i} - assert set(batch_log.keys()) == set(batch_expected.keys()) - for k in batch_expected.keys(): - assert batch_expected[k] == batch_log[k] + batch_log = result.metrics(True)[MetricSource.LOG] + assert batch_log == {"a_step": i, "c": i} - epoch_log = result.get_epoch_log_metrics() + epoch_log = result.metrics(False)[MetricSource.LOG] result.reset() # assert metric state 
reset to default values - assert metric_a.x == metric_a._defaults['x'] + assert metric_a.x == metric_a._defaults['x'], (metric_a.x, metric_a._defaults['x']) assert metric_b.x == metric_b._defaults['x'] assert metric_c.x == metric_c._defaults['x'] - epoch_expected = {"b": cumulative_sum * worldsize, "a_epoch": cumulative_sum * worldsize} + assert epoch_log == {"b": cumulative_sum * worldsize, "a_epoch": cumulative_sum * worldsize} - assert set(epoch_log.keys()) == set(epoch_expected.keys()) - for k in epoch_expected.keys(): - assert epoch_expected[k] == epoch_log[k] - -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, min_gpus=2) def test_result_reduce_ddp(): """Make sure result logging works with DDP""" tutils.set_random_master_port() @@ -104,11 +105,10 @@ def test_result_metric_integration(): metric_b = DummyMetric() metric_c = DummyMetric() - result = Result() + result = ResultCollection(True, torch.device("cpu")) - for epoch in range(3): + for _ in range(3): cumulative_sum = 0 - for i in range(5): metric_a(i) metric_b(i) @@ -116,17 +116,14 @@ def test_result_metric_integration(): cumulative_sum += i - result.log('a', metric_a, on_step=True, on_epoch=True) - result.log('b', metric_b, on_step=False, on_epoch=True) - result.log('c', metric_c, on_step=True, on_epoch=False) + result.log('h', 'a', metric_a, on_step=True, on_epoch=True) + result.log('h', 'b', metric_b, on_step=False, on_epoch=True) + result.log('h', 'c', metric_c, on_step=True, on_epoch=False) - batch_log = result.get_batch_log_metrics() - batch_expected = {"a_step": i, "a": i, "c": i} - assert set(batch_log.keys()) == set(batch_expected.keys()) - for k in batch_expected.keys(): - assert batch_expected[k] == batch_log[k] + batch_log = result.metrics(True)[MetricSource.LOG] + assert batch_log == {"a_step": i, "c": i} - epoch_log = result.get_epoch_log_metrics() + epoch_log = result.metrics(False)[MetricSource.LOG] result.reset() # assert metric state reset to default values @@ -134,8 +131,211 @@ def test_result_metric_integration(): assert metric_b.x == metric_b._defaults['x'] assert metric_c.x == metric_c._defaults['x'] - epoch_expected = {"b": cumulative_sum, "a_epoch": cumulative_sum} + assert epoch_log == {"b": cumulative_sum, "a_epoch": cumulative_sum} + + assert str(result) == ( + "ResultCollection(True, cpu, {" + "'h.a': ResultMetric('a', value=DummyMetric()), " + "'h.b': ResultMetric('b', value=DummyMetric()), " + "'h.c': ResultMetric('c', value=DummyMetric())" + "})" + ) + + +def test_result_collection_simple_loop(): + result = ResultCollection(True, torch.device("cpu")) + current_fx_name = None + batch_idx = None + + def lightning_log(fx, *args, **kwargs): + nonlocal current_fx_name + if current_fx_name != fx and batch_idx in (None, 0): + result.reset(metrics=False, fx=fx) + result.log(fx, *args, **kwargs) + current_fx_name = fx + + lightning_log('a0', 'a', torch.tensor(0.), on_step=True, on_epoch=True) + lightning_log('a1', 'a', torch.tensor(0.), on_step=True, on_epoch=True) + for epoch in range(2): + lightning_log('b0', 'a', torch.tensor(1.) + epoch, on_step=True, on_epoch=True) + lightning_log('b1', 'a', torch.tensor(1.) + epoch, on_step=True, on_epoch=True) + for batch_idx in range(2): + lightning_log('c0', 'a', torch.tensor(2.) + epoch, on_step=True, on_epoch=True) + lightning_log('c1', 'a', torch.tensor(2.) + epoch, on_step=True, on_epoch=True) + lightning_log('c2', 'a', torch.tensor(2.) + epoch, on_step=True, on_epoch=True) + batch_idx = None + lightning_log('d0', 'a', torch.tensor(3.) 
+ epoch, on_step=False, on_epoch=True)
+        lightning_log('d1', 'a', torch.tensor(3.) + epoch, on_step=False, on_epoch=True)
+
+        for k in ('a0.a', 'a1.a'):
+            assert result[k].value == torch.tensor(0.), k
+            assert result[k].cumulated_batch_size == torch.tensor(1.), k
+
+        for k in ('b0.a', 'b1.a'):
+            assert result[k].value == torch.tensor(1.) + epoch, k
+            assert result[k].cumulated_batch_size == torch.tensor(1.), k
+
+        for k in ('c0.a', 'c1.a', 'c2.a'):
+            assert result[k].value == torch.tensor(4.) + epoch * 2, k
+            assert result[k].cumulated_batch_size == torch.tensor(2.), k
+
+        for k in ('d0.a', 'd1.a'):
+            assert result[k].value == torch.tensor(3.) + epoch, k
+            assert result[k].cumulated_batch_size == torch.tensor(1.), k
+
+
+def my_sync_dist(x, *_, **__):
+    return x
+
+
+def test_result_collection_restoration(tmpdir):
+    """
+    This test makes sure metrics are properly reloaded on failure.
+    """
+
+    result = ResultCollection(True, torch.device("cpu"))
+    metric_a = DummyMetric()
+    metric_b = DummyMetric()
+    metric_c = DummyMetric()
+    metric_d = DummyMetric()
+    current_fx_name = None
+    batch_idx = None
+
+    def lightning_log(fx, *args, **kwargs):
+        nonlocal current_fx_name
+        if current_fx_name != fx and batch_idx in (None, 0):
+            result.reset(metrics=False, fx=fx)
+        result.log(fx, *args, **kwargs, sync_dist_fn=my_sync_dist)
+        current_fx_name = fx
+
+    for epoch in range(2):
+
+        cumulative_sum = 0
+
+        for i in range(3):
+
+            a = metric_a(i)
+            b = metric_b(i)
+            c = metric_c(i)
+            metric_d(i)
+
+            cumulative_sum += i
+
+            metric = metric_a if i < 1 else metric_d
+            lightning_log('training_step', 'a', metric, on_step=True, on_epoch=True, metric_attribute="metric")
+            lightning_log('training_step', 'b', metric_b, on_step=False, on_epoch=True, metric_attribute="metric_b")
+            lightning_log('training_step', 'c', metric_c, on_step=True, on_epoch=False, metric_attribute="metric_c")
+            lightning_log('training_step', 'a_1', a, on_step=True, on_epoch=True)
+            lightning_log('training_step', 'b_1', b, on_step=False, on_epoch=True)
+            lightning_log('training_step', 'c_1', {'1': c, '2': c}, on_step=True, on_epoch=False)
+
+            batch_log = result.metrics(on_step=True)[MetricSource.LOG]
+            assert set(batch_log) == {"a_step", "c", "a_1_step", "c_1"}
+            assert set(batch_log['c_1']) == {'1', '2'}
+
+            result_copy = deepcopy(result)
+            new_result = ResultCollection(True, torch.device("cpu"))
+            state_dict = result.state_dict()
+            # check the sync fn was dropped
+            assert 'fn' not in state_dict['items']['training_step.a']['meta']['_sync']
+
+            assert not new_result.result_metrics
+            assert len(result.result_metrics) == 7 + epoch > 0
+
+            new_result.load_state_dict(
+                state_dict, metrics={
+                    "metric": metric,
+                    "metric_b": metric_b,
+                    "metric_c": metric_c
+                }
+            )
+            # should match
+            assert result_copy == new_result
+            # the sync fn has been kept
+            assert result_copy['training_step.a'].meta.sync.fn == new_result['training_step.a'].meta.sync.fn
+
+        epoch_log = result.metrics(on_step=False)[MetricSource.LOG]
+        epoch_log_copy = result_copy.metrics(on_step=False)[MetricSource.LOG]
+        assert epoch_log == epoch_log_copy
+
+        lightning_log('train_epoch_end', 'a', metric_a, on_step=False, on_epoch=True)
+        epoch_log = result.metrics(on_step=False)[MetricSource.LOG]
+        assert epoch_log == {
+            'a_1_epoch': 1,
+            'a_epoch': cumulative_sum,
+            'a': cumulative_sum,
+            'b': cumulative_sum,
+            'b_1': 1
+        }
+
+        # make sure can be pickled
+        pickle.loads(pickle.dumps(result))
+        # make sure can be torch.loaded
+        filepath = str(tmpdir / 'result')
+        torch.save(result, filepath)
+ torch.load(filepath) + + # assert metric state reset to default values + result.reset() + assert metric_a.x == metric_a._defaults['x'] + assert metric_b.x == metric_b._defaults['x'] + assert metric_c.x == metric_c._defaults['x'] + + batch_idx = None + + +@pytest.mark.parametrize('device', ('cpu', pytest.param('cuda', marks=RunIf(min_gpus=1)))) +def test_lightning_module_logging_result_collection(tmpdir, device): + + class LoggingModel(BoringModel): + + def __init__(self): + super().__init__() + self.metric = DummyMetric() + + def validation_step(self, batch, batch_idx): + v = self.metric(batch_idx) + self.log_dict({"v": v, "m": self.metric}) + return super().validation_step(batch, batch_idx) + + def on_save_checkpoint(self, checkpoint) -> None: + results = self.trainer._results + # simplify logic + state_dict = results.state_dict(drop_value=False) + + # check device + assert results['validation_step.v'].value.device.type == device + assert state_dict['items']['validation_step.v']['value'].device.type == device + + # sync fn should be kept + assert results['validation_step.v'].meta.sync.fn == self.trainer.training_type_plugin.reduce + + # sync fn dropped from the state dict + assert 'fn' not in state_dict['items']['validation_step.v']['meta']['_sync'] + results.load_state_dict(state_dict) + + # check device after loading + assert results['validation_step.v'].value.device.type == device + + # sync fn was preserved in the original result + assert results['validation_step.v'].meta.sync.fn == self.trainer.training_type_plugin.reduce + + # default sync fn + new_results = ResultCollection(False, device) + new_results.load_state_dict(state_dict, map_location='cpu') + assert new_results['validation_step.v'].meta.sync.fn == _Sync.no_op + + # check map location + assert new_results['validation_step.v'].value.device.type == 'cpu' - assert set(epoch_log.keys()) == set(epoch_expected.keys()) - for k in epoch_expected.keys(): - assert epoch_expected[k] == epoch_log[k] + model = LoggingModel() + ckpt = ModelCheckpoint(dirpath=tmpdir, save_last=True) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=2, + limit_val_batches=2, + callbacks=[ckpt], + gpus=1 if device == 'cuda' else 0, + ) + trainer.fit(model) diff --git a/tests/core/test_results.py b/tests/core/test_results.py index 02d30d9f79ee3..e2e3c892cc124 100644 --- a/tests/core/test_results.py +++ b/tests/core/test_results.py @@ -18,11 +18,11 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp -from torch.utils.data import DataLoader import tests.helpers.utils as tutils -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.core.step_result import Result +from pytorch_lightning import Trainer +from pytorch_lightning.trainer.connectors.logger_connector.result import _Sync +from pytorch_lightning.utilities.distributed import sync_ddp_if_available from tests.helpers import BoringDataModule, BoringModel from tests.helpers.runif import RunIf @@ -39,7 +39,8 @@ def _setup_ddp(rank, worldsize): def _ddp_test_fn(rank, worldsize): _setup_ddp(rank, worldsize) tensor = torch.tensor([1.0]) - actual = LightningModule._LightningModule__sync(tensor, sync_dist=True, sync_dist_op=torch.distributed.ReduceOp.SUM) + sync = _Sync(sync_ddp_if_available, should=True, op='SUM') + actual = sync(tensor) assert actual.item() == dist.get_world_size(), "Result-Log does not work properly with DDP and Tensors" @@ -51,23 +52,21 @@ def test_result_reduce_ddp(): mp.spawn(_ddp_test_fn, 
args=(worldsize, ), nprocs=worldsize) -@pytest.mark.parametrize( - "test_option,do_train,gpus", [ - pytest.param(0, True, 0, id='full_loop'), - pytest.param(0, False, 0, id='test_only'), - pytest.param( - 1, False, 0, id='test_only_mismatching_tensor', marks=pytest.mark.xfail(raises=ValueError, match="Mism.*") - ), - pytest.param(2, False, 0, id='mix_of_tensor_dims'), - pytest.param(3, False, 0, id='string_list_predictions'), - pytest.param(4, False, 0, id='int_list_predictions'), - pytest.param(5, False, 0, id='nested_list_predictions'), - pytest.param(6, False, 0, id='dict_list_predictions'), - pytest.param(7, True, 0, id='write_dict_predictions'), - pytest.param(0, True, 1, id='full_loop_single_gpu', marks=RunIf(min_gpus=1)) - ] -) -def test_result_obj_predictions(tmpdir, test_option: int, do_train: bool, gpus: int): +@pytest.mark.parametrize(["option", "do_train", "gpus"], [ + pytest.param(0, True, 0, id='full_loop'), + pytest.param(0, False, 0, id='test_only'), + pytest.param( + 1, False, 0, id='test_only_mismatching_tensor', marks=pytest.mark.xfail(raises=ValueError, match="Mism.*") + ), + pytest.param(2, False, 0, id='mix_of_tensor_dims'), + pytest.param(3, False, 0, id='string_list_predictions'), + pytest.param(4, False, 0, id='int_list_predictions'), + pytest.param(5, False, 0, id='nested_list_predictions'), + pytest.param(6, False, 0, id='dict_list_predictions'), + pytest.param(7, True, 0, id='write_dict_predictions'), + pytest.param(0, True, 1, id='full_loop_single_gpu', marks=RunIf(min_gpus=1)) +]) +def test_write_predictions(tmpdir, option: int, do_train: bool, gpus: int): class CustomBoringModel(BoringModel): @@ -82,8 +81,6 @@ def test_step(self, batch, batch_idx, optimizer_idx=None): lst_of_lst = [[x] for x in lst_of_int] lst_of_dict = [{k: v} for k, v in zip(lst_of_str, lst_of_int)] - # This is passed in from pytest via parameterization - option = getattr(self, 'test_option', 0) prediction_file = getattr(self, 'prediction_file', 'predictions.pt') lazy_ids = torch.arange(batch_idx * batch_size, batch_idx * batch_size + batch_size) @@ -127,32 +124,13 @@ def test_step(self, batch, batch_idx, optimizer_idx=None): elif option == 7: self.write_prediction_dict({'idxs': lazy_ids, 'preds': output}, prediction_file) - class CustomBoringDataModule(BoringDataModule): - - def train_dataloader(self): - return DataLoader(self.random_train, batch_size=4) - - def val_dataloader(self): - return DataLoader(self.random_val, batch_size=4) - - def test_dataloader(self): - return DataLoader(self.random_test, batch_size=4) - - tutils.reset_seed() prediction_file = Path(tmpdir) / 'predictions.pt' dm = BoringDataModule() model = CustomBoringModel() - model.test_step_end = None model.test_epoch_end = None - model.test_end = None - - model.test_option = test_option model.prediction_file = prediction_file.as_posix() - if prediction_file.exists(): - prediction_file.unlink() - trainer = Trainer( default_root_dir=tmpdir, max_epochs=3, @@ -175,11 +153,3 @@ def test_dataloader(self): assert prediction_file.exists() predictions = torch.load(prediction_file) assert len(predictions) == len(dm.random_test) - - -def test_result_retrieve_last_logged_item(): - result = Result() - result.log('a', 5., on_step=True, on_epoch=True) - assert result['a_epoch'] == 5. - assert result['a_step'] == 5. - assert result['a'] == 5. 
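As context for the `_Sync(sync_ddp_if_available, should=True, op='SUM')` usage in the reduction test above: the wrapper boils down to a callable that applies a reduction function only when syncing is enabled, and passes the tensor through unchanged otherwise. Below is a rough standalone sketch of that pattern; `SyncSketch` and its fields are illustrative names under that assumption, not the library's actual definition.

    from typing import Any, Callable, Optional

    import torch

    class SyncSketch:

        def __init__(self, fn: Optional[Callable] = None, should: bool = False, op: str = 'SUM'):
            self.fn = fn          # reduction fn, e.g. an all-reduce helper
            self.should = should  # whether syncing is enabled at all
            self.op = op          # reduction op name forwarded to `fn`

        def __call__(self, tensor: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
            # Outside a distributed run (or with `should=False`) the value
            # passes through unchanged; otherwise delegate to the reduction fn.
            if not self.should or self.fn is None:
                return tensor
            return self.fn(tensor, *args, reduce_op=self.op, **kwargs)

    # No-op path: behaves like the identity on a single process.
    assert SyncSketch()(torch.tensor([1.0])).item() == 1.0

Under DDP with a SUM op and world size N, the wrapped tensor `[1.0]` reduces to `N`, which is exactly what `_ddp_test_fn` asserts above.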
diff --git a/tests/deprecated_api/test_remove_1-4.py b/tests/deprecated_api/test_remove_1-4.py index 37d8abfdf905d..23df12586d328 100644 --- a/tests/deprecated_api/test_remove_1-4.py +++ b/tests/deprecated_api/test_remove_1-4.py @@ -66,3 +66,16 @@ def training_step(self, batch, batch_idx): with pytest.deprecated_call(match=r"Relying on.*is deprecated in v1.2 and will be removed in v1.4"): trainer.fit(TestModel()) + + +def test_v1_4_0_deprecated_hpc_load(tmpdir): + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=1, + ) + trainer.fit(model) + trainer.checkpoint_connector.hpc_save(tmpdir, trainer.logger) + checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(str(tmpdir)) + with pytest.deprecated_call(match=r"`CheckpointConnector.hpc_load\(\)` was deprecated in v1.4"): + trainer.checkpoint_connector.hpc_load(checkpoint_path) diff --git a/tests/deprecated_api/test_remove_1-5.py b/tests/deprecated_api/test_remove_1-5.py index d6c9b6d8f8f31..70bcc71d0a2a6 100644 --- a/tests/deprecated_api/test_remove_1-5.py +++ b/tests/deprecated_api/test_remove_1-5.py @@ -25,12 +25,14 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.decorators import auto_move_data from pytorch_lightning.loggers import WandbLogger +from pytorch_lightning.plugins import DeepSpeedPlugin from pytorch_lightning.profiler import AdvancedProfiler, BaseProfiler, PyTorchProfiler, SimpleProfiler from pytorch_lightning.trainer.callback_hook import warning_cache as callback_warning_cache from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.imports import _compare_version from tests.deprecated_api import no_deprecated_call from tests.helpers import BoringDataModule, BoringModel +from tests.helpers.runif import RunIf from tests.helpers.utils import no_warning_call @@ -242,7 +244,7 @@ def on_train_epoch_end(self, outputs): # noqa with pytest.deprecated_call(match="old signature will be removed in v1.5"): trainer.fit(model) - trainer.train_loop.warning_cache.clear() + trainer.fit_loop.epoch_loop._warning_cache.clear() class NewSignature(Callback): @@ -367,10 +369,24 @@ def test_v1_5_0_datamodule_setter(): datamodule = BoringDataModule() with no_deprecated_call(match="The `LightningModule.datamodule`"): model.datamodule = datamodule - with pytest.deprecated_call(match="The `LightningModule.datamodule`"): - _ = model.datamodule + from pytorch_lightning.core.lightning import warning_cache + warning_cache.clear() + _ = model.datamodule + assert any("The `LightningModule.datamodule`" in w for w in warning_cache) def test_v1_5_0_trainer_tbptt_steps(tmpdir): with pytest.deprecated_call(match="is deprecated in v1.3 and will be removed in v1.5"): _ = Trainer(truncated_bptt_steps=1) + + +@RunIf(deepspeed=True) +@pytest.mark.parametrize( + "params", [dict(cpu_offload=True), + dict(cpu_offload_params=True), + dict(cpu_offload_use_pin_memory=True)] +) +def test_v1_5_0_deepspeed_cpu_offload(tmpdir, params): + + with pytest.deprecated_call(match="is deprecated since v1.4 and will be removed in v1.5"): + DeepSpeedPlugin(**params) diff --git a/tests/deprecated_api/test_remove_1-6.py b/tests/deprecated_api/test_remove_1-6.py index 63b1c60fe7c62..ba033a0ebeced 100644 --- a/tests/deprecated_api/test_remove_1-6.py +++ b/tests/deprecated_api/test_remove_1-6.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Test deprecated functionality which will be removed in v1.6.0 """ - import pytest from pytorch_lightning import Trainer +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.plugins.training_type import DDPPlugin, DDPSpawnPlugin -from tests.helpers import BoringModel +from pytorch_lightning.utilities.distributed import rank_zero_deprecation, rank_zero_warn +from pytorch_lightning.utilities.model_helpers import is_overridden +from tests.helpers import BoringDataModule, BoringModel def test_v1_6_0_trainer_model_hook_mixin(tmpdir): @@ -31,6 +34,28 @@ def test_v1_6_0_trainer_model_hook_mixin(tmpdir): trainer.has_arg("training_step", "batch") +def test_v1_6_0_dataloader_renaming(tmpdir): + model = BoringModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + dl = model.train_dataloader() + + with pytest.deprecated_call(match=r"fit\(train_dataloader\)` is deprecated in v1.4"): + trainer.fit(model, train_dataloader=dl) + + with pytest.deprecated_call(match=r"validate\(val_dataloaders\)` is deprecated in v1.4"): + trainer.validate(model, val_dataloaders=dl) + + with pytest.deprecated_call(match=r"test\(test_dataloaders\)` is deprecated in v1.4"): + trainer.test(model, test_dataloaders=dl) + + with pytest.deprecated_call(match=r"tune\(train_dataloader\)` is deprecated in v1.4"): + trainer.tune(model, train_dataloader=dl) + with pytest.deprecated_call(match=r"tune\(train_dataloader\)` is deprecated in v1.4"): + trainer.tuner.scale_batch_size(model, train_dataloader=dl) + with pytest.deprecated_call(match=r"tune\(train_dataloader\)` is deprecated in v1.4"): + trainer.tuner.lr_find(model, train_dataloader=dl) + + def test_old_transfer_batch_to_device_hook(tmpdir): class OldModel(BoringModel): @@ -54,12 +79,12 @@ def test_v1_6_0_ddp_sync_batchnorm(): def test_v1_6_0_ddp_spawn_num_nodes(): - with pytest.deprecated_call(match="Argument `num_nodes` in `DDPPlugin` is deprecated in v1.4"): + with pytest.deprecated_call(match="Argument `num_nodes` in `DDPSpawnPlugin` is deprecated in v1.4"): DDPSpawnPlugin(num_nodes=1) def test_v1_6_0_ddp_spawn_sync_batchnorm(): - with pytest.deprecated_call(match="Argument `sync_batchnorm` in `DDPPlugin` is deprecated in v1.4"): + with pytest.deprecated_call(match="Argument `sync_batchnorm` in `DDPSpawnPlugin` is deprecated in v1.4"): DDPSpawnPlugin(sync_batchnorm=False) @@ -87,3 +112,166 @@ def training_step(self, *args): trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) with pytest.deprecated_call(match=r"tbptt_pad_token=...\)` is no longer supported"): trainer.fit(TestModel()) + + +def test_v1_6_0_sync_dist_op(tmpdir): + + class TestModel(BoringModel): + + def training_step(self, *args): + self.log("foo", 1, sync_dist_op='sum') + return super().training_step(*args) + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + with pytest.deprecated_call(match=r"`self.log\(sync_dist_op='sum'\)` is deprecated"): + trainer.fit(TestModel()) + + +def test_v1_6_0_datamodule_lifecycle_properties(tmpdir): + dm = BoringDataModule() + with pytest.deprecated_call(match=r"DataModule property `has_prepared_data` was deprecated in v1.4"): + dm.has_prepared_data + with pytest.deprecated_call(match=r"DataModule property `has_setup_fit` was deprecated in v1.4"): + dm.has_setup_fit + with pytest.deprecated_call(match=r"DataModule property `has_setup_validate` was deprecated in v1.4"): + dm.has_setup_validate + with pytest.deprecated_call(match=r"DataModule 
property `has_setup_test` was deprecated in v1.4"): + dm.has_setup_test + with pytest.deprecated_call(match=r"DataModule property `has_setup_predict` was deprecated in v1.4"): + dm.has_setup_predict + with pytest.deprecated_call(match=r"DataModule property `has_teardown_fit` was deprecated in v1.4"): + dm.has_teardown_fit + with pytest.deprecated_call(match=r"DataModule property `has_teardown_validate` was deprecated in v1.4"): + dm.has_teardown_validate + with pytest.deprecated_call(match=r"DataModule property `has_teardown_test` was deprecated in v1.4"): + dm.has_teardown_test + with pytest.deprecated_call(match=r"DataModule property `has_teardown_predict` was deprecated in v1.4"): + dm.has_teardown_predict + + +def test_v1_6_0_datamodule_hooks_calls(tmpdir): + """Test that repeated calls to DataHooks' hooks show a warning about the coming API change.""" + + class TestDataModule(BoringDataModule): + setup_calls = [] + teardown_calls = [] + prepare_data_calls = 0 + + def setup(self, stage=None): + super().setup(stage=stage) + self.setup_calls.append(stage) + + def teardown(self, stage=None): + super().teardown(stage=stage) + self.teardown_calls.append(stage) + + def prepare_data(self): + super().prepare_data() + self.prepare_data_calls += 1 + + dm = TestDataModule() + dm.prepare_data() + dm.prepare_data() + dm.setup('fit') + with pytest.deprecated_call( + match=r"DataModule.setup has already been called, so it will not be called again. " + "In v1.6 this behavior will change to always call DataModule.setup" + ): + dm.setup('fit') + dm.setup() + dm.setup() + dm.teardown('validate') + with pytest.deprecated_call( + match=r"DataModule.teardown has already been called, so it will not be called again. " + "In v1.6 this behavior will change to always call DataModule.teardown" + ): + dm.teardown('validate') + + assert dm.prepare_data_calls == 1 + assert dm.setup_calls == ['fit', None] + assert dm.teardown_calls == ['validate'] + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + trainer.test(BoringModel(), datamodule=dm) + + # same number of calls + assert dm.prepare_data_calls == 1 + assert dm.setup_calls == ['fit', None] + assert dm.teardown_calls == ['validate', 'test'] + + +def test_v1_6_0_is_overridden_model(): + model = BoringModel() + with pytest.deprecated_call(match="and will be removed in v1.6"): + assert is_overridden("validation_step", model=model) + with pytest.deprecated_call(match="and will be removed in v1.6"): + assert not is_overridden("foo", model=model) + + +def test_v1_6_0_early_stopping_monitor(tmpdir): + with pytest.deprecated_call( + match=r"The `EarlyStopping\(monitor\)` argument will be required starting in v1.6." + " For backward compatibility, setting this to `early_stop_on`." + ): + EarlyStopping() + + +def test_v1_6_0_extras_with_gradients(tmpdir): + + class TestModel(BoringModel): + + def training_step(self, *args): + loss = super().training_step(*args)['loss'] + return {"loss": loss, 'foo': loss} + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + model = TestModel() + match = r"\{'foo'\} has a `grad_fn`.*behaviour will change in v1\.6" + with pytest.deprecated_call(match=match): + trainer.fit(model) + + +def test_v1_6_0_train_loop(tmpdir): + trainer = Trainer() + with pytest.deprecated_call( + match=r"`Trainer.train_loop` has been renamed to `Trainer.fit_loop` and will be removed in v1.6." 
+ ): + _ = trainer.train_loop + + +def test_v1_6_0_rank_zero_warnings_moved(): + with pytest.deprecated_call(match='in v1.3.7 and will be removed in v1.6'): + rank_zero_warn('test') + with pytest.deprecated_call(match='in v1.3.7 and will be removed in v1.6'): + rank_zero_deprecation('test') + + +def test_v1_6_0_ddp_plugin_task_idx(): + plugin = DDPPlugin() + with pytest.deprecated_call(match='Use `DDPPlugin.local_rank` instead'): + _ = plugin.task_idx + + +def test_v1_6_0_lightning_module_loaded_optimizer_states_dict(): + from pytorch_lightning.core.lightning import warning_cache + model = BoringModel() + _ = model.loaded_optimizer_states_dict + assert any( + "The `LightningModule.loaded_optimizer_states_dict` property is deprecated in v1.4" in w for w in warning_cache + ) + warning_cache.clear() + + model.loaded_optimizer_states_dict = {} + assert any( + "The `LightningModule.loaded_optimizer_states_dict` property is deprecated in v1.4" in w for w in warning_cache + ) + warning_cache.clear() + + +def test_v1_6_0_deprecated_model_summary_mode(tmpdir): + model = BoringModel() + with pytest.deprecated_call(match="Argument `mode` in `ModelSummary` is deprecated in v1.4"): + ModelSummary(model, mode="top") + + with pytest.deprecated_call(match="Argument `mode` in `LightningModule.summarize` is deprecated in v1.4"): + model.summarize(mode="top") diff --git a/tests/helpers/advanced_models.py b/tests/helpers/advanced_models.py index 2b0146e1ee099..8f3b9663aa2d7 100644 --- a/tests/helpers/advanced_models.py +++ b/tests/helpers/advanced_models.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from collections import OrderedDict import numpy as np import torch @@ -122,13 +121,8 @@ def training_step(self, batch, batch_idx, optimizer_idx=None): # adversarial loss is binary cross-entropy g_loss = self.adversarial_loss(self.discriminator(self.generated_imgs), valid) - tqdm_dict = {'g_loss': g_loss} - output = OrderedDict({ - 'loss': g_loss, - 'progress_bar': tqdm_dict, - 'log': tqdm_dict, - }) - return output + self.log('g_loss', g_loss, prog_bar=True, logger=True) + return g_loss # train discriminator if optimizer_idx == 1: @@ -148,13 +142,8 @@ def training_step(self, batch, batch_idx, optimizer_idx=None): # discriminator loss is the average of these d_loss = (real_loss + fake_loss) / 2 - tqdm_dict = {'d_loss': d_loss} - output = OrderedDict({ - 'loss': d_loss, - 'progress_bar': tqdm_dict, - 'log': tqdm_dict, - }) - return output + self.log('d_loss', d_loss, prog_bar=True, logger=True) + return d_loss def configure_optimizers(self): lr = self.learning_rate diff --git a/tests/helpers/boring_model.py b/tests/helpers/boring_model.py index eb81baeb2c29d..185baac51f41f 100644 --- a/tests/helpers/boring_model.py +++ b/tests/helpers/boring_model.py @@ -34,19 +34,6 @@ def __len__(self): return self.len -class RandomDictStringDataset(Dataset): - - def __init__(self, size, length): - self.len = length - self.data = torch.randn(length, size) - - def __getitem__(self, index): - return {"id": str(index), "x": self.data[index]} - - def __len__(self): - return self.len - - class RandomDataset(Dataset): def __init__(self, size, length): @@ -174,7 +161,7 @@ def __init__(self, data_dir: str = './'): self.checkpoint_state: Optional[str] = None def prepare_data(self): - self.random_full = RandomDataset(32, 192) + self.random_full = RandomDataset(32, 64 * 4) def setup(self, 
stage: Optional[str] = None): if stage == "fit" or stage is None: @@ -182,12 +169,16 @@ def setup(self, stage: Optional[str] = None): self.dims = self.random_train[0].shape if stage in ("fit", "validate") or stage is None: - self.random_val = Subset(self.random_full, indices=range(64, 128)) + self.random_val = Subset(self.random_full, indices=range(64, 64 * 2)) if stage == "test" or stage is None: - self.random_test = Subset(self.random_full, indices=range(128, 192)) + self.random_test = Subset(self.random_full, indices=range(64 * 2, 64 * 3)) self.dims = getattr(self, "dims", self.random_test[0].shape) + if stage == "predict" or stage is None: + self.random_predict = Subset(self.random_full, indices=range(64 * 3, 64 * 4)) + self.dims = getattr(self, "dims", self.random_predict[0].shape) + def train_dataloader(self): return DataLoader(self.random_train) @@ -196,3 +187,6 @@ def val_dataloader(self): def test_dataloader(self): return DataLoader(self.random_test) + + def predict_dataloader(self): + return DataLoader(self.random_predict) diff --git a/tests/helpers/datasets.py b/tests/helpers/datasets.py index 77035796ca3b1..9fadd947ac9c9 100644 --- a/tests/helpers/datasets.py +++ b/tests/helpers/datasets.py @@ -105,7 +105,7 @@ def prepare_data(self, download: bool = True): raise RuntimeError('Dataset not found.') def _download(self, data_folder: str) -> None: - os.makedirs(data_folder) + os.makedirs(data_folder, exist_ok=True) for url in self.RESOURCES: logging.info(f'Downloading {url}') fpath = os.path.join(data_folder, os.path.basename(url)) diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py index f7a6484f6b27e..961b5cf080396 100644 --- a/tests/helpers/pipelines.py +++ b/tests/helpers/pipelines.py @@ -91,11 +91,12 @@ def run_model_test( trainer.checkpoint_connector.hpc_save(save_dir, logger) # test HPC loading checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(save_dir) - trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu) + trainer.checkpoint_connector.restore(checkpoint_path) @torch.no_grad() def run_prediction_eval_model_template(trained_model, dataloader, min_acc=0.50): + orig_device = trained_model.device # run prediction on 1 batch trained_model.cpu() trained_model.eval() @@ -108,3 +109,4 @@ def run_prediction_eval_model_template(trained_model, dataloader, min_acc=0.50): acc = accuracy(y_hat.cpu(), y.cpu(), top_k=2).item() assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})" + trained_model.to(orig_device) diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 630a341ec2d30..e4a1d20f72872 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -25,10 +25,9 @@ _DEEPSPEED_AVAILABLE, _FAIRSCALE_AVAILABLE, _FAIRSCALE_FULLY_SHARDED_AVAILABLE, - _FAIRSCALE_PIPE_AVAILABLE, _HOROVOD_AVAILABLE, + _IPU_AVAILABLE, _NATIVE_AMP_AVAILABLE, - _RPC_AVAILABLE, _TORCH_QUANTIZE_AVAILABLE, _TPU_AVAILABLE, ) @@ -63,13 +62,12 @@ def __new__( amp_apex: bool = False, amp_native: bool = False, tpu: bool = False, + ipu: bool = False, horovod: bool = False, horovod_nccl: bool = False, skip_windows: bool = False, special: bool = False, - rpc: bool = False, fairscale: bool = False, - fairscale_pipe: bool = False, fairscale_fully_sharded: bool = False, deepspeed: bool = False, **kwargs @@ -85,13 +83,12 @@ def __new__( amp_apex: NVIDIA Apex is installed amp_native: if native PyTorch native AMP is supported tpu: if TPU is available + ipu: if IPU is available horovod: if Horovod is installed 
horovod_nccl: if Horovod is installed with NCCL support skip_windows: skip test for Windows platform (typically fo some limited torch functionality) special: running in special mode, outside pytest suit - rpc: requires Remote Procedure Call (RPC) fairscale: if `fairscale` module is required to run the test - fairscale_pipe: if `fairscale` with pipe module is required to run the test fairscale_fully_sharded: if `fairscale` fully sharded module is required to run the test deepspeed: if `deepspeed` module is required to run the test kwargs: native pytest.mark.skipif keyword arguments @@ -139,6 +136,10 @@ def __new__( conditions.append(not _TPU_AVAILABLE) reasons.append("TPU") + if ipu: + conditions.append(not _IPU_AVAILABLE) + reasons.append("IPU") + if horovod: conditions.append(not _HOROVOD_AVAILABLE) reasons.append("Horovod") @@ -152,18 +153,10 @@ def __new__( conditions.append(env_flag != '1') reasons.append("Special execution") - if rpc: - conditions.append(not _RPC_AVAILABLE) - reasons.append("RPC") - if fairscale: conditions.append(not _FAIRSCALE_AVAILABLE) reasons.append("Fairscale") - if fairscale_pipe: - conditions.append(not _FAIRSCALE_PIPE_AVAILABLE) - reasons.append("Fairscale Pipe") - if fairscale_fully_sharded: conditions.append(not _FAIRSCALE_FULLY_SHARDED_AVAILABLE) reasons.append("Fairscale Fully Sharded") diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index a18b180be1f49..1bad12b1f9a3d 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -24,6 +24,7 @@ from pytorch_lightning import Callback, Trainer from pytorch_lightning.loggers import ( CometLogger, + CSVLogger, MLFlowLogger, NeptuneLogger, TensorBoardLogger, @@ -233,6 +234,7 @@ def name(self): "logger_class", [ CometLogger, + CSVLogger, MLFlowLogger, NeptuneLogger, TensorBoardLogger, @@ -325,6 +327,7 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_ @pytest.mark.parametrize( "logger_class", [ CometLogger, + CSVLogger, MLFlowLogger, NeptuneLogger, TensorBoardLogger, diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index c20b6096585cd..9209083148265 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -59,6 +59,7 @@ def __init__(self): self.hparams_logged = None self.metrics_logged = {} self.finalized = False + self.after_save_checkpoint_called = False @property def experiment(self): @@ -92,6 +93,9 @@ def name(self): def version(self): return "1" + def after_save_checkpoint(self, checkpoint_callback): + self.after_save_checkpoint_called = True + def test_custom_logger(tmpdir): @@ -115,6 +119,7 @@ def training_step(self, batch, batch_idx): assert trainer.state.finished, f"Training failed with {trainer.state}" assert logger.hparams_logged == model.hparams assert logger.metrics_logged != {} + assert logger.after_save_checkpoint_called assert logger.finalized_status == "success" diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index f7fe1c3bfd47e..b8bafae8508e8 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
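# The `RunIf` changes above all follow one pattern: each requirement appends a boolean
# to `conditions` and a label to `reasons`, and `__new__` resolves to a plain
# `pytest.mark.skipif`. A stripped-down sketch of that pattern with only the new IPU
# check (`_IPU_AVAILABLE` is hardcoded here as a stand-in for the real import):
import pytest

_IPU_AVAILABLE = False  # assumption: normally imported from `pytorch_lightning.utilities.imports`


class RunIfSketch:

    def __new__(cls, *args, ipu: bool = False, **kwargs):
        conditions = []
        reasons = []
        if ipu:
            conditions.append(not _IPU_AVAILABLE)
            reasons.append("IPU")
        # report only the requirements that actually caused the skip
        reasons = [r for cond, r in zip(conditions, reasons) if cond]
        return pytest.mark.skipif(
            *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs
        )


@RunIfSketch(ipu=True)
def test_needs_ipu():
    ...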
+import logging
import os
from argparse import Namespace
from unittest import mock
@@ -275,7 +276,7 @@ def __init__(self):

    def training_step(self, *args):
        self.log('foo', 1, on_step=True, on_epoch=True)
-        if not self.trainer.train_loop.should_accumulate():
+        if not self.trainer.fit_loop.should_accumulate():
            if self.trainer.logger_connector.should_update_logs:
                self.indexes.append(self.trainer.global_step)
        return super().training_step(*args)
@@ -340,3 +341,15 @@ def test_tensorboard_with_symlink(log, tmpdir):
        _ = logger.version

    log.warning.assert_not_called()
+
+
+def test_tensorboard_missing_folder_warning(tmpdir, caplog):
+    """Verify that the logger warns when the log directory does not exist"""
+
+    name = "fake_dir"
+    logger = TensorBoardLogger(save_dir=tmpdir, name=name)
+
+    with caplog.at_level(logging.WARNING):
+        assert logger.version == 0
+
+    assert 'Missing logger folder:' in caplog.text
diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py
index 22be315eaabe2..27185b911b6d0 100644
--- a/tests/loggers/test_wandb.py
+++ b/tests/loggers/test_wandb.py
@@ -24,14 +24,8 @@ from tests.helpers import BoringModel


-def get_warnings(recwarn):
-    warnings_text = '\n'.join(str(w.message) for w in recwarn.list)
-    recwarn.clear()
-    return warnings_text
-
-
@mock.patch('pytorch_lightning.loggers.wandb.wandb')
-def test_wandb_logger_init(wandb, recwarn):
+def test_wandb_logger_init(wandb):
    """Verify that basic functionality of wandb logger works.
    Wandb doesn't work well with pytest so we have to mock it out here."""

@@ -51,8 +45,6 @@ def test_wandb_logger_init(wandb, recwarn):
    run = wandb.init()
    logger = WandbLogger(experiment=run)
    assert logger.experiment
-    assert run.dir is not None
-    assert logger.save_dir == run.dir

    # test wandb.init not called if there is a W&B run
    wandb.init().log.reset_mock()
@@ -140,10 +132,8 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir):

    # mock return values of experiment
    wandb.run = None
-    wandb.init().step = 0
    logger.experiment.id = '1'
    logger.experiment.project_name.return_value = 'project'
-    logger.experiment.step = 0

    for _ in range(2):
        _ = logger.experiment
@@ -164,6 +154,71 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir):
    assert trainer.log_dir == logger.save_dir


+@mock.patch('pytorch_lightning.loggers.wandb.wandb')
+def test_wandb_log_model(wandb, tmpdir):
+    """ Test that the logger uploads model checkpoints to W&B as artifacts. 
""" + + wandb.run = None + model = BoringModel() + + # test log_model=True + logger = WandbLogger(log_model=True) + logger.experiment.id = '1' + logger.experiment.project_name.return_value = 'project' + trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=2, limit_train_batches=3, limit_val_batches=3) + trainer.fit(model) + wandb.init().log_artifact.assert_called_once() + + # test log_model='all' + wandb.init().log_artifact.reset_mock() + wandb.init.reset_mock() + logger = WandbLogger(log_model='all') + logger.experiment.id = '1' + logger.experiment.project_name.return_value = 'project' + trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=2, limit_train_batches=3, limit_val_batches=3) + trainer.fit(model) + assert wandb.init().log_artifact.call_count == 2 + + # test log_model=False + wandb.init().log_artifact.reset_mock() + wandb.init.reset_mock() + logger = WandbLogger(log_model=False) + logger.experiment.id = '1' + logger.experiment.project_name.return_value = 'project' + trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=2, limit_train_batches=3, limit_val_batches=3) + trainer.fit(model) + assert not wandb.init().log_artifact.called + + # test correct metadata + import pytorch_lightning.loggers.wandb as pl_wandb + pl_wandb._WANDB_GREATER_EQUAL_0_10_22 = True + wandb.init().log_artifact.reset_mock() + wandb.init.reset_mock() + wandb.Artifact.reset_mock() + logger = pl_wandb.WandbLogger(log_model=True) + logger.experiment.id = '1' + logger.experiment.project_name.return_value = 'project' + trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=2, limit_train_batches=3, limit_val_batches=3) + trainer.fit(model) + wandb.Artifact.assert_called_once_with( + name='model-1', + type='model', + metadata={ + 'score': None, + 'original_filename': 'epoch=1-step=5-v3.ckpt', + 'ModelCheckpoint': { + 'monitor': None, + 'mode': 'min', + 'save_last': None, + 'save_top_k': None, + 'save_weights_only': False, + '_every_n_train_steps': 0, + '_every_n_val_epochs': 1 + } + } + ) + + def test_wandb_sanitize_callable_params(tmpdir): """ Callback function are not serializiable. Therefore, we get them a chance to return diff --git a/tests/loops/__init__.py b/tests/loops/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py new file mode 100644 index 0000000000000..1930dc46566fd --- /dev/null +++ b/tests/loops/test_loop_state_dict.py @@ -0,0 +1,54 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pytest
+
+from pytorch_lightning.loops import FitLoop
+from pytorch_lightning.trainer.trainer import Trainer
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+
+
+def test_loops_state_dict():
+    fit_loop = FitLoop()
+    with pytest.raises(MisconfigurationException, match="Loop FitLoop should be connected to a"):
+        fit_loop.connect(object())  # noqa
+
+    fit_loop.connect(Trainer())
+    state_dict = fit_loop.state_dict()
+    new_fit_loop = FitLoop()
+    new_fit_loop.load_state_dict(state_dict)
+    assert fit_loop.state_dict() == new_fit_loop.state_dict()
+
+
+def test_loops_state_dict_structure():
+    trainer = Trainer()
+    # structure saved by the checkpoint connector
+    state_dict = {
+        "fit_loop": trainer.fit_loop.state_dict(),
+        "validate_loop": trainer.validate_loop.state_dict(),
+        "test_loop": trainer.test_loop.state_dict(),
+        "predict_loop": trainer.predict_loop.state_dict(),
+    }
+    expected = {
+        "fit_loop": {
+            'epoch_loop': {
+                'batch_loop': {},
+                'val_loop': {},
+            }
+        },
+        "validate_loop": {},
+        "test_loop": {},
+        "predict_loop": {},
+    }
+    assert state_dict == expected
diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py
new file mode 100644
index 0000000000000..af5801d2b4552
--- /dev/null
+++ b/tests/loops/test_loops.py
@@ -0,0 +1,74 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Iterator
+
+from pytorch_lightning.loops.base import Loop
+
+
+def test_loop_restore():
+
+    class CustomException(Exception):
+        pass
+
+    class Simple(Loop):
+
+        def __init__(self, dataset: Iterator):
+            super().__init__()
+            self.dataset = dataset
+
+        def restore(self) -> None:
+            self.iter_dataset = iter(self.dataset)
+            for _ in range(self.iteration_count):
+                next(self.iter_dataset)
+            self.iteration_count += 1
+
+        @property
+        def done(self) -> bool:
+            return self.iteration_count > len(self.dataset)
+
+        def reset(self) -> None:
+            self.iter_dataset = iter(self.dataset)
+            self.outputs = []
+
+        def advance(self) -> None:
+            value = next(self.iter_dataset)
+
+            if self.iteration_count == 5:
+                raise CustomException
+
+            self.outputs.append(value)
+
+        def state_dict(self) -> Dict:
+            return {"iteration_count": self.iteration_count, "outputs": self.outputs}
+
+        def load_state_dict(self, state_dict: Dict) -> None:
+            self.iteration_count = state_dict["iteration_count"]
+            self.outputs = state_dict["outputs"]
+
+    data = range(10)
+    loop = Simple(data)
+    try:
+        loop.run()
+        state_dict = {}
+    except CustomException:
+        state_dict = loop.state_dict()
+
+    loop = Simple(data)
+    loop.load_state_dict(state_dict)
+    loop.restarting = True
+    loop.run()
+
+    assert not loop.restarting
+    assert loop.outputs == list(range(10))
diff --git a/tests/metrics/test_metric_lightning.py b/tests/metrics/test_metric_lightning.py
index e52e39cb16488..6be288b00113b 100644
--- a/tests/metrics/test_metric_lightning.py
+++ b/tests/metrics/test_metric_lightning.py
@@ -1,9 +1,26 @@
+# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest import torch +from torch import nn from torchmetrics import Metric as TMetric from pytorch_lightning import Trainer from pytorch_lightning.metrics import Metric as PLMetric from pytorch_lightning.metrics import MetricCollection +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel @@ -78,6 +95,7 @@ def __init__(self): self.metric_step = SumMetric() self.metric_epoch = SumMetric() self.sum = 0.0 + self.total_sum = 0.0 def on_epoch_start(self): self.sum = 0.0 @@ -90,7 +108,10 @@ def training_step(self, batch, batch_idx): return {'loss': self.step(x), 'data': x} def training_epoch_end(self, outs): - self.log("sum_epoch", self.metric_epoch(torch.stack([o['data'] for o in outs]).sum())) + total = torch.stack([o['data'] for o in outs]).sum() + self.metric_epoch(total) + self.log("sum_epoch", self.metric_epoch) + self.total_sum = total model = TestModel() model.val_dataloader = None @@ -107,7 +128,7 @@ def training_epoch_end(self, outs): logged = trainer.logged_metrics assert torch.allclose(torch.tensor(logged["sum_step"]), model.sum) - assert torch.allclose(torch.tensor(logged["sum_epoch"]), model.sum) + assert torch.allclose(torch.tensor(logged["sum_epoch"]), model.total_sum) def test_scriptable(tmpdir): @@ -188,3 +209,59 @@ def training_epoch_end(self, outputs): logged = trainer.logged_metrics assert torch.allclose(torch.tensor(logged["SumMetric_epoch"]), model.sum) assert torch.allclose(torch.tensor(logged["DiffMetric_epoch"]), model.diff) + + +def test_log_metric_no_attributes_raises(tmpdir): + + class TestModel(BoringModel): + + def training_step(self, *args): + metric = SumMetric() + self.log("foo", metric) + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + model = TestModel() + with pytest.raises(MisconfigurationException, match="Could not find the `LightningModule` attribute"): + trainer.fit(model) + + +def test_log_metric_dict(tmpdir): + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.metrics = nn.ModuleDict({'sum': SumMetric(), 'diff': DiffMetric()}) + self.sum = 0.0 + self.diff = 0.0 + + def training_step(self, batch, batch_idx): + x = batch + self.metrics['sum'](x.sum()) + self.metrics['diff'](x.sum()) + self.sum += x.sum() + self.diff -= x.sum() + self.log_dict({f'{k}_step': v for k, v in self.metrics.items()}) + return self.step(x) + + def training_epoch_end(self, outputs): + self.metrics['sum'].compute() + self.metrics['diff'].compute() + self.log_dict({f'{k}_epoch': v for k, v in self.metrics.items()}) + + model = TestModel() + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.fit(model) + + logged = trainer.logged_metrics + assert torch.allclose(torch.tensor(logged["sum_epoch"]), model.sum) + assert 
torch.allclose(torch.tensor(logged["diff_epoch"]), model.diff) diff --git a/tests/metrics/test_remove_1-5_metrics.py b/tests/metrics/test_remove_1-5_metrics.py index d3703bf3691c9..aa7d4977d1133 100644 --- a/tests/metrics/test_remove_1-5_metrics.py +++ b/tests/metrics/test_remove_1-5_metrics.py @@ -215,7 +215,7 @@ def test_v1_5_metric_classif_mix(): preds = torch.tensor([0, 1, 0, 0]) confusion_matrix._warned = False with pytest.deprecated_call(match='It will be removed in v1.5.0'): - assert torch.equal(confusion_matrix(preds, target, num_classes=2), torch.tensor([[2., 0.], [1., 1.]])) + assert torch.equal(confusion_matrix(preds, target, num_classes=2).float(), torch.tensor([[2., 0.], [1., 1.]])) target = torch.tensor([0, 1, 2, 0, 1, 2]) preds = torch.tensor([0, 2, 1, 0, 0, 1]) diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py index f1f17d0624936..29c530953f99c 100644 --- a/tests/metrics/utils.py +++ b/tests/metrics/utils.py @@ -66,7 +66,7 @@ def _class_test( metric_class: Metric, sk_metric: Callable, dist_sync_on_step: bool, - metric_args: dict = {}, + metric_args: dict = None, check_dist_sync_on_step: bool = True, check_batch: bool = True, atol: float = 1e-8, @@ -89,6 +89,8 @@ def _class_test( check_batch: bool, if true will check if the metric is also correctly calculated across devices for each batch (and not just at the end) """ + if metric_args is None: + metric_args = {} # Instanciate lightning metric metric = metric_class(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args) @@ -130,7 +132,7 @@ def _functional_test( target: torch.Tensor, metric_functional: Callable, sk_metric: Callable, - metric_args: dict = {}, + metric_args: dict = None, atol: float = 1e-8, ): """Utility function doing the actual comparison between lightning functional metric @@ -143,6 +145,8 @@ def _functional_test( sk_metric: callable function that is used for comparison metric_args: dict with additional arguments used for class initialization """ + if metric_args is None: + metric_args = {} metric = partial(metric_functional, **metric_args) for i in range(NUM_BATCHES): @@ -185,7 +189,7 @@ def run_functional_metric_test( target: torch.Tensor, metric_functional: Callable, sk_metric: Callable, - metric_args: dict = {}, + metric_args: dict = None, ): """Main method that should be used for testing functions. 
Call this inside testing method @@ -197,6 +201,8 @@ def run_functional_metric_test( sk_metric: callable function that is used for comparison metric_args: dict with additional arguments used for class initialization """ + if metric_args is None: + metric_args = {} _functional_test( preds=preds, target=target, @@ -214,7 +220,7 @@ def run_class_metric_test( metric_class: Metric, sk_metric: Callable, dist_sync_on_step: bool, - metric_args: dict = {}, + metric_args: dict = None, check_dist_sync_on_step: bool = True, check_batch: bool = True, ): @@ -235,6 +241,8 @@ def run_class_metric_test( check_batch: bool, if true will check if the metric is also correctly calculated across devices for each batch (and not just at the end) """ + if metric_args is None: + metric_args = {} if ddp: if sys.platform == "win32": pytest.skip("DDP not supported on windows") diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index ed0d33f5e8c82..c4cbaeb1363c9 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -87,7 +87,7 @@ def training_epoch_end(self, outputs) -> None: trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger) # test HPC loading checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path) - trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu) + trainer.checkpoint_connector.restore(checkpoint_path) if on_gpu: trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index b54e0d091bd16..84721fe8b575c 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -13,6 +13,7 @@ # limitations under the License. 
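# The `metric_args: dict = {}` to `metric_args: dict = None` changes above fix Python's
# shared-mutable-default pitfall: a `{}` default is created once at function definition
# and reused by every call, so mutations leak between calls. A minimal demonstration:
def buggy(metric_args: dict = {}):
    metric_args['calls'] = metric_args.get('calls', 0) + 1
    return metric_args


def fixed(metric_args: dict = None):
    if metric_args is None:
        metric_args = {}
    metric_args['calls'] = metric_args.get('calls', 0) + 1
    return metric_args


assert buggy() == {'calls': 1}
assert buggy() == {'calls': 2}  # the same dict leaked across calls
assert fixed() == {'calls': 1}
assert fixed() == {'calls': 1}  # a fresh dict on every call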
import os

+import pytest
import torch

import tests.helpers.pipelines as tpipes
@@ -322,7 +323,8 @@ def test_all_features_cpu_model(tmpdir):
    tpipes.run_model_test(trainer_options, model, on_gpu=False, min_acc=0.01)


-def test_tbptt_cpu_model(tmpdir):
+@pytest.mark.parametrize("n_hidden_states", [1, 2])
+def test_tbptt_cpu_model(tmpdir, n_hidden_states):
    """Test truncated back propagation through time works."""
    truncated_bptt_steps = 2
    sequence_size = 30
@@ -341,15 +343,19 @@ def __len__(self):

    class BpttTestModel(BoringModel):

-        def __init__(self, batch_size, in_features, out_features, *args, **kwargs):
+        def __init__(self, batch_size, in_features, out_features, n_hidden_states, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.test_hidden = None
            self.batch_size = batch_size
            self.layer = torch.nn.Linear(in_features, out_features)
+            self.n_hidden_states = n_hidden_states

        def training_step(self, batch, batch_idx, hiddens):
            assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps"
-            self.test_hidden = torch.rand(1)
+            if self.n_hidden_states == 1:
+                self.test_hidden = torch.rand(1)
+            else:
+                self.test_hidden = tuple([torch.rand(1)] * self.n_hidden_states)

            x_tensor, y_list = batch
            assert x_tensor.shape[1] == truncated_bptt_steps, "tbptt split Tensor failed"
@@ -378,7 +384,12 @@ def train_dataloader(self):
        sampler=None,
    )

-    model = BpttTestModel(batch_size=batch_size, in_features=truncated_bptt_steps, out_features=truncated_bptt_steps)
+    model = BpttTestModel(
+        batch_size=batch_size,
+        in_features=truncated_bptt_steps,
+        out_features=truncated_bptt_steps,
+        n_hidden_states=n_hidden_states
+    )
    model.example_input_array = torch.randn(5, truncated_bptt_steps)

    # fit model
@@ -390,5 +401,4 @@ def train_dataloader(self):
        weights_summary=None,
    )
    trainer.fit(model)
-
-    assert trainer.state.finished, f"Training failed with {trainer.state}"
+    assert trainer.state.finished, f"Training model with `{n_hidden_states}` hidden state(s) failed with {trainer.state}"
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 65a1e093a9e96..cd7c90552ab2e 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import operator
+import os
from collections import namedtuple
+from unittest import mock
from unittest.mock import patch

import pytest
@@ -21,6 +23,7 @@
import tests.helpers.pipelines as tpipes
import tests.helpers.utils as tutils
from pytorch_lightning import Trainer
+from pytorch_lightning.plugins.environments import TorchElasticEnvironment
from pytorch_lightning.utilities import device_parser
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _compare_version
@@ -219,6 +222,29 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun
        device_parser.parse_gpu_ids(gpus)


+@mock.patch.dict(
+    os.environ, {
+        "CUDA_VISIBLE_DEVICES": "0",
+        "LOCAL_RANK": "1",
+        "GROUP_RANK": "1",
+        "RANK": "3",
+        "WORLD_SIZE": "4",
+        "LOCAL_WORLD_SIZE": "2",
+    }
+)
+@mock.patch('torch.cuda.device_count', return_value=1)
+@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, '0'])
+def test_torchelastic_gpu_parsing(mocked_device_count, gpus):
+    """
+    Ensure that, when torchelastic is used and `nproc_per_node` is set to the default of one process per GPU device,
+    we omit sanitizing the GPUs, as only one of the GPUs is visible. 
+ """ + trainer = Trainer(gpus=gpus) + assert isinstance(trainer.accelerator_connector.cluster_environment, TorchElasticEnvironment) + assert trainer.accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) + assert trainer.gpus == gpus + + @RunIf(min_gpus=1) def test_single_gpu_batch_parse(): trainer = Trainer(gpus=1) diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 0e380e085ce6a..384e643e184fe 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -11,8 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os -from unittest import mock from unittest.mock import patch import numpy as np @@ -59,15 +57,19 @@ def on_after_backward(self): self.stored_grad_norms.append(out) -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @pytest.mark.parametrize("norm_type", [1., 1.25, 2, 3, 5, 10, 'inf']) def test_grad_tracking(tmpdir, norm_type, rtol=5e-3): # rtol=5e-3 respects the 3 decimals rounding in `.grad_norms` and above - reset_seed() - # use a custom grad tracking module and a list logger - model = ModelWithManualGradTracker(norm_type) + class TestModel(ModelWithManualGradTracker): + logged_metrics = [] + + def on_train_batch_end(self, *_) -> None: + # copy so they don't get reduced + self.logged_metrics.append(self.trainer.logged_metrics.copy()) + + model = TestModel(norm_type) trainer = Trainer( default_root_dir=tmpdir, @@ -76,18 +78,13 @@ def test_grad_tracking(tmpdir, norm_type, rtol=5e-3): log_every_n_steps=1, # request grad_norms every batch ) trainer.fit(model) - assert trainer.state.finished, f"Training failed with {trainer.state}" - logged_metrics = trainer.dev_debugger.logged_metrics - assert len(logged_metrics) == len(model.stored_grad_norms) + assert len(model.logged_metrics) == len(model.stored_grad_norms) # compare the logged metrics against tracked norms on `.backward` - for mod, log in zip(model.stored_grad_norms, logged_metrics): - common = mod.keys() & log.keys() - - log, mod = [log[k] for k in common], [mod[k] for k in common] - - assert np.allclose(log, mod, rtol=rtol) + for mod, log in zip(model.stored_grad_norms, model.logged_metrics): + for k in (mod.keys() & log.keys()): + assert np.allclose(mod[k], log[k], rtol=rtol), k @pytest.mark.parametrize("log_every_n_steps", [1, 2, 3]) @@ -111,5 +108,9 @@ def test_grad_tracking_interval(tmpdir, log_every_n_steps): if grad_norm_dict: grad_norm_dicts.append(grad_norm_dict) - assert len(grad_norm_dicts) == expected - assert all(grad_norm_dicts[0].keys() == g.keys() for g in grad_norm_dicts) + # logging on n steps + 1 epochs + assert len(grad_norm_dicts) == expected + 1 + # check all metrics derived from steps have the same keys + assert all(grad_norm_dicts[0].keys() == g.keys() for g in grad_norm_dicts[:-1]) + epoch_end_keys = [k.replace("step", "epoch") for k in grad_norm_dicts[0]] + assert epoch_end_keys == list(grad_norm_dicts[-1]) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 913f403a14dd3..9a689fe9d725a 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -11,14 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
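# The grad-norm test above replaces the removed dev-debugger by snapshotting
# `trainer.logged_metrics` from `on_train_batch_end`; the `.copy()` matters because
# the trainer mutates that dict in place on later steps. The same snapshot idiom in
# isolation, with a plain dict standing in for the trainer's metrics:
class MetricsRecorder:

    def __init__(self):
        self.snapshots = []

    def on_train_batch_end(self, logged_metrics: dict) -> None:
        # copy so later in-place updates cannot rewrite past snapshots
        self.snapshots.append(logged_metrics.copy())


recorder = MetricsRecorder()
metrics = {}
for step in range(2):
    metrics['grad_2.0_norm_total'] = float(step)  # mutated in place, like the trainer does
    recorder.on_train_batch_end(metrics)

assert recorder.snapshots == [{'grad_2.0_norm_total': 0.0}, {'grad_2.0_norm_total': 1.0}]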
+from functools import partial +from inspect import getmembers, isfunction from unittest import mock -from unittest.mock import PropertyMock +from unittest.mock import ANY, PropertyMock import pytest import torch from torch.utils.data import DataLoader -from pytorch_lightning import Trainer +from pytorch_lightning import __version__, Callback, LightningDataModule, LightningModule, Trainer from tests.helpers import BoringDataModule, BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -229,215 +231,151 @@ def train_dataloader(self): trainer.fit(model) -class HookedModel(BoringModel): - - def __init__(self): - super().__init__() - self.called = [] - self.train_batch = [ - 'on_train_batch_start', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'training_step', - 'on_before_zero_grad', - 'optimizer_zero_grad', - 'backward', - 'on_after_backward', - 'optimizer_step', - 'on_train_batch_end', - ] - self.val_batch = [ - 'on_validation_batch_start', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'on_validation_batch_end', - ] - - def training_step(self, *args, **kwargs): - self.called.append("training_step") - return super().training_step(*args, **kwargs) - - def optimizer_zero_grad(self, *args, **kwargs): - self.called.append("optimizer_zero_grad") - super().optimizer_zero_grad(*args, **kwargs) - - def training_epoch_end(self, *args, **kwargs): - self.called.append("training_epoch_end") - super().training_epoch_end(*args, **kwargs) - - def backward(self, *args, **kwargs): - self.called.append("backward") - super().backward(*args, **kwargs) - - def on_after_backward(self): - self.called.append("on_after_backward") - super().on_after_backward() - - def optimizer_step(self, *args, **kwargs): - super().optimizer_step(*args, **kwargs) - self.called.append("optimizer_step") # append after as closure calls other methods - - def validation_epoch_end(self, *args, **kwargs): - self.called.append("validation_epoch_end") - super().validation_epoch_end(*args, **kwargs) - - def on_before_zero_grad(self, *args, **kwargs): - self.called.append("on_before_zero_grad") - super().on_before_zero_grad(*args, **kwargs) - - def on_epoch_start(self): - self.called.append("on_epoch_start") - super().on_epoch_start() - - def on_epoch_end(self): - self.called.append("on_epoch_end") - super().on_epoch_end() - - def on_fit_start(self): - self.called.append("on_fit_start") - super().on_fit_start() - - def on_fit_end(self): - self.called.append("on_fit_end") - super().on_fit_end() - - def on_hpc_load(self, *args, **kwargs): - self.called.append("on_hpc_load") - super().on_hpc_load(*args, **kwargs) - - def on_hpc_save(self, *args, **kwargs): - self.called.append("on_hpc_save") - super().on_hpc_save(*args, **kwargs) - - def on_load_checkpoint(self, *args, **kwargs): - self.called.append("on_load_checkpoint") - super().on_load_checkpoint(*args, **kwargs) - - def on_save_checkpoint(self, *args, **kwargs): - self.called.append("on_save_checkpoint") - super().on_save_checkpoint(*args, **kwargs) +def get_members(cls): + return {h for h, _ in getmembers(cls, predicate=isfunction) if not h.startswith('_')} - def on_pretrain_routine_start(self): - self.called.append("on_pretrain_routine_start") - super().on_pretrain_routine_start() - def on_pretrain_routine_end(self): - self.called.append("on_pretrain_routine_end") - super().on_pretrain_routine_end() +class HookedCallback(Callback): - def on_train_start(self): - 
self.called.append("on_train_start") - super().on_train_start() + def __init__(self, called): - def on_train_end(self): - self.called.append("on_train_end") - super().on_train_end() + def call(hook, *args, **kwargs): + d = {'name': f'Callback.{hook}'} + if args: + d['args'] = args + if kwargs: + d['kwargs'] = kwargs + called.append(d) - def on_before_batch_transfer(self, *args, **kwargs): - self.called.append("on_before_batch_transfer") - return super().on_before_batch_transfer(*args, **kwargs) + for h in get_members(Callback): + setattr(self, h, partial(call, h)) - def transfer_batch_to_device(self, *args, **kwargs): - self.called.append("transfer_batch_to_device") - return super().transfer_batch_to_device(*args, **kwargs) - def on_after_batch_transfer(self, *args, **kwargs): - self.called.append("on_after_batch_transfer") - return super().on_after_batch_transfer(*args, **kwargs) - - def on_train_batch_start(self, *args, **kwargs): - self.called.append("on_train_batch_start") - super().on_train_batch_start(*args, **kwargs) - - def on_train_batch_end(self, *args, **kwargs): - self.called.append("on_train_batch_end") - super().on_train_batch_end(*args, **kwargs) - - def on_train_epoch_start(self): - self.called.append("on_train_epoch_start") - super().on_train_epoch_start() - - def on_train_epoch_end(self): - self.called.append("on_train_epoch_end") - super().on_train_epoch_end() - - def on_validation_start(self): - self.called.append("on_validation_start") - super().on_validation_start() - - def on_validation_end(self): - self.called.append("on_validation_end") - super().on_validation_end() - - def on_validation_batch_start(self, *args, **kwargs): - self.called.append("on_validation_batch_start") - super().on_validation_batch_start(*args, **kwargs) - - def on_validation_batch_end(self, *args, **kwargs): - self.called.append("on_validation_batch_end") - super().on_validation_batch_end(*args, **kwargs) - - def on_validation_epoch_start(self): - self.called.append("on_validation_epoch_start") - super().on_validation_epoch_start() - - def on_validation_epoch_end(self, *args, **kwargs): - self.called.append("on_validation_epoch_end") - super().on_validation_epoch_end(*args, **kwargs) - - def on_test_start(self): - self.called.append("on_test_start") - super().on_test_start() - - def on_test_batch_start(self, *args, **kwargs): - self.called.append("on_test_batch_start") - super().on_test_batch_start(*args, **kwargs) - - def on_test_batch_end(self, *args, **kwargs): - self.called.append("on_test_batch_end") - super().on_test_batch_end(*args, **kwargs) - - def on_test_epoch_start(self): - self.called.append("on_test_epoch_start") - super().on_test_epoch_start() - - def on_test_epoch_end(self, *args, **kwargs): - self.called.append("on_test_epoch_end") - super().on_test_epoch_end(*args, **kwargs) - - def on_validation_model_eval(self): - self.called.append("on_validation_model_eval") - super().on_validation_model_eval() - - def on_validation_model_train(self): - self.called.append("on_validation_model_train") - super().on_validation_model_train() - - def on_test_model_eval(self): - self.called.append("on_test_model_eval") - super().on_test_model_eval() - - def on_test_model_train(self): - self.called.append("on_test_model_train") - super().on_test_model_train() +class HookedModel(BoringModel): - def on_test_end(self): - self.called.append("on_test_end") - super().on_test_end() + def __init__(self, called): + super().__init__() + pl_module_hooks = get_members(LightningModule) + # remove most 
`nn.Module` hooks + module_hooks = get_members(torch.nn.Module) + pl_module_hooks.difference_update(module_hooks - {'forward', 'zero_grad', 'train'}) + + def call(hook, fn, *args, **kwargs): + out = fn(*args, **kwargs) + d = {'name': hook} + if args: + d['args'] = args + if kwargs: + d['kwargs'] = kwargs + called.append(d) + return out + + for h in pl_module_hooks: + attr = getattr(self, h) + setattr(self, h, partial(call, h, attr)) - def setup(self, stage=None): - self.called.append(f"setup_{stage}") - super().setup(stage=stage) + def validation_epoch_end(self, *args, **kwargs): + # `BoringModel` does not have a return for `validation_step_end` so this would fail + pass + + def test_epoch_end(self, *args, **kwargs): + # `BoringModel` does not have a return for `test_step_end` so this would fail + pass + + @staticmethod + def _train_batch(trainer, model, batches, current_epoch=0): + out = [] + for i in range(batches): + out.extend([ + # TODO: `on_batch_{start,end}` + dict(name='Callback.on_batch_start', args=(trainer, model)), + dict(name='Callback.on_train_batch_start', args=(trainer, model, ANY, i, 0)), + dict(name='on_train_batch_start', args=(ANY, i, 0)), + dict(name='on_before_batch_transfer', args=(ANY, None)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), + dict(name='on_after_batch_transfer', args=(ANY, None)), + dict(name='forward', args=(ANY, )), + dict(name='training_step', args=(ANY, i)), + dict(name='training_step_end', args=(dict(loss=ANY), )), + dict(name='Callback.on_before_zero_grad', args=(trainer, model, ANY)), + dict(name='on_before_zero_grad', args=(ANY, )), + dict(name='optimizer_zero_grad', args=(current_epoch, i, ANY, 0)), + # TODO: `on_before_backward` + dict(name='backward', args=(ANY, ANY, 0)), + dict(name='Callback.on_after_backward', args=(trainer, model)), + dict(name='on_after_backward'), + # TODO: `on_before_optimizer_step` + dict( + name='optimizer_step', + args=(current_epoch, i, ANY, 0, ANY), + kwargs=dict(on_tpu=False, using_lbfgs=False, using_native_amp=False) + ), + dict(name='Callback.on_train_batch_end', args=(trainer, model, dict(loss=ANY), ANY, i, 0)), + dict(name='on_train_batch_end', args=(dict(loss=ANY), ANY, i, 0)), + dict(name='Callback.on_batch_end', args=(trainer, model)), + ]) + return out + + @staticmethod + def _eval_epoch(fn, trainer, model, batches, key): + outputs = {key: ANY} + return [ + dict(name='Callback.on_epoch_start', args=(trainer, model)), + dict(name='on_epoch_start'), + dict(name=f'Callback.on_{fn}_epoch_start', args=(trainer, model)), + dict(name=f'on_{fn}_epoch_start'), + *HookedModel._eval_batch(fn, trainer, model, batches, key), + dict(name=f'{fn}_epoch_end', args=([outputs] * batches, )), + dict(name=f'Callback.on_{fn}_epoch_end', args=(trainer, model)), + dict(name=f'on_{fn}_epoch_end'), + dict(name='Callback.on_epoch_end', args=(trainer, model)), + dict(name='on_epoch_end'), + ] - def teardown(self, stage=None): - self.called.append(f"teardown_{stage}") - super().teardown(stage) + @staticmethod + def _eval_batch(fn, trainer, model, batches, key): + out = [] + outputs = {key: ANY} + for i in range(batches): + out.extend([ + # TODO: `{,Callback}.on_batch_{start,end}` + dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)), + dict(name=f'on_{fn}_batch_start', args=(ANY, i, 0)), + dict(name='on_before_batch_transfer', args=(ANY, None)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), + dict(name='on_after_batch_transfer', args=(ANY, 
None)), + dict(name='forward', args=(ANY, )), + dict(name=f'{fn}_step', args=(ANY, i)), + dict(name=f'{fn}_step_end', args=(outputs, )), + dict(name=f'Callback.on_{fn}_batch_end', args=(trainer, model, outputs, ANY, i, 0)), + dict(name=f'on_{fn}_batch_end', args=(outputs, ANY, i, 0)), + ]) + return out + + @staticmethod + def _predict_batch(trainer, model, batches): + out = [] + for i in range(batches): + out.extend([ + # TODO: `{,Callback}.on_batch_{start,end}` + dict(name='Callback.on_predict_batch_start', args=(trainer, model, ANY, i, 0)), + dict(name='on_predict_batch_start', args=(ANY, i, 0)), + dict(name='on_before_batch_transfer', args=(ANY, None)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), + dict(name='on_after_batch_transfer', args=(ANY, None)), + dict(name='forward', args=(ANY, )), + dict(name='predict_step', args=(ANY, i)), + # TODO: `predict_step_end` + dict(name='Callback.on_predict_batch_end', args=(trainer, model, ANY, ANY, i, 0)), + dict(name='on_predict_batch_end', args=(ANY, ANY, i, 0)), + ]) + return out def test_trainer_model_hook_system_fit(tmpdir): - model = HookedModel() + called = [] + model = HookedModel(called) + callback = HookedCallback(called) train_batches = 2 val_batches = 2 trainer = Trainer( @@ -447,143 +385,303 @@ def test_trainer_model_hook_system_fit(tmpdir): limit_val_batches=val_batches, progress_bar_refresh_rate=0, weights_summary=None, + callbacks=[callback] ) - assert model.called == [] + assert called == [ + dict(name='Callback.on_init_start', args=(trainer, )), + dict(name='Callback.on_init_end', args=(trainer, )), + ] trainer.fit(model) + saved_ckpt = { + 'callbacks': ANY, + 'epoch': 1, + 'global_step': train_batches, + 'lr_schedulers': ANY, + 'optimizer_states': ANY, + 'pytorch-lightning_version': __version__, + 'state_dict': ANY, + } expected = [ - 'setup_fit', - 'on_fit_start', - 'on_pretrain_routine_start', - 'on_pretrain_routine_end', - 'on_validation_model_eval', - 'on_validation_start', - 'on_epoch_start', - 'on_validation_epoch_start', - *(model.val_batch * val_batches), - 'validation_epoch_end', - 'on_validation_epoch_end', - 'on_epoch_end', - 'on_validation_end', - 'on_validation_model_train', - 'on_train_start', - 'on_epoch_start', - 'on_train_epoch_start', - *(model.train_batch * train_batches), - 'on_validation_model_eval', - 'on_validation_start', - 'on_epoch_start', - 'on_validation_epoch_start', - *(model.val_batch * val_batches), - 'validation_epoch_end', - 'on_validation_epoch_end', - 'on_epoch_end', - 'on_save_checkpoint', - 'on_validation_end', - 'on_validation_model_train', - 'training_epoch_end', - 'on_train_epoch_end', - 'on_epoch_end', - 'on_train_end', - 'on_fit_end', - 'teardown_fit', + dict(name='Callback.on_init_start', args=(trainer, )), + dict(name='Callback.on_init_end', args=(trainer, )), + dict(name='prepare_data'), + dict(name='configure_callbacks'), + dict(name='Callback.on_before_accelerator_backend_setup', args=(trainer, model)), + dict(name='Callback.setup', args=(trainer, model), kwargs=dict(stage='fit')), + dict(name='setup', kwargs=dict(stage='fit')), + dict(name='configure_sharded_model'), + dict(name='Callback.on_configure_sharded_model', args=(trainer, model)), + dict(name='configure_optimizers'), + dict(name='Callback.on_fit_start', args=(trainer, model)), + dict(name='on_fit_start'), + dict(name='Callback.on_pretrain_routine_start', args=(trainer, model)), + dict(name='on_pretrain_routine_start'), + dict(name='Callback.on_pretrain_routine_end', args=(trainer, 
model)), + dict(name='on_pretrain_routine_end'), + dict(name='Callback.on_sanity_check_start', args=(trainer, model)), + dict(name='on_val_dataloader'), + dict(name='val_dataloader'), + dict(name='train', args=(False, )), + dict(name='on_validation_model_eval'), + dict(name='zero_grad'), + dict(name='Callback.on_validation_start', args=(trainer, model)), + dict(name='on_validation_start'), + *model._eval_epoch('validation', trainer, model, val_batches, 'x'), + dict(name='Callback.on_validation_end', args=(trainer, model)), + dict(name='on_validation_end'), + dict(name='train'), + dict(name='on_validation_model_train'), + dict(name='Callback.on_sanity_check_end', args=(trainer, model)), + # duplicate `train` because `_run_train` calls it again in case validation wasn't run + dict(name='train'), + dict(name='on_train_dataloader'), + dict(name='train_dataloader'), + dict(name='Callback.on_train_start', args=(trainer, model)), + dict(name='on_train_start'), + dict(name='Callback.on_epoch_start', args=(trainer, model)), + dict(name='on_epoch_start'), + dict(name='Callback.on_train_epoch_start', args=(trainer, model)), + dict(name='on_train_epoch_start'), + *model._train_batch(trainer, model, train_batches), + dict(name='train', args=(False, )), + dict(name='on_validation_model_eval'), + dict(name='zero_grad'), + dict(name='Callback.on_validation_start', args=(trainer, model)), + dict(name='on_validation_start'), + *model._eval_epoch('validation', trainer, model, val_batches, 'x'), + dict(name='Callback.on_validation_end', args=(trainer, model)), + # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_validation_end` + dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)), + dict(name='on_save_checkpoint', args=(saved_ckpt, )), + dict(name='on_validation_end'), + dict(name='train'), + dict(name='on_validation_model_train'), + dict(name='training_epoch_end', args=([dict(loss=ANY)] * train_batches, )), + dict(name='Callback.on_train_epoch_end', args=(trainer, model, [dict(loss=ANY)] * train_batches)), + dict(name='on_train_epoch_end', args=([dict(loss=ANY)] * train_batches, )), + dict(name='Callback.on_epoch_end', args=(trainer, model)), + dict(name='on_epoch_end'), + dict(name='Callback.on_train_end', args=(trainer, model)), + dict(name='on_train_end'), + dict(name='Callback.on_fit_end', args=(trainer, model)), + dict(name='on_fit_end'), + dict(name='Callback.teardown', args=(trainer, model), kwargs=dict(stage='fit')), + dict(name='teardown', kwargs=dict(stage='fit')), ] - assert model.called == expected + assert called == expected -def test_trainer_model_hook_system_fit_no_val(tmpdir): - model = HookedModel() +def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): + # initial training to get a checkpoint + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=1, + limit_val_batches=0, + progress_bar_refresh_rate=0, + weights_summary=None, + ) + trainer.fit(model) + best_model_path = trainer.checkpoint_callback.best_model_path + + # resume from checkpoint with HookedModel + called = [] + model = HookedModel(called) + callback = HookedCallback(called) train_batches = 2 trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1, + # already performed 1 step, now resuming to do an additional 2 + max_steps=(1 + train_batches), limit_val_batches=0, - limit_train_batches=train_batches, progress_bar_refresh_rate=0, weights_summary=None, + resume_from_checkpoint=best_model_path, + callbacks=[callback] ) - assert model.called == 
[] + assert called == [ + dict(name='Callback.on_init_start', args=(trainer, )), + dict(name='Callback.on_init_end', args=(trainer, )), + ] trainer.fit(model) + saved_ckpt = { + 'callbacks': ANY, + 'epoch': 2, # TODO: wrong saved epoch + 'global_step': (1 + train_batches), + 'lr_schedulers': ANY, + 'optimizer_states': ANY, + 'pytorch-lightning_version': __version__, + 'state_dict': ANY, + } expected = [ - 'setup_fit', - 'on_fit_start', - 'on_pretrain_routine_start', - 'on_pretrain_routine_end', - 'on_train_start', - 'on_epoch_start', - 'on_train_epoch_start', - *(model.train_batch * train_batches), - 'training_epoch_end', - 'on_train_epoch_end', - 'on_epoch_end', - 'on_save_checkpoint', # from train epoch end - 'on_train_end', - 'on_fit_end', - 'teardown_fit', + dict(name='Callback.on_init_start', args=(trainer, )), + dict(name='Callback.on_init_end', args=(trainer, )), + dict(name='prepare_data'), + dict(name='configure_callbacks'), + dict(name='Callback.on_before_accelerator_backend_setup', args=(trainer, model)), + dict(name='Callback.setup', args=(trainer, model), kwargs=dict(stage='fit')), + dict(name='setup', kwargs=dict(stage='fit')), + dict( + name='on_load_checkpoint', + args=({ + 'callbacks': ANY, + 'epoch': 1, + 'global_step': 1, + 'lr_schedulers': ANY, + 'optimizer_states': ANY, + 'pytorch-lightning_version': __version__, + 'state_dict': ANY, + }, ) + ), + dict(name='configure_sharded_model'), + dict(name='Callback.on_configure_sharded_model', args=(trainer, model)), + dict(name='configure_optimizers'), + dict(name='Callback.on_fit_start', args=(trainer, model)), + dict(name='on_fit_start'), + dict(name='Callback.on_pretrain_routine_start', args=(trainer, model)), + dict(name='on_pretrain_routine_start'), + dict(name='Callback.on_pretrain_routine_end', args=(trainer, model)), + dict(name='on_pretrain_routine_end'), + dict(name='train'), + dict(name='on_train_dataloader'), + dict(name='train_dataloader'), + # even though no validation runs, we initialize the val dataloader for properties like `num_val_batches` + dict(name='on_val_dataloader'), + dict(name='val_dataloader'), + dict(name='Callback.on_train_start', args=(trainer, model)), + dict(name='on_train_start'), + dict(name='Callback.on_epoch_start', args=(trainer, model)), + dict(name='on_epoch_start'), + dict(name='Callback.on_train_epoch_start', args=(trainer, model)), + dict(name='on_train_epoch_start'), + # TODO: wrong current epoch after reload + *model._train_batch(trainer, model, train_batches, current_epoch=1), + dict(name='training_epoch_end', args=([dict(loss=ANY)] * train_batches, )), + dict(name='Callback.on_train_epoch_end', args=( + trainer, + model, + [dict(loss=ANY)] * train_batches, + )), + dict(name='on_train_epoch_end', args=([dict(loss=ANY)] * train_batches, )), + dict(name='Callback.on_epoch_end', args=(trainer, model)), + dict(name='on_epoch_end'), + dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)), + dict(name='on_save_checkpoint', args=(saved_ckpt, )), + dict(name='Callback.on_train_end', args=(trainer, model)), + dict(name='on_train_end'), + dict(name='Callback.on_fit_end', args=(trainer, model)), + dict(name='on_fit_end'), + dict(name='Callback.teardown', args=(trainer, model), kwargs=dict(stage='fit')), + dict(name='teardown', kwargs=dict(stage='fit')), ] - assert model.called == expected - - -def test_trainer_model_hook_system_validate(tmpdir): - model = HookedModel() + assert called == expected + + +@pytest.mark.parametrize('batches', (0, 2)) 
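+# `batches=0` covers the case where the evaluation loop is skipped entirely:
+# only the setup, dataloader, and teardown hooks in `expected` below should run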
+@pytest.mark.parametrize(['verb', 'noun', 'dataloader', 'key'], [ + ('validate', 'validation', 'val', 'x'), + ('test', 'test', 'test', 'y'), +]) +def test_trainer_model_hook_system_eval(tmpdir, batches, verb, noun, dataloader, key): + called = [] + model = HookedModel(called) + callback = HookedCallback(called) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - limit_val_batches=1, + limit_val_batches=batches, + limit_test_batches=batches, progress_bar_refresh_rate=0, weights_summary=None, + callbacks=[callback], ) - assert model.called == [] - trainer.validate(model, verbose=False) + assert called == [ + dict(name='Callback.on_init_start', args=(trainer, )), + dict(name='Callback.on_init_end', args=(trainer, )), + ] + fn = getattr(trainer, verb) + fn(model, verbose=False) + hooks = [ + dict(name='train', args=(False, )), + dict(name=f'on_{noun}_model_eval'), + dict(name='zero_grad'), + dict(name=f'Callback.on_{noun}_start', args=(trainer, model)), + dict(name=f'on_{noun}_start'), + *model._eval_epoch(noun, trainer, model, batches, key), + dict(name=f'Callback.on_{noun}_end', args=(trainer, model)), + dict(name=f'on_{noun}_end'), + dict(name='train'), + dict(name=f'on_{noun}_model_train'), + ] expected = [ - 'setup_validate', - 'on_validation_model_eval', - 'on_validation_start', - 'on_epoch_start', - 'on_validation_epoch_start', - 'on_validation_batch_start', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'on_validation_batch_end', - 'validation_epoch_end', - 'on_validation_epoch_end', - 'on_epoch_end', - 'on_validation_end', - 'on_validation_model_train', - 'teardown_validate', + dict(name='Callback.on_init_start', args=(trainer, )), + dict(name='Callback.on_init_end', args=(trainer, )), + dict(name='prepare_data'), + dict(name='configure_callbacks'), + dict(name='Callback.on_before_accelerator_backend_setup', args=(trainer, model)), + dict(name='Callback.setup', args=(trainer, model), kwargs=dict(stage=verb)), + dict(name='setup', kwargs=dict(stage=verb)), + dict(name='configure_sharded_model'), + dict(name='Callback.on_configure_sharded_model', args=(trainer, model)), + dict(name=f'on_{dataloader}_dataloader'), + dict(name=f'{dataloader}_dataloader'), + *(hooks if batches else []), + dict(name='Callback.teardown', args=(trainer, model), kwargs=dict(stage=verb)), + dict(name='teardown', kwargs=dict(stage=verb)), ] - assert model.called == expected + assert called == expected -def test_trainer_model_hook_system_test(tmpdir): - model = HookedModel() +def test_trainer_model_hook_system_predict(tmpdir): + called = [] + model = HookedModel(called) + callback = HookedCallback(called) + batches = 2 trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1, - limit_test_batches=1, + limit_predict_batches=batches, progress_bar_refresh_rate=0, - weights_summary=None, + callbacks=[callback], ) - assert model.called == [] - trainer.test(model, verbose=False) + assert called == [ + dict(name='Callback.on_init_start', args=(trainer, )), + dict(name='Callback.on_init_end', args=(trainer, )), + ] + trainer.predict(model) expected = [ - 'setup_test', - 'on_test_model_eval', - 'on_test_start', - 'on_epoch_start', - 'on_test_epoch_start', - 'on_test_batch_start', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'on_test_batch_end', - 'on_test_epoch_end', - 'on_epoch_end', - 'on_test_end', - 'on_test_model_train', - 'teardown_test', + dict(name='Callback.on_init_start', args=(trainer, )), + 
dict(name='Callback.on_init_end', args=(trainer, )), + dict(name='prepare_data'), + dict(name='configure_callbacks'), + dict(name='Callback.on_before_accelerator_backend_setup', args=(trainer, model)), + dict(name='Callback.setup', args=(trainer, model), kwargs=dict(stage='predict')), + dict(name='setup', kwargs=dict(stage='predict')), + dict(name='configure_sharded_model'), + dict(name='Callback.on_configure_sharded_model', args=(trainer, model)), + dict(name='on_predict_dataloader'), + dict(name='predict_dataloader'), + dict(name='train', args=(False, )), + dict(name='on_predict_model_eval'), + dict(name='zero_grad'), + dict(name='Callback.on_predict_start', args=(trainer, model)), + dict(name='on_predict_start'), + # TODO: `{,Callback}.on_epoch_{start,end}` + dict(name='Callback.on_predict_epoch_start', args=(trainer, model)), + dict(name='on_predict_epoch_start'), + *model._predict_batch(trainer, model, batches), + # TODO: `predict_epoch_end` + dict(name='Callback.on_predict_epoch_end', args=(trainer, model, [[ANY] * batches])), + dict(name='on_predict_epoch_end', args=([[ANY] * batches], )), + dict(name='Callback.on_predict_end', args=(trainer, model)), + dict(name='on_predict_end'), + # TODO: `on_predict_model_train` + dict(name='Callback.teardown', args=(trainer, model), kwargs=dict(stage='predict')), + dict(name='teardown', kwargs=dict(stage='predict')), ] - assert model.called == expected + assert called == expected + + +# TODO: add test for tune def test_hooks_with_different_argument_names(tmpdir): @@ -644,107 +742,102 @@ def test_trainer_datamodule_hook_system(tmpdir): class HookedDataModule(BoringDataModule): - def __init__(self): + def __init__(self, called): super().__init__() - self.called = [] - def prepare_data(self): - self.called.append("prepare_data") - super().prepare_data() + def call(hook, fn, *args, **kwargs): + out = fn(*args, **kwargs) + d = {'name': hook} + if args: + d['args'] = args + if kwargs: + d['kwargs'] = kwargs + called.append(d) + return out - def setup(self, stage=None): - self.called.append(f"setup_{stage}") - super().setup(stage=stage) - - def teardown(self, stage=None): - self.called.append(f"teardown_{stage}") - super().teardown(stage=stage) - - def train_dataloader(self): - self.called.append("train_dataloader") - return super().train_dataloader() - - def test_dataloader(self): - self.called.append("test_dataloader") - return super().test_dataloader() - - def val_dataloader(self): - self.called.append("val_dataloader") - return super().val_dataloader() - - def predict_dataloader(self): - self.called.append("predict_dataloader") - - def transfer_batch_to_device(self, *args, **kwargs): - self.called.append("transfer_batch_to_device") - return super().transfer_batch_to_device(*args, **kwargs) - - def on_before_batch_transfer(self, *args, **kwargs): - self.called.append("on_before_batch_transfer") - return super().on_before_batch_transfer(*args, **kwargs) - - def on_after_batch_transfer(self, *args, **kwargs): - self.called.append("on_after_batch_transfer") - return super().on_after_batch_transfer(*args, **kwargs) + for h in get_members(LightningDataModule): + attr = getattr(self, h) + setattr(self, h, partial(call, h, attr)) model = BoringModel() - dm = HookedDataModule() - + batches = 2 trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - limit_val_batches=1, - limit_train_batches=2, - limit_test_batches=1, + limit_train_batches=batches, + limit_val_batches=batches, + limit_test_batches=batches, + limit_predict_batches=batches, 
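+        # the same batch count is used for every stage so that each loop below
+        # expects exactly `batch_transfer * batches` transfer-hook calls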
progress_bar_refresh_rate=0, weights_summary=None, reload_dataloaders_every_epoch=True, ) + + called = [] + dm = HookedDataModule(called) trainer.fit(model, datamodule=dm) + batch_transfer = [ + dict(name='on_before_batch_transfer', args=(ANY, None)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), + dict(name='on_after_batch_transfer', args=(ANY, None)), + ] expected = [ - 'prepare_data', - 'setup_fit', - 'val_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'train_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'val_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'teardown_fit', + dict(name='prepare_data'), + dict(name='setup', kwargs=dict(stage='fit')), + dict(name='val_dataloader'), + *batch_transfer * batches, + dict(name='train_dataloader'), + *batch_transfer * batches, + dict(name='val_dataloader'), + *batch_transfer * batches, + dict( + name='on_save_checkpoint', + args=({ + 'callbacks': ANY, + 'epoch': 1, + 'global_step': 2, + 'lr_schedulers': ANY, + 'optimizer_states': ANY, + 'pytorch-lightning_version': __version__, + 'state_dict': ANY, + }, ) + ), + dict(name='teardown', kwargs=dict(stage='fit')), ] - assert dm.called == expected + assert called == expected - dm = HookedDataModule() + called = [] + dm = HookedDataModule(called) trainer.validate(model, datamodule=dm, verbose=False) expected = [ - 'prepare_data', - 'setup_validate', - 'val_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'teardown_validate', + dict(name='prepare_data'), + dict(name='setup', kwargs=dict(stage='validate')), + dict(name='val_dataloader'), + *batch_transfer * batches, + dict(name='teardown', kwargs=dict(stage='validate')), ] - assert dm.called == expected + assert called == expected - dm = HookedDataModule() + called = [] + dm = HookedDataModule(called) trainer.test(model, datamodule=dm, verbose=False) expected = [ - 'prepare_data', - 'setup_test', - 'test_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'teardown_test', + dict(name='prepare_data'), + dict(name='setup', kwargs=dict(stage='test')), + dict(name='test_dataloader'), + *batch_transfer * batches, + dict(name='teardown', kwargs=dict(stage='test')), + ] + assert called == expected + + called = [] + dm = HookedDataModule(called) + trainer.predict(model, datamodule=dm) + expected = [ + dict(name='prepare_data'), + dict(name='setup', kwargs=dict(stage='predict')), + dict(name='predict_dataloader'), + *batch_transfer * batches, + dict(name='teardown', kwargs=dict(stage='predict')), ] - assert dm.called == expected + assert called == expected diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 10f96845a7a48..ab3c3619652e9 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -264,7 +264,7 @@ def test_horovod_multi_optimizer(tmpdir): assert hasattr(optimizer, 'synchronize'), 'optimizer has not been wrapped into DistributedOptimizer' def get_model_params(model): - return set([p for p in model.parameters()]) + return set(list(model.parameters())) def get_optimizer_params(optimizer): return set([p for group in optimizer.param_groups for p in group.get('params', [])]) @@ -296,7 +296,7 @@ def 
training_step(self, batch, batch_idx): self.training_step_called = True tensor = torch.tensor([1.0]) - self.log("test_tensor", tensor, sync_dist=True, sync_dist_op='sum', on_step=True, on_epoch=True) + self.log("test_tensor", tensor, sync_dist=True, reduce_fx='sum', on_step=True, on_epoch=True) res = self._results diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 06477b3572db7..7fa8872036a73 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -15,6 +15,7 @@ import os import pickle from argparse import Namespace +from dataclasses import dataclass import cloudpickle import pytest @@ -719,3 +720,21 @@ def test_empty_hparams_container(tmpdir): assert not model.hparams model = HparamsNamespaceContainerModel(Namespace()) assert not model.hparams + + +@dataclass +class DataClassModel(BoringModel): + + mandatory: int + optional: str = "optional" + ignore_me: bool = False + + def __post_init__(self): + super().__init__() + self.save_hyperparameters(ignore=("ignore_me", )) + + +def test_dataclass_lightning_module(tmpdir): + """ Test that save_hyperparameters() works with a LightningModule as a dataclass. """ + model = DataClassModel(33, optional="cocofruit") + assert model.hparams == dict(mandatory=33, optional="cocofruit") diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 09ae795297eb5..b1b8e73861ef1 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -143,7 +143,7 @@ def test_try_resume_from_non_existing_checkpoint(tmpdir): class CaptureCallbacksBeforeTraining(Callback): callbacks = [] - def on_train_start(self, trainer, pl_module): + def on_pretrain_routine_end(self, trainer, pl_module): self.callbacks = deepcopy(trainer.callbacks) @@ -156,7 +156,11 @@ def test_callbacks_state_resume_from_checkpoint(tmpdir): def get_trainer_args(): checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer_args = dict( - default_root_dir=tmpdir, max_steps=1, logger=False, callbacks=[checkpoint, callback_capture] + default_root_dir=tmpdir, + max_steps=1, + logger=False, + callbacks=[checkpoint, callback_capture], + limit_val_batches=2 ) assert checkpoint.best_model_path == "" assert checkpoint.best_model_score is None @@ -183,7 +187,13 @@ def test_callbacks_references_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint sets references as expected. 
""" dm = ClassifDataModule() model = ClassificationModel() - args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False} + args = { + 'default_root_dir': tmpdir, + 'max_steps': 1, + 'logger': False, + "limit_val_batches": 2, + "num_sanity_val_steps": 0 + } # initial training checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) @@ -431,10 +441,10 @@ class CustomModel(CustomClassificationModelDP): def __init__(self): super().__init__() - self.on_train_start_called = False + self.on_pretrain_routine_end_called = False # set the epoch start hook so we can predict before the model does the full training - def on_train_start(self): + def on_pretrain_routine_end(self): assert self.trainer.current_epoch == real_global_epoch and self.trainer.current_epoch > 0 # if model and state loaded correctly, predictions will be good even though we @@ -443,14 +453,14 @@ def on_train_start(self): dataloader = self.train_dataloader() tpipes.run_prediction_eval_model_template(self.trainer.lightning_module, dataloader=dataloader) - self.on_train_start_called = True + self.on_pretrain_routine_end_called = True # new model model = CustomModel() # fit new model which should load hpc weights new_trainer.fit(model, datamodule=dm) - assert model.on_train_start_called + assert model.on_pretrain_routine_end_called # test freeze on gpu model.freeze() diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index f7d0aea829ced..2e7db175801b9 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -24,8 +24,8 @@ from pytorch_lightning import Trainer from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping -from pytorch_lightning.core.step_result import Result from pytorch_lightning.plugins import TPUSpawnPlugin +from pytorch_lightning.trainer.connectors.logger_connector.result import _Sync from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -424,20 +424,11 @@ def test_if_test_works_with_checkpoint_false(tmpdir): def test_tpu_sync_dist(): """Test tpu spawn sync dist operation """ - def test_sync_dist(rank): - tensor = torch.tensor([1.0]) - training_type_plugin = TPUSpawnPlugin() - - res = Result() - res.log( - "test_tensor", - tensor, - sync_fn=training_type_plugin.reduce, - sync_dist=True, - sync_dist_op=torch.distributed.ReduceOp.SUM - ) - - assert res["test_tensor"].item() == 8, "Result-Log does not work properly with TPU Spawn and Tensors" + def test_sync_dist(_): + sync = _Sync(TPUSpawnPlugin().reduce, should=True, op=torch.distributed.ReduceOp.SUM) + value = torch.tensor([1.0]) + value = sync(value), + assert value.item() == 8 xmp.spawn(test_sync_dist, nprocs=8, start_method='fork') diff --git a/tests/overrides/test_base.py b/tests/overrides/test_base.py new file mode 100644 index 0000000000000..ad0e63fb5f93d --- /dev/null +++ b/tests/overrides/test_base.py @@ -0,0 +1,44 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch +from torch.nn import DataParallel + +from pytorch_lightning.overrides.base import ( + _LightningModuleWrapperBase, + _LightningPrecisionModuleWrapperBase, + unwrap_lightning_module, +) +from tests.helpers import BoringModel + + +@pytest.mark.parametrize("wrapper_class", [ + _LightningModuleWrapperBase, + _LightningPrecisionModuleWrapperBase, +]) +def test_wrapper_device_dtype(wrapper_class): + model = BoringModel() + wrapped_model = wrapper_class(model) + + wrapped_model.to(dtype=torch.float16) + assert model.dtype == torch.float16 + + +def test_unwrap_lightning_module(): + model = BoringModel() + wrapped_model = _LightningPrecisionModuleWrapperBase(model) + wrapped_model = _LightningModuleWrapperBase(wrapped_model) + wrapped_model = DataParallel(wrapped_model) + + assert unwrap_lightning_module(wrapped_model) == model diff --git a/tests/overrides/test_distributed.py b/tests/overrides/test_distributed.py index d09ac9c8bad06..c8d982bd733fe 100644 --- a/tests/overrides/test_distributed.py +++ b/tests/overrides/test_distributed.py @@ -11,11 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from collections.abc import Iterable + import pytest from torch.utils.data import BatchSampler, SequentialSampler from pytorch_lightning import seed_everything from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler +from pytorch_lightning.utilities.data import has_len @pytest.mark.parametrize("shuffle", [False, True]) @@ -29,7 +32,7 @@ def test_unrepeated_distributed_sampler(shuffle, tmpdir): for rank in range(world_size): samplers.append(UnrepeatedDistributedSampler(dataset, rank=rank, num_replicas=world_size, shuffle=shuffle)) - indices = [[v for v in s] for s in samplers] + indices = [list(s) for s in samplers] assert len(indices[0]) == 26 assert len(indices[1]) == 26 assert len(indices[2]) == 26 @@ -54,3 +57,13 @@ def test_index_batch_sampler(tmpdir): for batch in index_batch_sampler: assert index_batch_sampler.batch_indices == batch + + +def test_index_batch_sampler_methods(): + dataset = range(15) + sampler = SequentialSampler(dataset) + batch_sampler = BatchSampler(sampler, 3, False) + index_batch_sampler = IndexBatchSamplerWrapper(batch_sampler) + + assert isinstance(index_batch_sampler, Iterable) + assert has_len(index_batch_sampler) diff --git a/tests/plugins/test_amp_plugins.py b/tests/plugins/test_amp_plugins.py index 6d0dbed2cf88b..cf58427b071ce 100644 --- a/tests/plugins/test_amp_plugins.py +++ b/tests/plugins/test_amp_plugins.py @@ -99,6 +99,47 @@ def test_amp_gradient_unscale(tmpdir, accum: int): trainer.fit(model) +@RunIf(min_gpus=1, amp_native=True) +def test_amp_skip_optimizer(tmpdir): + """ + Test that optimizers can be skipped when using amp + """ + + class CustomBoringModel(BoringModel): + + def __init__(self): + super().__init__() + self.layer1 = torch.nn.Linear(32, 32) + self.layer2 = torch.nn.Linear(32, 2) + + def forward(self, x: torch.Tensor): + x = self.layer1(x) + x = self.layer2(x) + return x + + def training_step(self, batch, batch_idx, optimizer_idx): + if optimizer_idx == 1: + return None + output = self(batch) + return self.loss(batch, output) + + def configure_optimizers(self): + return [ + torch.optim.SGD(self.layer1.parameters(), lr=0.1), + 
torch.optim.SGD(self.layer2.parameters(), lr=0.1), + ] + + trainer = Trainer( + default_root_dir=tmpdir, + gpus=1, + fast_dev_run=1, + amp_backend='native', + precision=16, + ) + model = CustomBoringModel() + trainer.fit(model) + + @RunIf(min_gpus=2, amp_apex=True, special=True) @pytest.mark.parametrize("amp_level", ['O2']) def test_amp_apex_ddp_fit(amp_level, tmpdir): diff --git a/tests/plugins/test_cluster_integration.py b/tests/plugins/test_cluster_integration.py index f9ca8c23d34d9..9f5eba43cf5a0 100644 --- a/tests/plugins/test_cluster_integration.py +++ b/tests/plugins/test_cluster_integration.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.plugins import DDP2Plugin, DDPPlugin, DDPShardedPlugin, DeepSpeedPlugin, RPCSequentialPlugin +from pytorch_lightning.plugins import DDP2Plugin, DDPPlugin, DDPShardedPlugin, DeepSpeedPlugin from pytorch_lightning.plugins.environments import LightningEnvironment, SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning.utilities import rank_zero_only from tests.helpers.runif import RunIf @@ -66,7 +66,6 @@ def environment_combinations(): DDPShardedPlugin, DDP2Plugin, pytest.param(DeepSpeedPlugin, marks=RunIf(deepspeed=True)), - pytest.param(RPCSequentialPlugin, marks=RunIf(fairscale_pipe=True)), ], ) def test_ranks_available_manual_plugin_selection(plugin_cls): diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index d236dc145d96c..61c5d70191db2 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -11,7 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import torch +from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import Trainer from pytorch_lightning.plugins import DDPPlugin @@ -46,3 +49,30 @@ def test_ddp_with_2_gpus(): assert model.device == torch.device("cpu") cuda_memory = torch.cuda.memory_allocated() assert cuda_memory < model.start_cuda_memory + + +class BarrierModel(BoringModel): + + def setup(self, stage=None): + assert not isinstance(self.trainer.accelerator.model, DistributedDataParallel) + self.trainer.accelerator.barrier("barrier before model is wrapped") + + def on_train_start(self): + assert isinstance(self.trainer.accelerator.model, DistributedDataParallel) + self.trainer.accelerator.barrier("barrier after model is wrapped") + + +@RunIf(min_gpus=4, special=True) +@mock.patch("torch.distributed.barrier") +def test_ddp_barrier_non_consecutive_device_ids(barrier_mock, tmpdir): + """ Test correct usage of barriers when device ids do not start at 0 or are not consecutive. 
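+    The barrier is expected to be invoked with ``device_ids`` containing only this process' own GPU index.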
""" + model = BoringModel() + gpus = [1, 3] + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=1, + gpus=gpus, + accelerator="ddp", + ) + trainer.fit(model) + barrier_mock.assert_any_call(device_ids=[gpus[trainer.local_rank]]) diff --git a/tests/plugins/test_ddp_spawn_plugin.py b/tests/plugins/test_ddp_spawn_plugin.py index 8afc30c4692ec..26a7746c41cfe 100644 --- a/tests/plugins/test_ddp_spawn_plugin.py +++ b/tests/plugins/test_ddp_spawn_plugin.py @@ -15,7 +15,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.plugins import DDPSpawnPlugin -from tests.helpers.boring_model import BoringModel +from tests.helpers.boring_model import BoringDataModule, BoringModel from tests.helpers.runif import RunIf @@ -26,6 +26,26 @@ def on_train_start(self) -> None: assert self.device == torch.device("cpu") +class BoringCallbackDDPSpawnModel(BoringModel): + + def __init__(self, name: str, val: float): + super().__init__() + self.name = name + self.val = val + + def validation_step(self, batch, batch_idx): + self.log(self.name, self.val) + return super().validation_step(batch, batch_idx) + + def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: + queue.put("test_val") + return super().add_to_queue(queue) + + def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: + self.test_val = queue.get() + return super().get_from_queue(queue) + + @RunIf(skip_windows=True) def test_ddp_cpu(): """Tests if device is set correctely when training for DDPSpawnPlugin.""" @@ -40,3 +60,22 @@ def test_ddp_cpu(): model = BoringModelDDPCPU() trainer.fit(model) + + +@RunIf(min_gpus=2) +def test_ddp_spawn_extra_parameters(tmpdir): + """Tests if device is set correctely when training for DDPSpawnPlugin.""" + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2, accelerator="ddp_spawn") + + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + assert trainer.training_type_plugin.on_gpu + assert trainer.training_type_plugin.root_device == torch.device("cuda:0") + + val: float = 1.0 + val_name: str = "val_acc" + model = BoringCallbackDDPSpawnModel(val_name, val) + dm = BoringDataModule() + + trainer.fit(model, datamodule=dm) + assert trainer.callback_metrics[val_name] == torch.tensor(val) + assert model.test_val == "test_val" diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 85d069b90288d..efe8da981c9eb 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -24,15 +24,40 @@ class ModelParallelBoringModel(BoringModel): def __init__(self): super().__init__() - self.linear = None + self.layer = None def configure_sharded_model(self) -> None: - self.linear = torch.nn.Linear(32, 2) + self.layer = torch.nn.Linear(32, 2) def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: self.configure_sharded_model() +class ModelParallelBoringModelManualOptim(BoringModel): + + def __init__(self): + super().__init__() + self.layer = None + + def training_step(self, batch, batch_idx): + opt = self.optimizers()[0] + output = self(batch) + loss = self.loss(batch, output) + opt.zero_grad() + self.manual_backward(loss) + opt.step() + + def configure_sharded_model(self) -> None: + self.layer = torch.nn.Linear(32, 2) + + def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + self.configure_sharded_model() + + @property + def automatic_optimization(self) -> bool: + return False + + def test_deepspeed_lightning_module(tmpdir): """ Test to ensure that a model 
wrapped in `LightningDeepSpeedModule` moves types and device correctly. @@ -483,6 +508,24 @@ def configure_optimizers(self): }] +class ManualModelParallelClassificationModel(ModelParallelClassificationModel): + + @property + def automatic_optimization(self) -> bool: + return False + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self.forward(x) + loss = F.cross_entropy(logits, y) + opt = self.optimizers()[0] + self.log('train_loss', loss, prog_bar=True) + self.log('train_acc', self.train_acc(logits, y), prog_bar=True, sync_dist=True) + opt.zero_grad() + self.manual_backward(loss) + opt.step() + + @RunIf(min_gpus=2, deepspeed=True, special=True) def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): """ @@ -502,9 +545,34 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): _assert_save_model_is_equal(model, tmpdir, trainer, cls=ModelParallelBoringModel) -def run_checkpoint_test(tmpdir, save_full_weights): +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config): + """ + Test to ensure ZeRO Stage 3 works with a parallel model. + """ + model = ModelParallelBoringModelManualOptim() + model.training_epoch_end = None + trainer = Trainer( + default_root_dir=tmpdir, + plugins=[DeepSpeedPlugin(stage=3)], + gpus=2, + fast_dev_run=True, + precision=16, + ) + trainer.fit(model) + trainer.test(model) + + _assert_save_model_is_equal(model, tmpdir, trainer, cls=ModelParallelBoringModelManualOptim) + + +def run_checkpoint_test( + tmpdir: str, save_full_weights: bool, automatic_optimization: bool = True, accumulate_grad_batches: int = 2 +): seed_everything(1) - model = ModelParallelClassificationModel() + if automatic_optimization: + model = ModelParallelClassificationModel() + else: + model = ManualModelParallelClassificationModel() dm = ClassifDataModule() ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1) trainer = Trainer( @@ -514,7 +582,7 @@ def run_checkpoint_test(tmpdir, save_full_weights): plugins=[DeepSpeedPlugin(stage=3, save_full_weights=save_full_weights)], gpus=2, precision=16, - accumulate_grad_batches=2, + accumulate_grad_batches=accumulate_grad_batches, callbacks=[ck] ) trainer.fit(model, datamodule=dm) @@ -563,12 +631,28 @@ def test_deepspeed_multigpu_stage_3_checkpointing_full_weights(tmpdir): @RunIf(min_gpus=2, deepspeed=True, special=True) -@pytest.mark.parametrize('cpu_offload', [True, False]) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, cpu_offload): +def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir): + """ + Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, + where we save the full weights to one file. + """ + run_checkpoint_test(tmpdir, save_full_weights=True, automatic_optimization=False, accumulate_grad_batches=1) + + +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): + _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) + + +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): + _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) + + +def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): """ Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. 
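+    Shared implementation for the offloaded and non-offloaded test variants above.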
""" - os.environ['MASTER_PORT'] = "29500" seed_everything(42) class VerificationCallback(Callback): @@ -585,7 +669,7 @@ def on_train_batch_start( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=5, - plugins=[DeepSpeedPlugin(stage=2, cpu_offload=cpu_offload)], + plugins=[DeepSpeedPlugin(stage=2, offload_optimizer=offload_optimizer)], gpus=2, limit_val_batches=2, precision=16, diff --git a/tests/plugins/test_double_plugin.py b/tests/plugins/test_double_plugin.py index 96ff2d182b504..302ee985b2379 100644 --- a/tests/plugins/test_double_plugin.py +++ b/tests/plugins/test_double_plugin.py @@ -11,12 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pickle +from unittest.mock import MagicMock + import pytest import torch from torch.utils.data import DataLoader, Dataset from pytorch_lightning import Trainer +from pytorch_lightning.plugins import DoublePrecisionPlugin +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 from tests.helpers.boring_model import BoringModel, RandomDataset +from tests.helpers.runif import RunIf class RandomFloatIntDataset(Dataset): @@ -118,10 +124,30 @@ def predict_dataloader(self): return DataLoader(RandomDataset(32, 64)) -@pytest.mark.parametrize('boring_model', (DoublePrecisionBoringModel, DoublePrecisionBoringModelNoForward)) +class DoublePrecisionBoringModelComplexBuffer(BoringModel): + + def __init__(self): + super().__init__() + + self.register_buffer("complex_buffer", torch.complex(torch.rand(10), torch.rand(10)), False) + + def on_fit_start(self): + assert self.layer.weight.dtype == torch.float64 + assert self.complex_buffer.dtype == torch.complex64 + + +@pytest.mark.parametrize( + 'boring_model', [ + DoublePrecisionBoringModel, + DoublePrecisionBoringModelNoForward, + pytest.param( + DoublePrecisionBoringModelComplexBuffer, + marks=pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="torch.complex not available") + ), + ] +) def test_double_precision(tmpdir, boring_model): model = boring_model() - original_training_step = model.training_step trainer = Trainer( max_epochs=2, @@ -134,4 +160,25 @@ def test_double_precision(tmpdir, boring_model): trainer.test(model) trainer.predict(model) - assert model.training_step == original_training_step + +@RunIf(min_gpus=2) +def test_double_precision_ddp(tmpdir): + model = DoublePrecisionBoringModel() + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + accelerator='ddp_spawn', + gpus=2, + fast_dev_run=2, + precision=64, + log_every_n_steps=1, + ) + trainer.fit(model) + + +def test_double_precision_pickle(tmpdir): + model = BoringModel() + plugin = DoublePrecisionPlugin() + model, _, __ = plugin.connect(model, MagicMock(), MagicMock()) + pickle.dumps(model) diff --git a/tests/plugins/test_plugins_registry.py b/tests/plugins/test_plugins_registry.py index 8ccba40013517..d2ca1d46c975f 100644 --- a/tests/plugins/test_plugins_registry.py +++ b/tests/plugins/test_plugins_registry.py @@ -14,7 +14,7 @@ import pytest from pytorch_lightning import Trainer -from pytorch_lightning.plugins import DDPPlugin, DeepSpeedPlugin, TrainingTypePluginsRegistry +from pytorch_lightning.plugins import DDPPlugin, DeepSpeedPlugin, TPUSpawnPlugin, TrainingTypePluginsRegistry from tests.helpers.runif import RunIf @@ -54,14 +54,15 @@ def __init__(self, param1, param2): }), ("deepspeed_stage_2_offload", { "stage": 2, - "cpu_offload": True + "offload_optimizer": 
True }), ("deepspeed_stage_3", { "stage": 3 }), ("deepspeed_stage_3_offload", { "stage": 3, - "cpu_offload": True + "offload_parameters": True, + "offload_optimizer": True }), ], ) @@ -93,3 +94,16 @@ def test_ddp_training_type_plugins_registry_with_trainer(tmpdir): ) assert isinstance(trainer.training_type_plugin, DDPPlugin) + + +def test_tpu_spawn_debug_plugins_registry(tmpdir): + + plugin = "tpu_spawn_debug" + + assert plugin in TrainingTypePluginsRegistry + assert TrainingTypePluginsRegistry[plugin]["init_params"] == {"debug": True} + assert TrainingTypePluginsRegistry[plugin]["plugin"] == TPUSpawnPlugin + + trainer = Trainer(plugins=plugin) + + assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin) diff --git a/tests/plugins/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py deleted file mode 100644 index 7abf9fcbd5039..0000000000000 --- a/tests/plugins/test_rpc_plugin.py +++ /dev/null @@ -1,89 +0,0 @@ -import os -from typing import Optional -from unittest import mock - -import pytest - -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.training_type.rpc_sequential import RPCPlugin -from tests.helpers.boring_model import BoringModel -from tests.helpers.runif import RunIf - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_PROCID": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp_spawn", 2, 0)], -) -@RunIf(rpc=True) -def test_rpc_choice(tmpdir, ddp_backend, gpus, num_processes): - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.training_type_plugin, RPCPlugin) - raise RuntimeError('finished plugin check') - - model = BoringModel() - trainer = Trainer( - default_root_dir=str(tmpdir), - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - distributed_backend=ddp_backend, - callbacks=[CB()], - plugins=[RPCPlugin()] - ) - - with pytest.raises(RuntimeError, match='finished plugin check'): - trainer.fit(model) - - -class CustomRPCPlugin(RPCPlugin): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.rpc_save_model_count = 0 - self.worker_optimizer_step_count = 0 - - def rpc_save_model(self, *_) -> None: - self.rpc_save_model_count += 1 - - def barrier(self, name: Optional[str] = None) -> None: - return - - -@RunIf(min_gpus=2, special=True, rpc=True) -def test_rpc_function_calls_ddp(tmpdir): - model = BoringModel() - plugin = CustomRPCPlugin() - max_epochs = 2 - limit_train_batches = 2 - trainer = Trainer( - limit_train_batches=limit_train_batches, - limit_val_batches=2, - max_epochs=max_epochs, - gpus=2, - distributed_backend='ddp', - plugins=[plugin], - default_root_dir=tmpdir, - ) - - trainer.fit(model) - if trainer.global_rank == 0: # Main process - assert plugin.rpc_save_model_count == max_epochs - else: # Worker process - assert plugin.rpc_save_model_count == max_epochs diff --git a/tests/plugins/test_rpc_sequential_plugin.py b/tests/plugins/test_rpc_sequential_plugin.py deleted file mode 100644 index 00a6220036c3e..0000000000000 --- a/tests/plugins/test_rpc_sequential_plugin.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright The PyTorch Lightning team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -from unittest import mock - -import pytest -import torch -import torch.distributed as torch_distrib -from torch import nn - -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.plugins.training_type.rpc_sequential import RPCSequentialPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers.boring_model import RandomDataset -from tests.helpers.runif import RunIf - - -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@RunIf(min_gpus=2, special=True, fairscale_pipe=True) -def test_rpc_sequential_plugin_manual(tmpdir): - model = SequentialModelRPCManual() - trainer = Trainer( - max_epochs=2, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - gpus=2, - distributed_backend="ddp", - plugins=[RPCSequentialPlugin(balance=[2, 1], rpc_timeout_sec=5 * 60)], - ) - - trainer.fit(model) - - if torch_distrib.is_initialized() and torch_distrib.get_rank() == 0: - assert len(trainer.dev_debugger.pbar_added_metrics) > 0 - - if trainer.accelerator.rpc_enabled: - # Called at the end of trainer to ensure all processes are killed - trainer.accelerator.training_type_plugin.exit_rpc_process() - - -@RunIf(min_gpus=2, special=True, fairscale_pipe=True) -def test_rpc_sequential_plugin_manual_amp(tmpdir): - model = SequentialModelRPCManual() - trainer = Trainer( - max_epochs=2, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - gpus=2, - precision=16, - amp_backend="native", - distributed_backend="ddp", - plugins=[RPCSequentialPlugin(balance=[2, 1])], - ) - with pytest.raises( - MisconfigurationException, - match='`RPCSequentialPlugin` is currently not supported in Automatic Mixed Precision' - ): - trainer.fit(model) - - -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@RunIf(min_gpus=2, special=True, fairscale_pipe=True) -def test_rpc_sequential_plugin_automatic(tmpdir): - model = SequentialModelRPCAutomatic() - trainer = Trainer( - max_epochs=2, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - gpus=2, - distributed_backend="ddp", - plugins=[RPCSequentialPlugin(balance=[2, 1])], - ) - - trainer.fit(model) - - if torch_distrib.is_initialized() and torch_distrib.get_rank() == 0: - assert len(trainer.dev_debugger.pbar_added_metrics) > 0 - - if trainer.accelerator.rpc_enabled: - # Called at the end of trainer to ensure all processes are killed - trainer.accelerator.training_type_plugin.exit_rpc_process() - - -@RunIf(min_gpus=2, special=True, fairscale_pipe=True) -def test_rpc_sequential_plugin_with_wrong_balance(tmpdir): - model = SequentialModelRPCAutomatic() - trainer = Trainer( - max_epochs=2, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - gpus=2, - distributed_backend="ddp", - plugins=[RPCSequentialPlugin(balance=[2, 2])], - ) - - with pytest.raises( - MisconfigurationException, match="The provided balance sum: 4 does not match your Sequential length: 3" - ): - trainer.fit(model) - - 
if trainer.accelerator.rpc_enabled: - # Called at the end of trainer to ensure all processes are killed - trainer.accelerator.training_type_plugin.exit_rpc_process() - - -class SequentialModelRPCManual(LightningModule): - - def __init__(self): - super().__init__() - self.sequential_module = nn.Sequential(torch.nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2)) - self.automatic_optimization = False - - def forward(self, x): - return self.sequential_module(x) - - def loss(self, prediction): - # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls - return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - - def step(self, x): - x = self(x) - out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) - return out - - def training_step(self, batch, batch_idx): - opt = self.optimizers() - output = self.sequential_module(batch) - loss = self.loss(output) - self.log("train_loss", loss, on_epoch=True, prog_bar=True) - self.manual_backward(loss, opt) - assert torch.stack([torch.abs(p.grad).sum() for p in self.parameters()]).sum() > 0 - opt.step() - opt.zero_grad() - assert torch.stack([torch.abs(p.grad).sum() for p in self.parameters()]).sum() == 0 - - def validation_step(self, batch, batch_idx): - output = self.sequential_module(batch) - loss = self.loss(output) - return loss - - def test_step(self, batch, batch_idx): - output = self.sequential_module(batch) - return self.loss(batch, output) - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.parameters(), lr=0.1) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) - return [optimizer], [lr_scheduler] - - def train_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - def val_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - def test_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - -class SequentialModelRPCAutomatic(SequentialModelRPCManual): - - def __init__(self): - super().__init__() - self.automatic_optimization = True - - def training_step(self, batch, batch_idx): - output = self.sequential_module(batch) - loss = self.loss(output) - self.log("train_loss", loss, on_epoch=True, prog_bar=True) - return loss diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 7ab49e6826d58..543c3c8ae3382 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -297,12 +297,24 @@ def training_step(self, batch, batch_idx): @RunIf(skip_windows=True, special=True, fairscale=True, min_gpus=2) -@pytest.mark.parametrize("accelerator", ["ddp_sharded", "ddp_sharded_spawn"]) -def test_ddp_sharded_plugin_manual_optimization(tmpdir, accelerator): +def test_ddp_sharded_plugin_manual_optimization_spawn(tmpdir): + # todo (sean): this test has been split out as running both tests using parametrize causes "Address in use" model = ManualBoringModel() trainer = Trainer( default_root_dir=tmpdir, - accelerator=accelerator, + accelerator='ddp_sharded_spawn', + fast_dev_run=2, + gpus=2, + ) + trainer.fit(model) + + +@RunIf(skip_windows=True, special=True, fairscale=True, min_gpus=2) +def test_ddp_sharded_plugin_manual_optimization(tmpdir): + model = ManualBoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + accelerator='ddp_sharded', fast_dev_run=2, gpus=2, ) diff --git a/tests/plugins/test_single_device_plugin.py b/tests/plugins/test_single_device_plugin.py index a398d960daf91..2e4834233537e 100644 --- 
a/tests/plugins/test_single_device_plugin.py +++ b/tests/plugins/test_single_device_plugin.py @@ -38,7 +38,7 @@ def on_train_start(self) -> None: @RunIf(skip_windows=True, min_gpus=1) def test_single_gpu(): - """Tests if device is set correctely when training and after teardown for single GPU plugin.""" + """Tests if device is set correctly when training and after teardown for single GPU plugin.""" trainer = Trainer(gpus=1, fast_dev_run=True) # assert training type plugin attributes for device setting assert isinstance(trainer.training_type_plugin, SingleDevicePlugin) diff --git a/tests/plugins/test_tpu_spawn.py b/tests/plugins/test_tpu_spawn.py index 85e1ecb781946..54c65c336fdd3 100644 --- a/tests/plugins/test_tpu_spawn.py +++ b/tests/plugins/test_tpu_spawn.py @@ -49,7 +49,7 @@ def predict_dataloader(self): @pytest.mark.parametrize( - "train_dataloader, val_dataloaders, test_dataloaders, predict_dataloaders", + "train_dataloaders, val_dataloaders, test_dataloaders, predict_dataloaders", [ (_loader_no_len, None, None, None), (None, _loader_no_len, None, None), @@ -60,14 +60,14 @@ def predict_dataloader(self): ) @mock.patch("pytorch_lightning.plugins.training_type.tpu_spawn.xm") def test_error_patched_iterable_dataloaders( - _, tmpdir, train_dataloader, val_dataloaders, test_dataloaders, predict_dataloaders + _, tmpdir, train_dataloaders, val_dataloaders, test_dataloaders, predict_dataloaders ): model = BoringModelNoDataloaders() connector = DataConnector(MagicMock()) connector.attach_dataloaders( model, - train_dataloader=train_dataloader, + train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders, test_dataloaders=test_dataloaders, predict_dataloaders=predict_dataloaders, diff --git a/tests/profiler/__init__.py b/tests/profiler/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/test_profiler.py b/tests/profiler/test_profiler.py similarity index 99% rename from tests/test_profiler.py rename to tests/profiler/test_profiler.py index acc2bac1c466f..d940d4426b4a6 100644 --- a/tests/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -331,8 +331,8 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): files = [file for file in files if file.endswith('.json')] assert len(files) == 2, files local_rank = trainer.local_rank - assert any(f'training_step_{local_rank}' in f for f in files) - assert any(f'validation_step_{local_rank}' in f for f in files) + assert any(f'{local_rank}-training_step_and_backward' in f for f in files) + assert any(f'{local_rank}-validation_step' in f for f in files) def test_pytorch_profiler_trainer_test(tmpdir): diff --git a/tests/profiler/test_xla_profiler.py b/tests/profiler/test_xla_profiler.py new file mode 100644 index 0000000000000..35279ddee8deb --- /dev/null +++ b/tests/profiler/test_xla_profiler.py @@ -0,0 +1,72 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
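+# `xp` and `xu` come from `torch_xla`, which can only be imported on a TPU host,
+# hence the guarded import under `_TPU_AVAILABLE` below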
+import os
+from multiprocessing import Event, Process
+
+import pytest
+
+from pytorch_lightning import Trainer
+from pytorch_lightning.profiler import XLAProfiler
+from pytorch_lightning.utilities import _TPU_AVAILABLE
+from tests.helpers import BoringModel
+from tests.helpers.runif import RunIf
+
+if _TPU_AVAILABLE:
+    import torch_xla.debug.profiler as xp
+    import torch_xla.utils.utils as xu
+
+
+@RunIf(tpu=True)
+def test_xla_profiler_instance(tmpdir):
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        profiler="xla",
+        tpu_cores=8,
+    )
+
+    assert isinstance(trainer.profiler, XLAProfiler)
+    trainer.fit(model)
+    assert trainer.state.finished, f"Training failed with {trainer.state}"
+
+
+@pytest.mark.skipif(True, reason="XLA Profiler doesn't support Prog. capture yet")
+def test_xla_profiler_prog_capture(tmpdir):
+
+    port = xu.get_free_tcp_ports()[0]
+    training_started = Event()
+
+    def train_worker():
+        model = BoringModel()
+        trainer = Trainer(
+            default_root_dir=tmpdir,
+            max_epochs=4,
+            profiler="xla",
+            tpu_cores=8,
+        )
+
+        trainer.fit(model)
+
+    p = Process(target=train_worker, daemon=True)
+    p.start()
+    training_started.wait(120)
+
+    logdir = str(tmpdir)
+    xp.trace(f'localhost:{port}', logdir, duration_ms=2000, num_tracing_attempts=5, delay_ms=1000)
+
+    p.terminate()
+
+    assert os.path.isfile(os.path.join(logdir, 'plugins', 'profile', '*', '*.xplane.pb'))
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index cf81700291b8d..95311fb2df515 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -17,7 +17,7 @@ set -e
 # this environment variable allows special tests to run
 export PL_RUNNING_SPECIAL_TESTS=1
 # python arguments
-defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no'
+defaults='-m coverage run --source pytorch_lightning --append -m pytest --durations=0 --capture=no --disable-warnings'
 # find tests marked as `@RunIf(special=True)`
 grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True')
@@ -68,7 +68,15 @@ for i in "${!files_arr[@]}"; do
     done < <(echo "$test_code")
 done
-nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
+if nvcc --version; then
+    nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/profiler/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
+fi
+
+# needs to run outside of `pytest`
+python tests/utilities/test_warnings.py
+if [ $? -eq 0 ]; then
+    report+="Ran\ttests/utilities/test_warnings.py\n"
+fi
 # echo test report
 printf '=%.s' {1..80}
diff --git a/tests/trainer/connectors/test_callback_connector.py b/tests/trainer/connectors/test_callback_connector.py
index 34149e2231bf5..501482d77a240 100644
--- a/tests/trainer/connectors/test_callback_connector.py
+++ b/tests/trainer/connectors/test_callback_connector.py
@@ -1,3 +1,16 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import logging
 from unittest.mock import Mock
diff --git a/tests/trainer/connectors/test_checkpoint_connector.py b/tests/trainer/connectors/test_checkpoint_connector.py
new file mode 100644
index 0000000000000..6e152f5944b59
--- /dev/null
+++ b/tests/trainer/connectors/test_checkpoint_connector.py
@@ -0,0 +1,155 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from unittest.mock import Mock
+
+import torch
+
+from pytorch_lightning import Trainer
+from tests.helpers import BoringModel
+
+
+class HPCHookedModel(BoringModel):
+
+    def __init__(self):
+        super().__init__()
+        self.hpc_save_called = 0
+        self.hpc_load_called = 0
+
+    def on_hpc_save(self, checkpoint):
+        assert "state_dict" in checkpoint
+        self.hpc_save_called += 1
+
+    def on_hpc_load(self, checkpoint):
+        assert "state_dict" in checkpoint
+        self.hpc_load_called += 1
+
+
+def test_hpc_hook_calls(tmpdir):
+    model = HPCHookedModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_steps=1,
+        checkpoint_callback=False,
+        logger=False,
+    )
+    trainer.fit(model)
+    connector = trainer.checkpoint_connector
+    connector.hpc_save(tmpdir, logger=Mock())
+    assert model.hpc_save_called == 1
+    assert model.hpc_load_called == 0
+
+    # new training run, restore from hpc checkpoint file automatically
+    assert set(os.listdir(tmpdir)) == {"hpc_ckpt_1.ckpt"}
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_steps=1,
+        checkpoint_callback=False,
+        logger=False,
+    )
+    trainer.fit(model)
+    assert model.hpc_save_called == 1
+    assert model.hpc_load_called == 1
+
+
+def test_preloaded_checkpoint_lifecycle(tmpdir):
+    """ Tests that the preloaded checkpoint contents get cleared from memory once they are no longer required. """
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_steps=1,
+    )
+    trainer.fit(model)
+
+    connector = trainer.checkpoint_connector
+
+    assert not trainer.resume_from_checkpoint
+    assert not connector.resume_checkpoint_path
+    assert not connector._loaded_checkpoint
+
+    connector.resume_start()
+    assert not connector.resume_checkpoint_path
+    assert not connector._loaded_checkpoint
+    connector.resume_end()
+    assert not connector.resume_checkpoint_path
+    assert not connector._loaded_checkpoint
+
+    ckpt_path = trainer.checkpoint_callback.best_model_path
+    trainer = Trainer(default_root_dir=tmpdir, max_steps=2, resume_from_checkpoint=ckpt_path)
+    connector = trainer.checkpoint_connector
+    connector.resume_start()
+    assert connector.resume_checkpoint_path == ckpt_path
+    assert connector._loaded_checkpoint
+    assert isinstance(connector._loaded_checkpoint, dict)
+    connector.resume_end()
+    assert not connector.resume_checkpoint_path
+    assert not connector._loaded_checkpoint
+
+
+def test_hpc_restore_attempt(tmpdir):
+    """ Test that restore() attempts to restore the hpc_ckpt with highest priority. 
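+    An ``hpc_ckpt_*.ckpt`` file in the root dir takes precedence even over an explicit ``resume_from_checkpoint`` path.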
""" + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=1, + checkpoint_callback=False, + logger=False, + ) + trainer.fit(model) + + hpc_ckpt_path = tmpdir / "hpc_ckpt_3.ckpt" + trainer.save_checkpoint(hpc_ckpt_path) + assert os.listdir(tmpdir) == ["hpc_ckpt_3.ckpt"] + + # set weights to zero + for param in model.parameters(): + torch.nn.init.constant_(param, 0) + + # case 1: restore hpc first, no explicit resume path provided + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=2, + checkpoint_callback=False, + logger=False, + ) + trainer.fit(model) + + for param in model.parameters(): + assert param.abs().sum() > 0 + torch.nn.init.constant_(param, 0) + + # case 2: explicit resume path provided, restore hpc anyway + trainer = Trainer(default_root_dir=tmpdir, max_steps=3, resume_from_checkpoint="not existing") + trainer.fit(model) + + for param in model.parameters(): + assert param.abs().sum() > 0 + + +def test_hpc_max_ckpt_version(tmpdir): + """ Test that the CheckpointConnector is able to find the hpc checkpoint file with the highest version. """ + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=1, + ) + trainer.fit(model) + trainer.save_checkpoint(tmpdir / "hpc_ckpt.ckpt") + trainer.save_checkpoint(tmpdir / "hpc_ckpt_0.ckpt") + trainer.save_checkpoint(tmpdir / "hpc_ckpt_3.ckpt") + trainer.save_checkpoint(tmpdir / "hpc_ckpt_33.ckpt") + + assert trainer.checkpoint_connector.hpc_resume_path == str(tmpdir / "hpc_ckpt_33.ckpt") + assert trainer.checkpoint_connector.max_ckpt_version_in_folder(tmpdir) == 33 + assert trainer.checkpoint_connector.max_ckpt_version_in_folder(tmpdir / "not" / "existing") is None diff --git a/tests/trainer/flags/test_fast_dev_run.py b/tests/trainer/flags/test_fast_dev_run.py index 1dffefb092716..8320134058c4e 100644 --- a/tests/trainer/flags/test_fast_dev_run.py +++ b/tests/trainer/flags/test_fast_dev_run.py @@ -95,7 +95,6 @@ def _make_fast_dev_run_assertions(trainer, model): # there should be no logger with fast_dev_run assert isinstance(trainer.logger, DummyLogger) - assert len(trainer.dev_debugger.logged_metrics) == fast_dev_run # checkpoint callback should not have been called with fast_dev_run assert trainer.checkpoint_callback == checkpoint_callback diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py index 5832f387cc63d..4094fd90021af 100644 --- a/tests/trainer/logging_/test_distributed_logging.py +++ b/tests/trainer/logging_/test_distributed_logging.py @@ -24,7 +24,7 @@ class TestModel(BoringModel): def on_pretrain_routine_end(self) -> None: with mock.patch('pytorch_lightning.loggers.base.LightningLoggerBase.agg_and_log_metrics') as m: - self.trainer.logger_connector.log_metrics({'a': 2}, {}) + self.trainer.logger_connector.log_metrics({'a': 2}) logged_times = m.call_count expected = int(self.trainer.is_global_zero) msg = f'actual logger called from non-global zero, logged_times: {logged_times}, expected: {expected}' diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index 331734aa9b412..5a4e335e0c7c4 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" -Tests to ensure that the training loop works with a dict (1.0) +Test logging in the evaluation loop """ import collections import itertools -import os from unittest import mock from unittest.mock import call @@ -24,12 +23,9 @@ import pytest import torch -from pytorch_lightning import callbacks, seed_everything, Trainer -from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning import callbacks, Trainer from pytorch_lightning.loggers import TensorBoardLogger from tests.helpers import BoringModel, RandomDataset -from tests.helpers.deterministic_model import DeterministicModel def test__validation_step__log(tmpdir): @@ -37,25 +33,18 @@ def test__validation_step__log(tmpdir): Tests that validation_step can log """ - class TestModel(DeterministicModel): + class TestModel(BoringModel): def training_step(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - acc = acc + batch_idx - self.log('a', acc, on_step=True, on_epoch=True) + out = super().training_step(batch, batch_idx) + self.log('a', out['loss'], on_step=True, on_epoch=True) self.log('a2', 2) - - self.training_step_called = True - return acc + return out def validation_step(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - acc = acc + batch_idx - self.log('b', acc, on_step=True, on_epoch=True) - self.training_step_called = True - - def backward(self, loss, optimizer, optimizer_idx): - return LightningModule.backward(self, loss, optimizer, optimizer_idx) + out = super().validation_step(batch, batch_idx) + self.log('b', out['x'], on_step=True, on_epoch=True) + return out model = TestModel() model.validation_step_end = None @@ -71,8 +60,7 @@ def backward(self, loss, optimizer, optimizer_idx): ) trainer.fit(model) - # make sure all the metrics are available for callbacks - expected_logged_metrics = { + assert set(trainer.logged_metrics) == { 'a2', 'a_step', 'a_epoch', @@ -80,49 +68,33 @@ def backward(self, loss, optimizer, optimizer_idx): 'b_epoch', 'epoch', } - logged_metrics = set(trainer.logged_metrics.keys()) - assert expected_logged_metrics == logged_metrics # we don't want to enable val metrics during steps because it is not something that users should do # on purpose DO NOT allow b_step... 
it's silly to monitor val step metrics - callback_metrics = set(trainer.callback_metrics.keys()) - expected_cb_metrics = {'a', 'a2', 'b', 'a_epoch', 'b_epoch', 'a_step'} - assert expected_cb_metrics == callback_metrics + assert set(trainer.callback_metrics) == {'a', 'a2', 'b', 'a_epoch', 'b_epoch', 'a_step'} -def test__validation_step__step_end__epoch_end__log(tmpdir): +def test__validation_step__epoch_end__log(tmpdir): """ - Tests that validation_step can log + Tests that validation_epoch_end can log """ - class TestModel(DeterministicModel): + class TestModel(BoringModel): def training_step(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - acc = acc + batch_idx - self.log('a', acc) - self.log('b', acc, on_step=True, on_epoch=True) - self.training_step_called = True - return acc + out = super().training_step(batch, batch_idx) + self.log('a', out['loss']) + self.log('b', out['loss'], on_step=True, on_epoch=True) + return out def validation_step(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - acc = acc + batch_idx - self.log('c', acc) - self.log('d', acc, on_step=True, on_epoch=True) - self.validation_step_called = True - return acc - - def validation_step_end(self, acc): - self.validation_step_end_called = True - return ['random_thing'] + out = super().validation_step(batch, batch_idx) + self.log('c', out['x']) + self.log('d', out['x'], on_step=True, on_epoch=True) + return out def validation_epoch_end(self, outputs): self.log('g', torch.tensor(2, device=self.device), on_epoch=True) - self.validation_epoch_end_called = True - - def backward(self, loss, optimizer, optimizer_idx): - return LightningModule.backward(self, loss, optimizer, optimizer_idx) model = TestModel() @@ -136,9 +108,8 @@ def backward(self, loss, optimizer, optimizer_idx): ) trainer.fit(model) - # make sure all the metrics are available for callbacks - logged_metrics = set(trainer.logged_metrics.keys()) - expected_logged_metrics = { + # make sure all the metrics are available for loggers + assert set(trainer.logged_metrics) == { 'epoch', 'a', 'b_step', @@ -148,24 +119,15 @@ def backward(self, loss, optimizer, optimizer_idx): 'd_epoch', 'g', } - assert expected_logged_metrics == logged_metrics - progress_bar_metrics = set(trainer.progress_bar_metrics.keys()) - expected_pbar_metrics = set() - assert expected_pbar_metrics == progress_bar_metrics + assert not trainer.progress_bar_metrics # we don't want to enable val metrics during steps because it is not something that users should do - callback_metrics = set(trainer.callback_metrics.keys()) - expected_cb_metrics = {'a', 'b', 'b_epoch', 'c', 'd', 'd_epoch', 'g', 'b_step'} - assert expected_cb_metrics == callback_metrics + assert set(trainer.callback_metrics) == {'a', 'b', 'b_epoch', 'c', 'd', 'd_epoch', 'g', 'b_step'} -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @pytest.mark.parametrize(['batches', 'log_interval', 'max_epochs'], [(1, 1, 1), (64, 32, 2)]) def test_eval_epoch_logging(tmpdir, batches, log_interval, max_epochs): - """ - Tests that only training_step can be used - """ class TestModel(BoringModel): @@ -185,35 +147,23 @@ def validation_epoch_end(self, outputs): ) trainer.fit(model) - # make sure all the metrics are available for callbacks - logged_metrics = set(trainer.logged_metrics.keys()) - expected_logged_metrics = { + # assert the loggers received the expected number + logged_metrics = set(trainer.logged_metrics) + assert logged_metrics == { 'c', 'd/e/f', 'epoch', } - assert logged_metrics == expected_logged_metrics - 
pbar_metrics = set(trainer.progress_bar_metrics.keys()) - expected_pbar_metrics = {'c'} - assert pbar_metrics == expected_pbar_metrics + pbar_metrics = set(trainer.progress_bar_metrics) + assert pbar_metrics == {'c'} - callback_metrics = set(trainer.callback_metrics.keys()) - callback_metrics.remove('debug_epoch') - expected_callback_metrics = set() - expected_callback_metrics = expected_callback_metrics.union(logged_metrics) - expected_callback_metrics = expected_callback_metrics.union(pbar_metrics) - expected_callback_metrics.remove('epoch') - assert callback_metrics == expected_callback_metrics - - # assert the loggers received the expected number - assert len(trainer.dev_debugger.logged_metrics) == max_epochs + # make sure all the metrics are available for callbacks + callback_metrics = set(trainer.callback_metrics) + assert callback_metrics == (logged_metrics | pbar_metrics) - {'epoch'} def test_eval_float_logging(tmpdir): - """ - Tests that only training_step can be used - """ class TestModel(BoringModel): @@ -235,45 +185,28 @@ def validation_step(self, batch, batch_idx): ) trainer.fit(model) - # make sure all the metrics are available for callbacks - logged_metrics = set(trainer.logged_metrics.keys()) - expected_logged_metrics = { - 'a', - 'epoch', - } - assert logged_metrics == expected_logged_metrics + assert set(trainer.logged_metrics) == {'a', 'epoch'} -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_eval_logging_auto_reduce(tmpdir): - """ - Tests that only training_step can be used - """ - seed_everything(1234) class TestModel(BoringModel): - - def on_pretrain_routine_end(self) -> None: - self.seen_vals = [] - self.manual_epoch_end_mean = None - - def on_validation_epoch_start(self) -> None: - self.seen_vals = [] + val_losses = [] + manual_epoch_end_mean = None def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) - self.seen_vals.append(loss) + self.val_losses.append(loss) self.log('val_loss', loss, on_epoch=True, on_step=True, prog_bar=True) return {"x": loss} def validation_epoch_end(self, outputs) -> None: - for passed_in, manually_tracked in zip(outputs, self.seen_vals): + for passed_in, manually_tracked in zip(outputs, self.val_losses): assert passed_in['x'] == manually_tracked self.manual_epoch_end_mean = torch.stack([x['x'] for x in outputs]).mean() model = TestModel() - trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=3, @@ -281,93 +214,63 @@ def validation_epoch_end(self, outputs) -> None: max_epochs=1, log_every_n_steps=1, weights_summary=None, - callbacks=[ModelCheckpoint(dirpath=tmpdir)], + num_sanity_val_steps=0, ) trainer.fit(model) # make sure all the metrics are available for callbacks - manual_mean = model.manual_epoch_end_mean - callback_metrics = set(trainer.callback_metrics.keys()) - assert callback_metrics == {'debug_epoch', 'val_loss', 'val_loss_epoch'} + assert set(trainer.callback_metrics) == {'val_loss', 'val_loss_epoch'} # make sure values are correct - assert trainer.logged_metrics['val_loss_epoch'] == manual_mean - assert trainer.callback_metrics['val_loss'] == trainer.logged_metrics['val_loss_step'] - - # make sure correct values were logged - logged_val = trainer.dev_debugger.logged_metrics - - # 3 val batches - assert logged_val[0]['val_loss_step'] == model.seen_vals[0] - assert logged_val[1]['val_loss_step'] == model.seen_vals[1] - assert logged_val[2]['val_loss_step'] == model.seen_vals[2] - - # epoch mean - assert logged_val[3]['val_loss_epoch'] == 
model.manual_epoch_end_mean
-
-    # only those logged
-    assert len(logged_val) == 4
+    assert trainer.logged_metrics['val_loss_epoch'] == model.manual_epoch_end_mean
+    assert trainer.callback_metrics['val_loss_epoch'] == model.manual_epoch_end_mean
+    assert trainer.callback_metrics['val_loss'] == model.manual_epoch_end_mean
+    assert trainer.logged_metrics["val_loss_step"] == model.val_losses[-1]
 
 
 @pytest.mark.parametrize(['batches', 'log_interval', 'max_epochs'], [(1, 1, 1), (64, 32, 2)])
 def test_eval_epoch_only_logging(tmpdir, batches, log_interval, max_epochs):
     """
-    Tests that only test_epoch_end can be used to log, and we return them in the results.
+    Tests that test_epoch_end can be used to log, and that the logged values are returned in the results.
     """
 
     class TestModel(BoringModel):
 
         def test_epoch_end(self, outputs):
-            self.log('c', torch.tensor(2), on_epoch=True, prog_bar=True, logger=True)
+            self.log('c', torch.tensor(2))
             self.log('d/e/f', 2)
 
     model = TestModel()
-
     trainer = Trainer(
         default_root_dir=tmpdir,
-        limit_train_batches=batches,
-        limit_val_batches=batches,
         max_epochs=max_epochs,
+        limit_test_batches=batches,
         log_every_n_steps=log_interval,
         weights_summary=None,
     )
-    trainer.fit(model)
     results = trainer.test(model)
 
-    expected_result_metrics = {
-        'c': torch.tensor(2),
-        'd/e/f': 2,
-    }
-    for result in results:
-        assert result == expected_result_metrics
+    assert len(results) == 1
+    assert results[0] == {'c': torch.tensor(2), 'd/e/f': 2}
 
 
-def test_monitor_val_epoch_end(tmpdir):
-    epoch_min_loss_override = 0
-    model = BoringModel()
-    checkpoint_callback = callbacks.ModelCheckpoint(dirpath=tmpdir, save_top_k=1, monitor="avg_val_loss")
-    trainer = Trainer(
-        max_epochs=epoch_min_loss_override + 2,
-        logger=False,
-        callbacks=[checkpoint_callback],
-    )
-    trainer.fit(model)
+@pytest.mark.parametrize('suffix', (False, True))
+def test_multi_dataloaders_add_suffix_properly(tmpdir, suffix):
 
-def test_multi_dataloaders_add_suffix_properly(tmpdir):
     class TestModel(BoringModel):
 
-        def test_step(self, batch, *args):
-            output = self.layer(batch)
-            loss = self.loss(batch, output)
-            self.log("test_loss", loss, on_step=True, on_epoch=True)
+        def test_step(self, batch, batch_idx, dataloader_idx=0):
+            out = super().test_step(batch, batch_idx)
+            self.log("test_loss", out['y'], on_step=True, on_epoch=True)
+            return out
 
         def test_dataloader(self):
-            return [
-                torch.utils.data.DataLoader(RandomDataset(32, 64)),
-                torch.utils.data.DataLoader(RandomDataset(32, 64))
-            ]
+            if suffix:
+                return [
+                    torch.utils.data.DataLoader(RandomDataset(32, 64)),
+                    torch.utils.data.DataLoader(RandomDataset(32, 64))
+                ]
+            return super().test_dataloader()
 
     model = TestModel()
     model.test_epoch_end = None
@@ -383,38 +286,13 @@ def test_dataloader(self):
     )
     results = trainer.test(model)
 
-    assert {"test_loss/dataloader_idx_0", "test_loss_epoch/dataloader_idx_0"} == set(results[0])
-    assert {"test_loss/dataloader_idx_1", "test_loss_epoch/dataloader_idx_1"} == set(results[1])
-
+    for i, r in enumerate(results):
+        expected = {'test_loss', 'test_loss_epoch'}
+        if suffix:
+            expected = {e + f'/dataloader_idx_{i}' for e in expected}
+        assert set(r) == expected
 
-def test_single_dataloader_no_suffix_added(tmpdir):
-
-    class TestModel(BoringModel):
-
-        def test_step(self, batch, *args):
-            output = self.layer(batch)
-            loss = self.loss(batch, output)
-            self.log("test_loss", loss, on_step=True, on_epoch=True)
-
-    model = TestModel()
-    model.test_epoch_end = None
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        limit_train_batches=0,
-        limit_val_batches=0,
- limit_test_batches=5, - max_epochs=1, - log_every_n_steps=1, - weights_summary=None, - ) - results = trainer.test(model) - - assert len(results) == 1 - assert {"test_loss", "test_loss_epoch"} == set(results[0]) - - -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_log_works_in_val_callback(tmpdir): """ Tests that log can be called within callback @@ -422,200 +300,105 @@ def test_log_works_in_val_callback(tmpdir): class TestCallback(callbacks.Callback): - # helpers - count = 1 + count = 0 choices = [False, True] - # used to compute expected values - callback_funcs_called = collections.defaultdict(list) - funcs_called_count = collections.defaultdict(int) - funcs_attr = {} - def make_logging(self, pl_module, func_name, func_idx, on_steps=[], on_epochs=[], prob_bars=[]): - self.funcs_called_count[func_name] += 1 - product = [on_steps, on_epochs, prob_bars] - for idx, (on_step, on_epoch, prog_bar) in enumerate(list(itertools.product(*product))): - # run logging - custom_func_name = f"{func_idx}_{idx}_{func_name}" - pl_module.log( - custom_func_name, self.count * func_idx, on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar - ) - # catch information for verification - self.callback_funcs_called[func_name].append([self.count * func_idx]) - self.funcs_attr[custom_func_name] = { - "on_step": on_step, - "on_epoch": on_epoch, - "prog_bar": prog_bar, - "forked": on_step and on_epoch, - "func_name": func_name - } + # used to compute expected values + logged_values = collections.defaultdict(list) + call_counter = collections.Counter() + logged_arguments = {} - if on_step and on_epoch: - self.funcs_attr[f"{custom_func_name}_step"] = { - "on_step": True, - "on_epoch": False, - "prog_bar": prog_bar, - "forked": False, - "func_name": func_name - } + def make_logging(self, pl_module, func_name, on_steps, on_epochs, prob_bars): + self.call_counter.update([func_name]) - self.funcs_attr[f"{custom_func_name}_epoch"] = { - "on_step": False, - "on_epoch": True, - "prog_bar": prog_bar, - "forked": False, - "func_name": func_name - } + for idx, (on_step, on_epoch, prog_bar) in enumerate(itertools.product(on_steps, on_epochs, prob_bars)): + fx = f"{func_name}_{idx}" + pl_module.log(fx, self.count, on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar) + self.logged_values[fx].append(self.count) + self.logged_arguments[fx] = {"on_step": on_step, "on_epoch": on_epoch, "prog_bar": prog_bar} + self.count += 1 - def on_validation_start(self, trainer, pl_module): + def on_validation_start(self, _, pl_module): self.make_logging( - pl_module, - 'on_validation_start', - 1, - on_steps=self.choices, - on_epochs=self.choices, - prob_bars=self.choices + pl_module, 'on_validation_start', on_steps=[False], on_epochs=[True], prob_bars=self.choices ) def on_epoch_start(self, trainer, pl_module): if trainer.validating: self.make_logging( - pl_module, - 'on_epoch_start', - 2, - on_steps=self.choices, - on_epochs=self.choices, - prob_bars=self.choices + pl_module, 'on_epoch_start', on_steps=[False], on_epochs=[True], prob_bars=self.choices ) - def on_validation_epoch_start(self, trainer, pl_module): + def on_validation_epoch_start(self, _, pl_module): self.make_logging( - pl_module, - 'on_validation_epoch_start', - 3, - on_steps=self.choices, - on_epochs=self.choices, - prob_bars=self.choices - ) - - def on_batch_end(self, trainer, pl_module): - self.make_logging( - pl_module, 'on_batch_end', 6, on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices + pl_module, 'on_validation_epoch_start', 
on_steps=[False], on_epochs=[True], prob_bars=self.choices ) - def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): + def on_validation_batch_end(self, _, pl_module, *__): self.make_logging( pl_module, 'on_validation_batch_end', - 7, on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices ) - # used to make sure aggregation works fine. - # we should obtain func[value * c for c in range(1, max_epochs * limit_validation_batches)]) - # with func = np.mean if on_epoch else func = np.max - self.count += 1 def on_epoch_end(self, trainer, pl_module): if trainer.validating: - self.make_logging( - pl_module, 'on_epoch_end', 8, on_steps=[False], on_epochs=self.choices, prob_bars=self.choices - ) + self.make_logging(pl_module, 'on_epoch_end', on_steps=[False], on_epochs=[True], prob_bars=self.choices) - def on_validation_epoch_end(self, trainer, pl_module): + def on_validation_epoch_end(self, _, pl_module): self.make_logging( - pl_module, - 'on_validation_epoch_end', - 9, - on_steps=[False], - on_epochs=self.choices, - prob_bars=self.choices + pl_module, 'on_validation_epoch_end', on_steps=[False], on_epochs=[True], prob_bars=self.choices ) class TestModel(BoringModel): def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) + loss = super().validation_step(batch, batch_idx)['x'] self.log('val_loss', loss) - max_epochs = 1 model = TestModel() model.validation_epoch_end = None - test_callback = TestCallback() - + cb = TestCallback() trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=1, limit_val_batches=4, - limit_test_batches=0, - val_check_interval=0., num_sanity_val_steps=0, - max_epochs=max_epochs, - callbacks=[test_callback], + max_epochs=1, + callbacks=[cb], ) trainer.fit(model) - assert test_callback.funcs_called_count["on_epoch_start"] == 1 - # assert test_callback.funcs_called_count["on_batch_start"] == 1 - assert test_callback.funcs_called_count["on_batch_end"] == 1 - assert test_callback.funcs_called_count["on_validation_start"] == 1 - assert test_callback.funcs_called_count["on_validation_epoch_start"] == 1 - # assert test_callback.funcs_called_count["on_validation_batch_start"] == 4 - assert test_callback.funcs_called_count["on_epoch_end"] == 1 - assert test_callback.funcs_called_count["on_validation_batch_end"] == 4 - assert test_callback.funcs_called_count["on_validation_epoch_end"] == 1 - - # Make sure the func_name exists within callback_metrics. 
If not, we missed some
-    callback_metrics_keys = [*trainer.callback_metrics.keys()]
-    for func_name in test_callback.callback_funcs_called.keys():
-        is_in = False
-        for callback_metrics_key in callback_metrics_keys:
-            if func_name in callback_metrics_key:
-                is_in = True
-        assert is_in, (func_name, callback_metrics_keys)
-
-    # function used to describe expected return logic
-    def get_expected_output(func_attr, original_values):
+    assert cb.call_counter == {
+        'on_validation_batch_end': 4,
+        'on_validation_start': 1,
+        'on_epoch_start': 1,
+        'on_validation_epoch_start': 1,
+        'on_validation_epoch_end': 1,
+        'on_epoch_end': 1
+    }
 
-        if func_attr["on_epoch"] and not func_attr["on_step"]:
-            # Apply mean on values
-            expected_output = np.mean(original_values)
-        else:
-            # Keep the latest value
-            expected_output = np.max(original_values)
-        return expected_output
+    def get_expected(on_epoch, values):
+        # `on_epoch=True` values are mean-reduced over the epoch; otherwise the last
+        # logged value wins, which equals the max here since `count` only increases
+        reduction = np.mean if on_epoch else np.max
+        return reduction(values)
 
-    # Make sure the func_name output equals the average from all logged values when on_epoch true
-    # pop extra keys
-    trainer.callback_metrics.pop("debug_epoch")
-    trainer.callback_metrics.pop("val_loss")
-    for func_name, output_value in trainer.callback_metrics.items():
-        # not sure how to handle this now
-        if "epoch_0" in func_name:
-            func_name = '/'.join(func_name.split('/')[:-1])
+    for fx, value in trainer.callback_metrics.items():
+        actual = value.item()
+        if fx not in cb.logged_arguments:
             continue
+        on_epoch = cb.logged_arguments[fx]['on_epoch']
+        values = cb.logged_values[fx]
+        expected = get_expected(on_epoch, values)
+        assert actual == expected
 
-        if torch.is_tensor(output_value):
-            output_value = output_value.item()
-        # get creation attr
-        func_attr = test_callback.funcs_attr[func_name]
-
-        # retrived orginal logged values
-        original_values = test_callback.callback_funcs_called[func_attr["func_name"]]
-
-        # compute expected output and compare to actual one
-        expected_output = get_expected_output(func_attr, original_values)
-        assert float(output_value) == float(expected_output)
-
-    for func_name, func_attr in test_callback.funcs_attr.items():
-        if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]:
-            assert func_name in trainer.logger_connector.progress_bar_metrics
-        else:
-            assert func_name not in trainer.logger_connector.progress_bar_metrics
+    for fx, attrs in cb.logged_arguments.items():
+        should_include = attrs["prog_bar"] and attrs["on_step"] ^ attrs["on_epoch"]
+        is_included = fx in trainer.logger_connector.progress_bar_metrics
+        assert is_included == should_include
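+# NOTE for both callback-logging tests: a metric shows up in the progress bar under its
+# raw name only when logged with exactly one of `on_step`/`on_epoch`; with both set, it
+# is forked into `<name>_step` and `<name>_epoch` instead, hence the XOR checks.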
 
 
-@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
 def test_log_works_in_test_callback(tmpdir):
     """
     Tests that log can be called within callback
@@ -624,7 +407,7 @@ class TestCallback(callbacks.Callback):
 
         # helpers
-        count = 1
+        count = 0
         choices = [False, True]
 
         # used to compute expected values
@@ -632,19 +415,15 @@ class TestCallback(callbacks.Callback):
         funcs_called_count = collections.defaultdict(int)
         funcs_attr = {}
 
-        def make_logging(self, pl_module, func_name, func_idx, on_steps=[], on_epochs=[], prob_bars=[]):
+        def make_logging(self, pl_module, func_name, on_steps, on_epochs, prob_bars):
            original_func_name = func_name[:]
            self.funcs_called_count[original_func_name] += 1
-            product = [on_steps, on_epochs, prob_bars]
-            for idx, t in enumerate(list(itertools.product(*product))):
-                # run logging
+
+            for idx, (on_step, on_epoch, prog_bar) in enumerate(itertools.product(on_steps, on_epochs, prob_bars)):
                func_name = original_func_name[:]
-                on_step, on_epoch, prog_bar = t
-                custom_func_name = f"{func_idx}_{idx}_{func_name}"
+                custom_func_name = f"{idx}_{func_name}"
 
-                pl_module.log(
-                    custom_func_name, self.count * func_idx, on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar
-                )
+                pl_module.log(custom_func_name, self.count, on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar)
 
                num_dl_ext = ''
                if pl_module._current_dataloader_idx is not None:
@@ -653,12 +432,11 @@ def make_logging(self, pl_module, func_name, func_idx, on_steps=[], on_epochs=[]
                    func_name += num_dl_ext
 
                # catch information for verification
-                self.callback_funcs_called[func_name].append([self.count * func_idx])
+                self.callback_funcs_called[func_name].append([self.count])
                self.funcs_attr[custom_func_name + num_dl_ext] = {
                    "on_step": on_step,
                    "on_epoch": on_epoch,
                    "prog_bar": prog_bar,
-                    "forked": on_step and on_epoch,
                    "func_name": func_name
                }
+                # logging with both on_step and on_epoch forks the metric into
+                # dedicated `_step`/`_epoch` entries
                if on_step and on_epoch:
@@ -666,7 +444,6 @@ def make_logging(self, pl_module, func_name, func_idx, on_steps=[], on_epochs=[]
                        "on_step": True,
                        "on_epoch": False,
                        "prog_bar": prog_bar,
-                        "forked": False,
                        "func_name": func_name
                    }
 
@@ -674,140 +451,89 @@ def make_logging(self, pl_module, func_name, func_idx, on_steps=[], on_epochs=[]
                        "on_step": False,
                        "on_epoch": True,
                        "prog_bar": prog_bar,
-                        "forked": False,
                        "func_name": func_name
                    }
 
-        def on_test_start(self, trainer, pl_module):
-            self.make_logging(
-                pl_module, 'on_test_start', 1, on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices
-            )
+        def on_test_start(self, _, pl_module):
+            self.make_logging(pl_module, 'on_test_start', on_steps=[False], on_epochs=[True], prob_bars=self.choices)
 
-        def on_test_epoch_start(self, trainer, pl_module):
+        def on_test_epoch_start(self, _, pl_module):
            self.make_logging(
-                pl_module,
-                'on_test_epoch_start',
-                3,
-                on_steps=self.choices,
-                on_epochs=self.choices,
-                prob_bars=self.choices
+                pl_module, 'on_test_epoch_start', on_steps=[False], on_epochs=[True], prob_bars=self.choices
            )
 
-        def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
+        def on_test_batch_end(self, _, pl_module, *__):
            self.make_logging(
-                pl_module,
-                'on_test_batch_end',
-                5,
-                on_steps=self.choices,
-                on_epochs=self.choices,
-                prob_bars=self.choices
+                pl_module, 'on_test_batch_end', on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices
            )
-            # used to make sure aggregation works fine.
- # we should obtain func[value * c for c in range(1, max_epochs * limit_test_batches)]) - # with func = np.mean if on_epoch else func = np.max - self.count += 1 - - def on_test_epoch_end(self, trainer, pl_module): + def on_test_epoch_end(self, _, pl_module): self.make_logging( - pl_module, 'on_test_epoch_end', 7, on_steps=[False], on_epochs=self.choices, prob_bars=self.choices + pl_module, 'on_test_epoch_end', on_steps=[False], on_epochs=[True], prob_bars=self.choices ) - max_epochs = 2 num_dataloaders = 2 class TestModel(BoringModel): - - manual_mean = collections.defaultdict(list) + seen_losses = {i: [] for i in range(num_dataloaders)} def test_step(self, batch, batch_idx, dataloader_idx=None): - output = self.layer(batch) - loss = self.loss(batch, output) + loss = super().test_step(batch, batch_idx)['y'] self.log('test_loss', loss) - self.manual_mean[str(dataloader_idx)].append(loss) + self.seen_losses[dataloader_idx].append(loss) def test_dataloader(self): return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)] model = TestModel() model.test_epoch_end = None - test_callback = TestCallback() - + cb = TestCallback() trainer = Trainer( default_root_dir=tmpdir, - limit_train_batches=2, - limit_val_batches=0, limit_test_batches=2, - val_check_interval=0., num_sanity_val_steps=0, - max_epochs=max_epochs, - callbacks=[test_callback], + max_epochs=2, + callbacks=[cb], ) trainer.test(model) - assert test_callback.funcs_called_count["on_test_start"] == 1 - assert test_callback.funcs_called_count["on_test_epoch_start"] == 1 - assert test_callback.funcs_called_count["on_test_batch_end"] == 4 - assert test_callback.funcs_called_count["on_test_epoch_end"] == 1 - - # Make sure the func_name exists within callback_metrics. If not, we missed some - callback_metrics_keys = [*trainer.callback_metrics.keys()] + assert cb.funcs_called_count["on_test_start"] == 1 + assert cb.funcs_called_count["on_test_epoch_start"] == 1 + assert cb.funcs_called_count["on_test_batch_end"] == 4 + assert cb.funcs_called_count["on_test_epoch_end"] == 1 - for func_name in test_callback.callback_funcs_called.keys(): + callback_metrics_keys = list(trainer.callback_metrics) + for func_name in cb.callback_funcs_called.keys(): is_in = False for callback_metrics_key in callback_metrics_keys: if func_name in callback_metrics_key: is_in = True assert is_in, (func_name, callback_metrics_keys) - # function used to describe expected return logic - def get_expected_output(func_attr, original_values): - # Apply mean on values - if func_attr["on_epoch"] and not func_attr["on_step"]: - expected_output = np.mean(original_values) - else: - expected_output = np.max(original_values) - return expected_output + def get_expected(on_epoch, values): + reduction = np.mean if on_epoch else np.max + return reduction(values) # Make sure the func_name output equals the average from all logged values when on_epoch true - # pop extra keys - assert "debug_epoch" in trainer.callback_metrics - trainer.callback_metrics.pop("debug_epoch") - for dl_idx in range(num_dataloaders): key = f"test_loss/dataloader_idx_{dl_idx}" assert key in trainer.callback_metrics - assert torch.stack(model.manual_mean[str(dl_idx)]).mean() == trainer.callback_metrics[key] - trainer.callback_metrics.pop(key) + assert torch.stack(model.seen_losses[dl_idx]).mean() == trainer.callback_metrics.pop(key) for func_name, output_value in trainer.callback_metrics.items(): - # not sure how to handle this now - if "epoch_1" in func_name: - func_name = 
'/'.join(func_name.split('/')[:-1])
-            continue
-
-        if torch.is_tensor(output_value):
-            output_value = output_value.item()
+        output_value = output_value.item()
+        func_attr = cb.funcs_attr[func_name]
+        original_values = cb.callback_funcs_called[func_attr["func_name"]]
+        expected_output = get_expected(func_attr['on_epoch'], original_values)
+        assert output_value == expected_output
 
-        # get func attr
-        func_attr = test_callback.funcs_attr[func_name]
-
-        # retrived orginal logged values
-        original_values = test_callback.callback_funcs_called[func_attr["func_name"]]
-
-        # compute expected output and compare to actual one
-        expected_output = get_expected_output(func_attr, original_values)
-        assert float(output_value) == float(expected_output)
-
-    for func_name, func_attr in test_callback.funcs_attr.items():
-        if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]:
-            assert func_name in trainer.logger_connector.progress_bar_metrics
-        else:
-            assert func_name not in trainer.logger_connector.progress_bar_metrics
+    for fx, attrs in cb.funcs_attr.items():
+        should_include = attrs["prog_bar"] and attrs["on_step"] ^ attrs["on_epoch"]
+        is_included = fx in trainer.logger_connector.progress_bar_metrics
+        assert is_included == should_include
 
 
 @mock.patch("pytorch_lightning.loggers.TensorBoardLogger.log_metrics")
-@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
 def test_validation_step_log_with_tensorboard(mock_log_metrics, tmpdir):
     """
     This test makes sure we properly log_metrics to the loggers
@@ -860,54 +586,42 @@ def test_step(self, batch, batch_idx):
     expected_num_calls = 1 + 2 + 1 + 2 + 1
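+    # seven calls in total: presumably one initial `hp_metric` call from TensorBoard,
+    # then, for each of the two epochs, two step-level `log_metrics` calls followed by
+    # one epoch-level call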
 
     assert len(mock_log_metrics.mock_calls) == expected_num_calls
-    assert mock_log_metrics.mock_calls[0] == call({'hp_metric': -1}, 0)
 
     def get_metrics_at_idx(idx):
         mock_calls = list(mock_log_metrics.mock_calls)
+        # `mock_calls[idx].kwargs` only yields a dict on newer Python versions, hence
+        # the fallback to positional access below
         if isinstance(mock_calls[idx].kwargs, dict):
             return mock_calls[idx].kwargs["metrics"]
-        else:
-            return mock_calls[idx][2]["metrics"]
+        return mock_calls[idx][2]["metrics"]
 
-    expected = ['valid_loss_0_step', 'valid_loss_2', 'global_step']
-    assert sorted(get_metrics_at_idx(1)) == sorted(expected)
-    assert sorted(get_metrics_at_idx(2)) == sorted(expected)
+    expected = {'valid_loss_0_step', 'valid_loss_2'}
+    assert set(get_metrics_at_idx(1)) == expected
+    assert set(get_metrics_at_idx(2)) == expected
 
-    expected = model.val_losses[2]
-    assert get_metrics_at_idx(1)["valid_loss_0_step"] == expected
-    expected = model.val_losses[3]
-    assert get_metrics_at_idx(2)["valid_loss_0_step"] == expected
+    assert get_metrics_at_idx(1)["valid_loss_0_step"] == model.val_losses[2]
+    assert get_metrics_at_idx(2)["valid_loss_0_step"] == model.val_losses[3]
 
-    expected = ['valid_loss_0_epoch', 'valid_loss_1', 'epoch', 'global_step']
-    assert sorted(get_metrics_at_idx(3)) == sorted(expected)
+    assert set(get_metrics_at_idx(3)) == {'valid_loss_0_epoch', 'valid_loss_1', 'epoch'}
 
-    expected = torch.stack(model.val_losses[2:4]).mean()
-    assert get_metrics_at_idx(3)["valid_loss_1"] == expected
-    expected = ['valid_loss_0_step', 'valid_loss_2', 'global_step']
+    assert get_metrics_at_idx(3)["valid_loss_1"] == torch.stack(model.val_losses[2:4]).mean()
 
-    assert sorted(get_metrics_at_idx(4)) == sorted(expected)
-    assert sorted(get_metrics_at_idx(5)) == sorted(expected)
+    expected = {'valid_loss_0_step', 'valid_loss_2'}
+    assert set(get_metrics_at_idx(4)) == expected
+    assert set(get_metrics_at_idx(5)) == expected
 
-    expected = model.val_losses[4]
-    assert get_metrics_at_idx(4)["valid_loss_0_step"] == expected
-    expected = model.val_losses[5]
-    assert get_metrics_at_idx(5)["valid_loss_0_step"] == expected
+    assert get_metrics_at_idx(4)["valid_loss_0_step"] == model.val_losses[4]
+    assert get_metrics_at_idx(5)["valid_loss_0_step"] == model.val_losses[5]
 
-    expected = ['valid_loss_0_epoch', 'valid_loss_1', 'epoch', 'global_step']
-    assert sorted(get_metrics_at_idx(6)) == sorted(expected)
+    assert set(get_metrics_at_idx(6)) == {'valid_loss_0_epoch', 'valid_loss_1', 'epoch'}
 
-    expected = torch.stack(model.val_losses[4:]).mean()
-    assert get_metrics_at_idx(6)["valid_loss_1"] == expected
+    assert get_metrics_at_idx(6)["valid_loss_1"] == torch.stack(model.val_losses[4:]).mean()
 
     results = trainer.test(model)
-    expected_callback_metrics = {
+    assert set(trainer.callback_metrics) == {
         'train_loss',
         'valid_loss_0_epoch',
         'valid_loss_0',
-        'debug_epoch',
         'valid_loss_1',
         'test_loss',
     }
-    assert set(trainer.callback_metrics) == expected_callback_metrics
-    assert set(results[0]) == {'test_loss', 'debug_epoch'}
+    assert set(results[0]) == {'test_loss'}
diff --git a/tests/trainer/logging_/test_logger_connector.py b/tests/trainer/logging_/test_logger_connector.py
index e0e1c3cdf42ec..592fde1569344 100644
--- a/tests/trainer/logging_/test_logger_connector.py
+++ b/tests/trainer/logging_/test_logger_connector.py
@@ -11,12 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Tests to ensure that the training loop works with a dict (1.0)
-"""
-import os
-from copy import deepcopy
-from typing import Any, Callable
 from unittest import mock
 
 import pytest
@@ -26,251 +20,14 @@
 
 from pytorch_lightning import LightningModule
 from pytorch_lightning.callbacks.base import Callback
-from pytorch_lightning.core.step_result import Result
 from pytorch_lightning.trainer import Trainer
 from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import FxValidator
-from pytorch_lightning.trainer.connectors.logger_connector.metrics_holder import MetricsHolder
+from pytorch_lightning.trainer.connectors.logger_connector.result import MetricSource, ResultCollection
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel, RandomDataset
 from tests.helpers.runif import RunIf
 
 
-def decorator_with_arguments(fx_name: str = '', hook_fx_name: str = None) -> Callable:
-
-    def decorator(func: Callable) -> Callable:
-
-        def wrapper(self, *args, **kwargs) -> Any:
-            # Set information
-            self._current_fx_name = fx_name
-            self._current_hook_fx_name = hook_fx_name
-            self._results = Result()
-
-            result = func(self, *args, **kwargs)
-
-            # cache metrics
-            self.trainer.logger_connector.cache_logged_metrics()
-            return result
-
-        return wrapper
-
-    return decorator
-
-
-def test__logger_connector__epoch_result_store__train(tmpdir):
-    """
-    Tests that LoggerConnector will properly capture logged information
-    and reduce them
-    """
-
-    class TestModel(BoringModel):
-
-        train_losses = []
-
-        @decorator_with_arguments(fx_name="training_step")
-        def training_step(self, batch, batch_idx):
-            output = self.layer(batch)
-            loss = self.loss(batch, output)
-
-            self.train_losses.append(loss)
-
-            self.log("train_loss", loss, on_step=True, on_epoch=True)
-
-            return {"loss": loss}
-
-        def training_step_end(self, *_):
-            self.train_results = deepcopy(self.trainer.logger_connector.cached_results)
-
-    model 
= TestModel() - model.training_epoch_end = None - model.val_dataloader = None - - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=2, - limit_val_batches=4, - max_epochs=1, - log_every_n_steps=1, - weights_summary=None, - ) - trainer.fit(model) - - train_results = model.train_results - - assert len(train_results(fx_name="training_step", dl_idx=0, opt_idx=0)) == 2 - generated = train_results(fx_name="training_step", dl_idx=0, opt_idx=0, batch_idx=0, split_idx=0)["train_loss"] - assert generated == model.train_losses[0] - generated = train_results(fx_name="training_step", dl_idx=0, opt_idx=0, batch_idx=1, split_idx=0)["train_loss"] - assert generated == model.train_losses[1] - - assert train_results.has_reduced is not True - - train_results.has_batch_loop_finished = True - - assert train_results.has_reduced is True - - generated = train_results(fx_name="training_step", dl_idx=0, opt_idx=0, reduced=True)['train_loss_epoch'].item() - excepted = torch.stack(model.train_losses).mean().item() - assert generated == excepted - - -def test__logger_connector__epoch_result_store__train__tbptt(tmpdir): - """ - Tests that LoggerConnector will properly capture logged information with ttbt - and reduce them - """ - truncated_bptt_steps = 2 - sequence_size = 30 - batch_size = 30 - - x_seq = torch.rand(batch_size, sequence_size, 1) - y_seq_list = torch.rand(batch_size, sequence_size, 1).tolist() - - class MockSeq2SeqDataset(torch.utils.data.Dataset): - - def __getitem__(self, i): - return x_seq, y_seq_list - - def __len__(self): - return 1 - - class TestModel(BoringModel): - - train_losses = [] - - def __init__(self): - super().__init__() - self.test_hidden = None - self.layer = torch.nn.Linear(2, 2) - - @decorator_with_arguments(fx_name="training_step") - def training_step(self, batch, batch_idx, hiddens): - assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps" - self.test_hidden = torch.rand(1) - - x_tensor, y_list = batch - assert x_tensor.shape[1] == truncated_bptt_steps, "tbptt split Tensor failed" - - y_tensor = torch.tensor(y_list, dtype=x_tensor.dtype) - assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed" - - pred = self(x_tensor.view(batch_size, truncated_bptt_steps)) - loss = torch.nn.functional.mse_loss(pred, y_tensor.view(batch_size, truncated_bptt_steps)) - - self.train_losses.append(loss) - - self.log('a', loss, on_epoch=True) - - return {'loss': loss, 'hiddens': self.test_hidden} - - def on_train_epoch_start(self) -> None: - self.test_hidden = None - - def train_dataloader(self): - return torch.utils.data.DataLoader( - dataset=MockSeq2SeqDataset(), - batch_size=batch_size, - shuffle=False, - sampler=None, - ) - - def training_step_end(self, training_step_output): - self.train_results = deepcopy(self.trainer.logger_connector.cached_results) - # must return - return training_step_output - - model = TestModel() - model.training_epoch_end = None - model.example_input_array = torch.randn(5, truncated_bptt_steps) - - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=10, - limit_val_batches=0, - truncated_bptt_steps=truncated_bptt_steps, - max_epochs=1, - log_every_n_steps=1, - weights_summary=None, - ) - trainer.fit(model) - - train_results = model.train_results - - generated = train_results(fx_name="training_step", dl_idx=0, opt_idx=0, batch_idx=0) - assert len(generated) == len(model.train_losses) - - # assert reduction didn't happen yet - assert train_results.has_reduced is False - - # Launch reduction - 
train_results.has_batch_loop_finished = True - - # assert reduction did happen - assert train_results.has_reduced is True - - generated = train_results(fx_name="training_step", dl_idx=0, opt_idx=0, reduced=True)['a_epoch'].item() - assert generated == torch.stack(model.train_losses).mean().item() - - -@pytest.mark.parametrize('num_dataloaders', [1, 2]) -def test__logger_connector__epoch_result_store__test_multi_dataloaders(tmpdir, num_dataloaders): - """ - Tests that LoggerConnector will properly capture logged information in multi dataloaders scenario - """ - - class TestModel(BoringModel): - test_losses = {dl_idx: [] for dl_idx in range(num_dataloaders)} - - @decorator_with_arguments(fx_name="test_step") - def test_step(self, batch, batch_idx, dl_idx=0): - output = self.layer(batch) - loss = self.loss(batch, output) - self.test_losses[dl_idx].append(loss) - self.log("test_loss", loss, on_step=True, on_epoch=True) - return {"test_loss": loss} - - def on_test_batch_end(self, *args, **kwargs): - # save objects as it will be reset at the end of epoch. - self.batch_results = deepcopy(self.trainer.logger_connector.cached_results) - - def on_test_epoch_end(self): - # save objects as it will be reset at the end of epoch. - self.reduce_results = deepcopy(self.trainer.logger_connector.cached_results) - - def test_dataloader(self): - return [super().test_dataloader()] * num_dataloaders - - model = TestModel() - model.test_epoch_end = None - limit_test_batches = 4 - - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=0, - limit_val_batches=0, - limit_test_batches=limit_test_batches, - max_epochs=1, - log_every_n_steps=1, - weights_summary=None, - ) - trainer.test(model) - - test_results = model.batch_results - - generated = test_results(fx_name="test_step") - assert len(generated) == num_dataloaders - - for dl_idx in range(num_dataloaders): - generated = test_results(fx_name="test_step", dl_idx=dl_idx) - assert len(generated) == limit_test_batches - - test_results = model.reduce_results - - for dl_idx in range(num_dataloaders): - expected = torch.stack(model.test_losses[dl_idx]).mean() - generated = test_results(fx_name="test_step", dl_idx=dl_idx, reduced=True)["test_loss_epoch"] - torch.testing.assert_allclose(generated, expected) - - def test_fx_validator(tmpdir): funcs_name = sorted([f for f in dir(Callback) if not f.startswith('_')]) @@ -360,7 +117,8 @@ def test_fx_validator(tmpdir): # This summarizes where and what is currently possible to log using `self.log` is_stage = "train" in func_name or "test" in func_name or "validation" in func_name is_start = "start" in func_name or "batch" in func_name - on_step = is_stage and is_start + is_epoch = "epoch" in func_name + on_step = is_stage and not is_start and not is_epoch on_epoch = True # creating allowed condition allowed = ( @@ -444,56 +202,6 @@ def test_dataloader(self): trainer.test(model, ckpt_path=None) -@pytest.mark.parametrize('to_float', [False, True]) -def test_metrics_holder(to_float, tmpdir): - - device = "cuda" if torch.cuda.is_available() else "cpu" - preds = torch.tensor([[0.9, 0.1]], device=device) - - def is_float(value: Any) -> bool: - return isinstance(value, float) - - excepted_function = is_float if to_float else torch.is_tensor - targets = torch.tensor([1], device=device) - acc = Accuracy().to(device) - metric_holder = MetricsHolder(to_float=to_float) - metric_holder.update({ - "x": 1, - "y": torch.tensor(2), - "z": acc(preds, targets), - }) - metric_holder.convert(device) - metrics = metric_holder.metrics - 
assert excepted_function(metrics["x"]) - assert excepted_function(metrics["y"]) - assert excepted_function(metrics["z"]) - - -def test_metric_holder_raises(tmpdir): - """Check that an error is raised when trying to convert non-scalar tensors""" - - class TestModel(BoringModel): - - def validation_step(self, batch, *args, **kwargs): - output = self(batch) - self.log('test', output) - - def test_step(self, *args, **kwargs): - return self.validation_step(*args, **kwargs) - - model = TestModel() - model.validation_epoch_end = None - model.test_epoch_end = None - - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) - - match = "The metric `test` does not contain a single element" - with pytest.raises(MisconfigurationException, match=match): - trainer.validate(model) - with pytest.raises(MisconfigurationException, match=match): - trainer.test(model) - - def test_can_return_tensor_with_more_than_one_element(tmpdir): """Ensure {validation,test}_step return values are not included as callback metrics. #6623""" @@ -562,7 +270,7 @@ def validation_step(self, *args, **kwargs): model = TestModel() model.validation_epoch_end = None - trainer = Trainer(default_root_dir=tmpdir, max_steps=5) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=2) trainer.fit(model) logged = trainer.logged_metrics @@ -575,33 +283,6 @@ def validation_step(self, *args, **kwargs): assert 'val_loss_custom_naming_1' in logged -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def test_logged_metrics_steps(tmpdir): - - class TestModel(BoringModel): - - def validation_step(self, batch, batch_idx): - loss_val = torch.randn(1) - self.log('val_loss', loss_val) - return loss_val - - model = TestModel() - model.validation_epoch_end = None - - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=2, - limit_val_batches=2, - max_epochs=2, - log_every_n_steps=1, - weights_summary=None, - ) - trainer.fit(model) - - assert trainer.dev_debugger.logged_metrics[0]['global_step'] == 1 - assert trainer.dev_debugger.logged_metrics[1]['global_step'] == 3 - - def test_metrics_reset(tmpdir): """Tests that metrics are reset correctly after the end of the train/val/test epoch.""" @@ -611,48 +292,63 @@ def __init__(self): super().__init__() self.layer = torch.nn.Linear(32, 1) - for stage in ['train', 'val', 'test']: - acc = Accuracy() - acc.reset = mock.Mock(side_effect=acc.reset) - ap = AveragePrecision(num_classes=1, pos_label=1) - ap.reset = mock.Mock(side_effect=ap.reset) - self.add_module(f"acc_{stage}", acc) - self.add_module(f"ap_{stage}", ap) + def _create_metrics(self): + acc = Accuracy() + acc.reset = mock.Mock(side_effect=acc.reset) + ap = AveragePrecision(num_classes=1, pos_label=1) + ap.reset = mock.Mock(side_effect=ap.reset) + return acc, ap + + def setup(self, stage): + fn = stage + if fn == 'fit': + for stage in ('train', 'validate'): + acc, ap = self._create_metrics() + self.add_module(f"acc_{fn}_{stage}", acc) + self.add_module(f"ap_{fn}_{stage}", ap) + else: + acc, ap = self._create_metrics() + stage = self.trainer.state.stage + self.add_module(f"acc_{fn}_{stage}", acc) + self.add_module(f"ap_{fn}_{stage}", ap) def forward(self, x): return self.layer(x) - def _step(self, stage, batch): - labels = (batch.detach().sum(1) > 0).float() # Fake some targets - logits = self.forward(batch) - loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels.unsqueeze(1)) - probs = torch.sigmoid(logits.detach()) - self.log(f"loss/{stage}", loss) + def _step(self, batch): + fn, stage = 
self.trainer.state.fn, self.trainer.state.stage
+
+        logits = self(batch)
+        loss = logits.sum()
+        self.log(f"loss/{fn}_{stage}", loss)
 
-        acc = self._modules[f"acc_{stage}"]
-        ap = self._modules[f"ap_{stage}"]
+        acc = self._modules[f"acc_{fn}_{stage}"]
+        ap = self._modules[f"ap_{fn}_{stage}"]
 
-        labels_int = labels.to(torch.long)
-        acc(probs.flatten(), labels_int)
-        ap(probs.flatten(), labels_int)
+        preds = torch.rand(len(batch))  # Fake preds
+        labels = torch.randint(0, 2, [len(batch)])  # Fake binary targets (randint's upper bound is exclusive)
+        acc(preds, labels)
+        ap(preds, labels)
 
         # Metric.forward calls reset so reset the mocks here
         acc.reset.reset_mock()
         ap.reset.reset_mock()
 
-        self.log(f"{stage}/accuracy", acc)
-        self.log(f"{stage}/ap", ap)
+        self.log(f"acc/{fn}_{stage}", acc)
+        self.log(f"ap/{fn}_{stage}", ap)
 
         return loss
 
     def training_step(self, batch, batch_idx, *args, **kwargs):
-        return self._step('train', batch)
+        return self._step(batch)
 
     def validation_step(self, batch, batch_idx, *args, **kwargs):
-        return self._step('val', batch)
+        if self.trainer.sanity_checking:
+            return
+        return self._step(batch)
 
     def test_step(self, batch, batch_idx, *args, **kwargs):
-        return self._step('test', batch)
+        return self._step(batch)
 
     def configure_optimizers(self):
         optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
@@ -668,31 +364,11 @@ def val_dataloader(self):
     def test_dataloader(self):
         return DataLoader(RandomDataset(32, 64))
 
-    def _assert_epoch_end(self, stage):
-        acc = self._modules[f"acc_{stage}"]
-        ap = self._modules[f"ap_{stage}"]
-
-        acc.reset.asset_not_called()
-        ap.reset.assert_not_called()
-
-    def on_train_epoch_end(self):
-        self._assert_epoch_end('train')
-
-    def on_validation_epoch_end(self):
-        self._assert_epoch_end('val')
-
-    def on_test_epoch_end(self):
-        self._assert_epoch_end('test')
-
-    def _assert_called(model, stage):
-        acc = model._modules[f"acc_{stage}"]
-        ap = model._modules[f"ap_{stage}"]
-
+    def _assert_called(model, fn, stage):
+        acc = model._modules[f"acc_{fn}_{stage}"]
+        ap = model._modules[f"ap_{fn}_{stage}"]
        acc.reset.assert_called_once()
-        acc.reset.reset_mock()
-
        ap.reset.assert_called_once()
-        ap.reset.reset_mock()
 
     model = TestModel()
     trainer = Trainer(
@@ -702,14 +378,126 @@ def _assert_called(model, stage):
         limit_test_batches=2,
         max_epochs=1,
         progress_bar_refresh_rate=0,
+        num_sanity_val_steps=2,
+        checkpoint_callback=False,
     )
     trainer.fit(model)
-    _assert_called(model, 'train')
-    _assert_called(model, 'val')
+    _assert_called(model, 'fit', 'train')
+    _assert_called(model, 'fit', 'validate')
 
     trainer.validate(model)
-    _assert_called(model, 'val')
+    _assert_called(model, 'validate', 'validate')
 
     trainer.test(model)
-    _assert_called(model, 'test')
+    _assert_called(model, 'test', 'test')
+
+
+def test_result_collection_on_tensor_with_mean_reduction():
+    result_collection = ResultCollection(True, torch.device("cpu"))
+    product = [(True, True), (False, True), (True, False), (False, False)]
+    values = torch.arange(1, 10).float()  # need to convert to float() due to precision issues using torch 1.4
+    batches = values * values
+
+    for i, v in enumerate(values):
+        for prog_bar in [False, True]:
+            for logger in [False, True]:
+                for on_step, on_epoch in product:
+                    name = "loss"
+                    if on_step:
+                        name += "_on_step"
+                    if on_epoch:
+                        name += "_on_epoch"
+                    if prog_bar:
+                        name += "_prog_bar"
+                    if logger:
+                        name += "_logger"
+                    result_collection.log(
+                        "training_step",
+                        name,
+                        v,
+                        on_step=on_step,
+                        on_epoch=on_epoch,
+                        batch_size=batches[i],
+                        prog_bar=prog_bar,
+                        logger=logger,
+                    )
+
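+    # sanity check of the weighted-mean bookkeeping: with values v = 1..9 logged with
+    # batch sizes v**2, the accumulated weighted sum is sum(v**3) = 2025 and the
+    # accumulated batch count is sum(v**2) = 285, so the epoch mean below is 2025 / 285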
+    total_value = sum(values * batches)
+    total_batches = sum(batches)
+    assert result_collection["training_step.loss_on_step_on_epoch"].value == total_value
+    assert result_collection["training_step.loss_on_step_on_epoch"].cumulated_batch_size == total_batches
+
+    batch_metrics = result_collection.metrics(True)
+    max_ = max(values)
+    assert batch_metrics[MetricSource.PBAR] == {
+        'loss_on_step_on_epoch_prog_bar_step': max_,
+        'loss_on_step_on_epoch_prog_bar_logger_step': max_,
+        'loss_on_step_prog_bar': max_,
+        'loss_on_step_prog_bar_logger': max_,
+    }
+    assert batch_metrics[MetricSource.LOG] == {
+        'loss_on_step_on_epoch_logger_step': max_,
+        'loss_on_step_logger': max_,
+        'loss_on_step_on_epoch_prog_bar_logger_step': max_,
+        'loss_on_step_prog_bar_logger': max_,
+    }
+    assert batch_metrics[MetricSource.CALLBACK] == {
+        'loss_on_step': max_,
+        'loss_on_step_logger': max_,
+        'loss_on_step_on_epoch': max_,
+        'loss_on_step_on_epoch_logger': max_,
+        'loss_on_step_on_epoch_logger_step': max_,
+        'loss_on_step_on_epoch_prog_bar': max_,
+        'loss_on_step_on_epoch_prog_bar_logger': max_,
+        'loss_on_step_on_epoch_prog_bar_logger_step': max_,
+        'loss_on_step_on_epoch_prog_bar_step': max_,
+        'loss_on_step_on_epoch_step': max_,
+        'loss_on_step_prog_bar': max_,
+        'loss_on_step_prog_bar_logger': max_,
+    }
+
+    epoch_metrics = result_collection.metrics(False)
+    mean = total_value / total_batches
+    assert epoch_metrics[MetricSource.PBAR] == {
+        'loss_on_epoch_prog_bar': mean,
+        'loss_on_epoch_prog_bar_logger': mean,
+        'loss_on_step_on_epoch_prog_bar_epoch': mean,
+        'loss_on_step_on_epoch_prog_bar_logger_epoch': mean,
+    }
+    assert epoch_metrics[MetricSource.LOG] == {
+        'loss_on_epoch_logger': mean,
+        'loss_on_epoch_prog_bar_logger': mean,
+        'loss_on_step_on_epoch_logger_epoch': mean,
+        'loss_on_step_on_epoch_prog_bar_logger_epoch': mean
+    }
+    assert epoch_metrics[MetricSource.CALLBACK] == {
+        'loss_on_epoch': mean,
+        'loss_on_epoch_logger': mean,
+        'loss_on_epoch_prog_bar': mean,
+        'loss_on_epoch_prog_bar_logger': mean,
+        'loss_on_step_on_epoch': mean,
+        'loss_on_step_on_epoch_epoch': mean,
+        'loss_on_step_on_epoch_logger': mean,
+        'loss_on_step_on_epoch_logger_epoch': mean,
+        'loss_on_step_on_epoch_prog_bar': mean,
+        'loss_on_step_on_epoch_prog_bar_epoch': mean,
+        'loss_on_step_on_epoch_prog_bar_logger': mean,
+        'loss_on_step_on_epoch_prog_bar_logger_epoch': mean
+    }
+
+
+def test_logged_metrics_has_logged_epoch_value(tmpdir):
+
+    class TestModel(BoringModel):
+
+        def training_step(self, batch, batch_idx):
+            self.log('epoch', -batch_idx, logger=True)
+            return super().training_step(batch, batch_idx)
+
+    model = TestModel()
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=2)
+    trainer.fit(model)
+
+    # the automatically logged `epoch` value must not override a manually logged one
+    assert trainer.logged_metrics == {'epoch': -1}
diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py
index 546fb9ff8fdac..b26e3fc83d25c 100644
--- a/tests/trainer/logging_/test_train_loop_logging.py
+++ b/tests/trainer/logging_/test_train_loop_logging.py
@@ -12,74 +12,67 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
""" -Tests to ensure that the training loop works with a dict (1.0) +Test logging in the training loop """ import collections import itertools -import os -from unittest import mock +from re import escape import numpy as np import pytest import torch -from torch.utils.data import Dataset +from torchmetrics import Accuracy import pytorch_lightning as pl from pytorch_lightning import callbacks, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint -from pytorch_lightning.core.lightning import LightningModule -from tests.helpers.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset -from tests.helpers.deterministic_model import DeterministicModel +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.helpers.boring_model import BoringModel, RandomDictDataset from tests.helpers.runif import RunIf -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test__training_step__log(tmpdir): """ Tests that only training_step can be used """ - class TestModel(DeterministicModel): + class TestModel(BoringModel): def training_step(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - acc = acc + batch_idx + out = super().training_step(batch, batch_idx) + loss = out['loss'] # ----------- # default # ----------- - self.log('default', acc) + self.log('default', loss) # ----------- # logger # ----------- # on_step T on_epoch F - self.log('l_s', acc, on_step=True, on_epoch=False, prog_bar=False, logger=True) + self.log('l_s', loss, on_step=True, on_epoch=False, prog_bar=False, logger=True) # on_step F on_epoch T - self.log('l_e', acc, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('l_e', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) # on_step T on_epoch T - self.log('l_se', acc, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('l_se', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) # ----------- # pbar # ----------- # on_step T on_epoch F - self.log('p_s', acc, on_step=True, on_epoch=False, prog_bar=True, logger=False) + self.log('p_s', loss, on_step=True, on_epoch=False, prog_bar=True, logger=False) # on_step F on_epoch T - self.log('p_e', acc, on_step=False, on_epoch=True, prog_bar=True, logger=False) + self.log('p_e', loss, on_step=False, on_epoch=True, prog_bar=True, logger=False) # on_step T on_epoch T - self.log('p_se', acc, on_step=True, on_epoch=True, prog_bar=True, logger=False) - - self.training_step_called = True - return acc + self.log('p_se', loss, on_step=True, on_epoch=True, prog_bar=True, logger=False) - def backward(self, loss, optimizer, optimizer_idx): - return LightningModule.backward(self, loss, optimizer, optimizer_idx) + return loss model = TestModel() model.val_dataloader = None @@ -95,14 +88,8 @@ def backward(self, loss, optimizer, optimizer_idx): ) trainer.fit(model) - # make sure correct steps were called - assert model.training_step_called - assert not model.training_step_end_called - assert not model.training_epoch_end_called - - # make sure all the metrics are available for callbacks - logged_metrics = set(trainer.logged_metrics.keys()) - expected_logged_metrics = { + logged_metrics = set(trainer.logged_metrics) + assert logged_metrics == { 'epoch', 'default', 'l_e', @@ -110,51 +97,36 @@ def backward(self, loss, optimizer, optimizer_idx): 'l_se_step', 'l_se_epoch', } - assert logged_metrics == expected_logged_metrics - pbar_metrics = set(trainer.progress_bar_metrics.keys()) - expected_pbar_metrics = { + 
pbar_metrics = set(trainer.progress_bar_metrics) + assert pbar_metrics == { 'p_e', 'p_s', 'p_se_step', 'p_se_epoch', } - assert pbar_metrics == expected_pbar_metrics - callback_metrics = set(trainer.callback_metrics.keys()) - callback_metrics.remove('debug_epoch') - expected_callback_metrics = set() - expected_callback_metrics = expected_callback_metrics.union(logged_metrics) - expected_callback_metrics = expected_callback_metrics.union(pbar_metrics) - expected_callback_metrics.update({'p_se', 'l_se'}) - expected_callback_metrics.remove('epoch') - assert callback_metrics == expected_callback_metrics + assert set(trainer.callback_metrics) == (logged_metrics | pbar_metrics | {'p_se', 'l_se'}) - {'epoch'} -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test__training_step__epoch_end__log(tmpdir): """ - Tests that only training_step can be used + Tests that training_epoch_end can log """ - class TestModel(DeterministicModel): + class TestModel(BoringModel): def training_step(self, batch, batch_idx): - self.training_step_called = True - acc = self.step(batch, batch_idx) - acc = acc + batch_idx - self.log('a', acc, on_step=True, on_epoch=True) - self.log_dict({'a1': acc, 'a2': acc}) - return acc + out = super().training_step(batch, batch_idx) + loss = out['loss'] + self.log('a', loss, on_step=True, on_epoch=True) + self.log_dict({'a1': loss, 'a2': loss}) + return out def training_epoch_end(self, outputs): - self.training_epoch_end_called = True self.log('b1', outputs[0]['loss']) self.log('b', outputs[0]['loss'], on_epoch=True, prog_bar=True, logger=True) - def backward(self, loss, optimizer, optimizer_idx): - return LightningModule.backward(self, loss, optimizer, optimizer_idx) - model = TestModel() model.val_dataloader = None @@ -168,52 +140,33 @@ def backward(self, loss, optimizer, optimizer_idx): ) trainer.fit(model) - # make sure correct steps were called - assert model.training_step_called - assert not model.training_step_end_called - assert model.training_epoch_end_called - - # make sure all the metrics are available for callbacks - logged_metrics = set(trainer.logged_metrics.keys()) - expected_logged_metrics = {'epoch', 'a_step', 'a_epoch', 'b', 'b1', 'a1', 'a2'} - assert logged_metrics == expected_logged_metrics + logged_metrics = set(trainer.logged_metrics) + assert logged_metrics == {'epoch', 'a_step', 'a_epoch', 'b', 'b1', 'a1', 'a2'} - pbar_metrics = set(trainer.progress_bar_metrics.keys()) - expected_pbar_metrics = {'b'} - assert pbar_metrics == expected_pbar_metrics + pbar_metrics = set(trainer.progress_bar_metrics) + assert pbar_metrics == {'b'} - callback_metrics = set(trainer.callback_metrics.keys()) - callback_metrics.remove('debug_epoch') - expected_callback_metrics = set() - expected_callback_metrics = expected_callback_metrics.union(logged_metrics) - expected_callback_metrics = expected_callback_metrics.union(pbar_metrics) - expected_callback_metrics.remove('epoch') - expected_callback_metrics.add('a') - assert callback_metrics == expected_callback_metrics + assert set(trainer.callback_metrics) == (logged_metrics | pbar_metrics | {'a'}) - {'epoch'} -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @pytest.mark.parametrize(['batches', 'log_interval', 'max_epochs'], [(1, 1, 1), (64, 32, 2)]) def test__training_step__step_end__epoch_end__log(tmpdir, batches, log_interval, max_epochs): """ - Tests that only training_step can be used + Tests that training_step_end and training_epoch_end can log """ class TestModel(BoringModel): def training_step(self, batch, 
batch_idx): - self.training_step_called = True loss = self.step(batch[0]) self.log('a', loss, on_step=True, on_epoch=True) return loss def training_step_end(self, out): - self.training_step_end_called = True self.log('b', out, on_step=True, on_epoch=True, prog_bar=True, logger=True) return out def training_epoch_end(self, outputs): - self.training_epoch_end_called = True self.log('c', outputs[0]['loss'], on_epoch=True, prog_bar=True, logger=True) self.log('d/e/f', 2) @@ -230,34 +183,23 @@ def training_epoch_end(self, outputs): ) trainer.fit(model) - # make sure correct steps were called - assert model.training_step_called - assert model.training_step_end_called - assert model.training_epoch_end_called - # make sure all the metrics are available for callbacks - logged_metrics = set(trainer.logged_metrics.keys()) - expected_logged_metrics = {'a_step', 'a_epoch', 'b_step', 'b_epoch', 'c', 'd/e/f', 'epoch'} - assert logged_metrics == expected_logged_metrics - - pbar_metrics = set(trainer.progress_bar_metrics.keys()) - expected_pbar_metrics = {'c', 'b_epoch', 'b_step'} - assert pbar_metrics == expected_pbar_metrics + logged_metrics = set(trainer.logged_metrics) + assert logged_metrics == {'a_step', 'a_epoch', 'b_step', 'b_epoch', 'c', 'd/e/f', 'epoch'} - callback_metrics = set(trainer.callback_metrics.keys()) - callback_metrics.remove('debug_epoch') - expected_callback_metrics = set() - expected_callback_metrics = expected_callback_metrics.union(logged_metrics) - expected_callback_metrics = expected_callback_metrics.union(pbar_metrics) - expected_callback_metrics.update({'a', 'b'}) - expected_callback_metrics.remove('epoch') - assert callback_metrics == expected_callback_metrics + pbar_metrics = set(trainer.progress_bar_metrics) + assert pbar_metrics == {'c', 'b_epoch', 'b_step'} - # assert the loggers received the expected number - assert len(trainer.dev_debugger.logged_metrics) == ((batches / log_interval) * max_epochs) + max_epochs + assert set(trainer.callback_metrics) == (logged_metrics | pbar_metrics | {'a', 'b'}) - {'epoch'} -@pytest.mark.parametrize(['batches', 'fx', 'result'], [(1, min, 0), (2, max, 1), (11, max, 10)]) +@pytest.mark.parametrize(['batches', 'fx', 'result'], [ + (3, min, 0), + (3, torch.max, 2), + (11, max, 10), + (5, 'avg', 2), + (5, 'SUM', 10), +]) def test__training_step__log_max_reduce_fx(tmpdir, batches, fx, result): """ Tests that log works correctly with different tensor types @@ -267,7 +209,7 @@ class TestModel(BoringModel): def training_step(self, batch, batch_idx): acc = self.step(batch[0]) - self.log('foo', torch.tensor(batch_idx).long(), on_step=False, on_epoch=True, reduce_fx=fx) + self.log('foo', torch.tensor(batch_idx, dtype=torch.long), on_step=False, on_epoch=True, reduce_fx=fx) return acc def validation_step(self, batch, batch_idx): @@ -319,7 +261,9 @@ def __init__(self): def training_step(self, batch, batch_idx, hiddens): assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps" - self.test_hidden = torch.rand(1) + if hiddens is not None: + assert hiddens.grad_fn is None + self.test_hidden = torch.tensor(2., requires_grad=True).pow(2) x_tensor, y_list = batch assert x_tensor.shape[1] == truncated_bptt_steps, "tbptt split Tensor failed" @@ -347,7 +291,6 @@ def train_dataloader(self): model = TestModel() model.training_epoch_end = None - model.example_input_array = torch.randn(5, truncated_bptt_steps) trainer = Trainer( default_root_dir=tmpdir, @@ -360,9 +303,7 @@ def train_dataloader(self): ) trainer.fit(model) - generated = 
set(trainer.logged_metrics.keys()) - expected = {'a_step', 'a_epoch', 'epoch'} - assert generated == expected + assert set(trainer.logged_metrics) == {'a_step', 'a_epoch', 'epoch'} def test_different_batch_types_for_sizing(tmpdir): @@ -397,105 +338,13 @@ def val_dataloader(self): limit_val_batches=2, max_epochs=1, weights_summary=None, + fast_dev_run=True, ) trainer.fit(model) - generated = set(trainer.logger_connector.logged_metrics) - expected = {'a_step', 'a_epoch', 'n_step', 'n_epoch', 'epoch'} - - assert generated == expected - - -def test_validation_step_with_string_data_logging(tmpdir): - - class TestModel(BoringModel): - - def on_train_epoch_start(self) -> None: - print("override any method to prove your bug") - - def training_step(self, batch, batch_idx): - output = self.layer(batch["x"]) - loss = self.loss(batch, output) - return {"loss": loss} - - def validation_step(self, batch, batch_idx): - output = self.layer(batch["x"]) - loss = self.loss(batch, output) - self.log("x", loss) - return {"x": loss} - - # fake data - train_data = torch.utils.data.DataLoader(RandomDictStringDataset(32, 64)) - val_data = torch.utils.data.DataLoader(RandomDictStringDataset(32, 64)) - - # model - model = TestModel() - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=1, - limit_val_batches=1, - max_epochs=1, - weights_summary=None, - ) - trainer.fit(model, train_data, val_data) - - -def test_nested_datasouce_batch(tmpdir): - - class NestedDictStringDataset(Dataset): - - def __init__(self, size, length): - self.len = length - self.data = torch.randn(length, size) - - def __getitem__(self, index): - x = { - 'post_text': ['bird is fast', 'big cat'], - 'dense_0': [ - torch.tensor([-0.1000, 0.2000], dtype=torch.float64), - torch.tensor([1, 1], dtype=torch.uint8), - ], - 'post_id': ['115', '116'], - 'label': [torch.tensor([0, 1]), torch.tensor([1, 1], dtype=torch.uint8)] - } - return x - - def __len__(self): - return self.len - - class TestModel(BoringModel): - - def on_train_epoch_start(self) -> None: - print("override any method to prove your bug") - - def training_step(self, batch, batch_idx): - output = self.layer(torch.rand(32)) - loss = self.loss(batch, output) - return {"loss": loss} - - def validation_step(self, batch, batch_idx): - output = self.layer(torch.rand(32)) - loss = self.loss(batch, output) - self.log("x", loss) - return {"x": loss} - - # fake data - train_data = torch.utils.data.DataLoader(NestedDictStringDataset(32, 64)) - val_data = torch.utils.data.DataLoader(NestedDictStringDataset(32, 64)) - - # model - model = TestModel() - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=1, - limit_val_batches=1, - max_epochs=1, - weights_summary=None, - ) - trainer.fit(model, train_data, val_data) + assert set(trainer.logged_metrics) == {'a_step', 'a_epoch', 'n_step', 'n_epoch', 'epoch'} -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_log_works_in_train_callback(tmpdir): """ Tests that log can be called within callback @@ -503,214 +352,126 @@ def test_log_works_in_train_callback(tmpdir): class TestCallback(callbacks.Callback): - # helpers - count = 1 + count = 0 choices = [False, True] + # used to compute expected values - callback_funcs_called = collections.defaultdict(list) - funcs_called_count = collections.defaultdict(int) - funcs_attr = {} - - def make_logging( - self, pl_module: pl.LightningModule, func_name, func_idx, on_steps=[], on_epochs=[], prob_bars=[] - ): - self.funcs_called_count[func_name] += 1 - iterate = 
list(itertools.product(*[on_steps, on_epochs, prob_bars]))
- for idx, (on_step, on_epoch, prog_bar) in enumerate(iterate):
- # run logging
- custom_func_name = f"{func_idx}_{idx}_{func_name}"
- pl_module.log(
- custom_func_name, self.count * func_idx, on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar
- )
-
- # catch information for verification
-
- # on on_train_start is outside the main loop. Won't be called
- if func_name == "on_train_start":
- self.callback_funcs_called[func_name].append([self.count * func_idx])
-
- # Saved only values from second epoch, so we can compute its mean or latest.
- if pl_module.trainer.current_epoch == 1:
- self.callback_funcs_called[func_name].append([self.count * func_idx])
-
- forked = on_step and on_epoch
-
- self.funcs_attr[custom_func_name] = {
- "on_step": on_step,
- "on_epoch": on_epoch,
- "prog_bar": prog_bar,
- "forked": forked,
- "func_name": func_name
- }
-
- if on_step and on_epoch:
- self.funcs_attr[f"{custom_func_name}_step"] = {
- "on_step": True,
- "on_epoch": False,
- "prog_bar": prog_bar,
- "forked": False,
- "func_name": func_name
- }
-
- self.funcs_attr[f"{custom_func_name}_epoch"] = {
- "on_step": False,
- "on_epoch": True,
- "prog_bar": prog_bar,
- "forked": False,
- "func_name": func_name
- }
+ logged_values = collections.defaultdict(list)
+ call_counter = collections.Counter()
+ logged_arguments = {}

- def on_train_start(self, trainer, pl_module):
- self.make_logging(
- pl_module, 'on_train_start', 1, on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices
- )
+ def make_logging(self, pl_module, func_name, on_steps, on_epochs, prob_bars):
+ self.call_counter.update([func_name])

- def on_epoch_start(self, trainer, pl_module):
- self.make_logging(
- pl_module, 'on_epoch_start', 2, on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices
- )
+ for idx, (on_step, on_epoch, prog_bar) in enumerate(itertools.product(on_steps, on_epochs, prob_bars)):
+ fx = f"{func_name}_{idx}"
+ pl_module.log(fx, self.count, on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar)
+ self.logged_values[fx].append(self.count)
+ self.logged_arguments[fx] = {"on_step": on_step, "on_epoch": on_epoch, "prog_bar": prog_bar}
+ self.count += 1

- def on_train_epoch_start(self, trainer, pl_module):
+ def on_train_start(self, _, pl_module):
+ self.make_logging(pl_module, 'on_train_start', on_steps=[False], on_epochs=[True], prob_bars=self.choices)
+
+ def on_epoch_start(self, _, pl_module):
 self.make_logging(
- pl_module,
- 'on_train_epoch_start',
- 3,
- on_steps=self.choices,
- on_epochs=self.choices,
- prob_bars=self.choices
+ pl_module, 'on_epoch_start', on_steps=self.choices, on_epochs=[True], prob_bars=self.choices
 )

- def on_batch_end(self, trainer, pl_module):
+ def on_train_epoch_start(self, _, pl_module):
 self.make_logging(
- pl_module, 'on_batch_end', 6, on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices
+ pl_module, 'on_train_epoch_start', on_steps=self.choices, on_epochs=[True], prob_bars=self.choices
 )

- def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
+ def on_batch_end(self, _, pl_module):
 self.make_logging(
- pl_module,
- 'on_train_batch_end',
- 7,
- on_steps=self.choices,
- on_epochs=self.choices,
- prob_bars=self.choices
+ pl_module, 'on_batch_end', on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices
 )
- # used to make sure aggregation works fine.
- # we should obtain func[value * c for c in range(1, max_epochs * limit_train_batches)])
- # with func = np.mean if on_epoch else func = np.max
- self.count += 1
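The rewritten `make_logging` above issues one `self.log` call per `(on_step, on_epoch, prog_bar)` combination and keys each metric by the enumeration index, so the final checks can recover which arguments produced it. A standalone illustration of that enumeration, using the same arguments the epoch-level hooks pass (nothing here is Lightning-specific):

    import itertools

    on_steps, on_epochs, prob_bars = [False, True], [True], [False, True]
    for idx, (on_step, on_epoch, prog_bar) in enumerate(itertools.product(on_steps, on_epochs, prob_bars)):
        # yields four keys per hook, e.g. 'on_epoch_start_0' .. 'on_epoch_start_3'
        print(f"on_epoch_start_{idx}", on_step, on_epoch, prog_bar)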
- def on_train_epoch_end(self, trainer, pl_module):
+ def on_train_batch_end(self, _, pl_module, *__):
 self.make_logging(
- pl_module, 'on_train_epoch_end', 8, on_steps=[False], on_epochs=self.choices, prob_bars=self.choices
+ pl_module, 'on_train_batch_end', on_steps=self.choices, on_epochs=self.choices, prob_bars=self.choices
 )

- def on_epoch_end(self, trainer, pl_module):
+ def on_train_epoch_end(self, _, pl_module):
 self.make_logging(
- pl_module, 'on_epoch_end', 9, on_steps=[False], on_epochs=self.choices, prob_bars=self.choices
+ pl_module, 'on_train_epoch_end', on_steps=[False], on_epochs=[True], prob_bars=self.choices
 )

- class TestModel(BoringModel):
+ def on_epoch_end(self, _, pl_module):
+ self.make_logging(pl_module, 'on_epoch_end', on_steps=[False], on_epochs=[True], prob_bars=self.choices)

- manual_loss = []
+ class TestModel(BoringModel):
+ seen_losses = []

 def training_step(self, batch, batch_idx):
- output = self.layer(batch)
- loss = self.loss(batch, output)
- self.manual_loss.append(loss)
- self.log('train_loss', loss)
+ loss = super().training_step(batch, batch_idx)['loss']
+ self.seen_losses.append(loss)
+ self.log('train_loss', loss, prog_bar=True)
 return {"loss": loss}

- max_epochs = 2
- limit_train_batches = 2
 model = TestModel()
- test_callback = TestCallback()
-
+ cb = TestCallback()
 trainer = Trainer(
 default_root_dir=tmpdir,
- limit_train_batches=limit_train_batches,
+ limit_train_batches=2,
 limit_val_batches=0,
- limit_test_batches=0,
- val_check_interval=0.,
 num_sanity_val_steps=0,
- max_epochs=max_epochs,
- callbacks=[test_callback]
+ max_epochs=1,
+ callbacks=[cb]
 )
 trainer.fit(model)

- assert test_callback.funcs_called_count["on_train_start"] == 1
- assert test_callback.funcs_called_count["on_epoch_start"] == 2
- assert test_callback.funcs_called_count["on_train_epoch_start"] == 2
- assert test_callback.funcs_called_count["on_batch_end"] == 4
- assert test_callback.funcs_called_count["on_epoch_end"] == 2
- assert test_callback.funcs_called_count["on_train_batch_end"] == 4
- assert test_callback.funcs_called_count["on_epoch_end"] == 2
- assert test_callback.funcs_called_count["on_train_epoch_end"] == 2
-
- # Make sure the func_name exists within callback_metrics.
If not, we missed some - callback_metrics_keys = [*trainer.callback_metrics.keys()] - for func_name in test_callback.callback_funcs_called.keys(): - is_in = False - for callback_metrics_key in callback_metrics_keys: - if func_name in callback_metrics_key: - is_in = True - assert is_in, (func_name, callback_metrics_keys) - - # function used to describe expected return logic - def get_expected_output(func_attr, original_values): - if func_attr["on_epoch"] and not func_attr["on_step"]: - # Apply mean on values - expected_output = np.mean(original_values) - else: - # Keep the latest value - expected_output = np.max(original_values) - return expected_output - # Make sure the func_name output equals the average from all logged values when on_epoch true - # pop extra keys - trainer.callback_metrics.pop("debug_epoch") - assert trainer.logged_metrics["train_loss"] == model.manual_loss[-1] - assert trainer.callback_metrics["train_loss"] == model.manual_loss[-1] - trainer.callback_metrics.pop("train_loss") + assert trainer.progress_bar_dict["train_loss"] == model.seen_losses[-1] + assert trainer.callback_metrics["train_loss"] == model.seen_losses[-1] - for func_name, output_value in trainer.callback_metrics.items(): - if torch.is_tensor(output_value): - output_value = output_value.item() - # get creation attr - func_attr = test_callback.funcs_attr[func_name] + assert cb.call_counter == { + 'on_train_start': 1, + 'on_epoch_start': 1, + 'on_train_epoch_start': 1, + 'on_train_batch_end': 2, + 'on_batch_end': 2, + 'on_train_epoch_end': 1, + 'on_epoch_end': 1 + } - # retrived orginal logged values - original_values = test_callback.callback_funcs_called[func_attr["func_name"]] + def get_expected(on_epoch, values): + reduction = np.mean if on_epoch else np.max + return reduction(values) - # compute expected output and compare to actual one - expected_output = get_expected_output(func_attr, original_values) - assert float(output_value) == float(expected_output) + for fx, value in trainer.callback_metrics.items(): + actual = value.item() + if fx not in cb.logged_arguments: + continue + on_epoch = cb.logged_arguments[fx]['on_epoch'] + values = cb.logged_values[fx] + expected = get_expected(on_epoch, values) + assert actual == expected - for func_name, func_attr in test_callback.funcs_attr.items(): - if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]: - assert func_name in trainer.logger_connector.progress_bar_metrics - else: - assert func_name not in trainer.logger_connector.progress_bar_metrics + for fx, attrs in cb.logged_arguments.items(): + should_include = attrs["prog_bar"] and attrs["on_step"] ^ attrs["on_epoch"] + is_included = fx in trainer.logger_connector.progress_bar_metrics + assert is_included if should_include else not is_included -def test_logging_sync_dist_true_cpu(tmpdir): +@pytest.mark.parametrize('gpus', [None, pytest.param(1, marks=RunIf(min_gpus=1))]) +def test_logging_sync_dist_true(tmpdir, gpus): """ - Tests to ensure that the sync_dist flag works with CPU (should just return the original value) + Tests to ensure that the sync_dist flag works (should just return the original value) """ fake_result = 1 class TestModel(BoringModel): def training_step(self, batch, batch_idx): - acc = self.step(batch[0]) - self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') - self.log('foo_2', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') - return acc + self.log('foo', 
fake_result, on_step=False, on_epoch=True, sync_dist=True, reduce_fx='sum') + self.log('foo_2', 2, on_step=False, on_epoch=True, sync_dist=True, reduce_fx='sum') + return super().training_step(batch, batch_idx) def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') - return {"x": loss} + self.log('bar', fake_result, on_step=False, on_epoch=True, sync_dist=True, reduce_fx='sum') + return super().validation_step(batch, batch_idx) model = TestModel() trainer = Trainer( @@ -719,6 +480,7 @@ def validation_step(self, batch, batch_idx): limit_val_batches=1, max_epochs=2, weights_summary=None, + gpus=gpus, ) trainer.fit(model) @@ -737,15 +499,14 @@ class TestLoggingSyncDistModel(BoringModel): def training_step(self, batch, batch_idx): acc = self.step(batch[0]) - self.log('foo', 1, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='SUM') + self.log('foo', 1, on_step=False, on_epoch=True, sync_dist=True, reduce_fx='SUM') self.log('cho', acc, on_step=False, on_epoch=True) return acc def validation_step(self, batch, batch_idx): - self.training_step_called = True output = self.layer(batch) loss = self.loss(batch, output) - self.log('bar', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='AVG') + self.log('bar', 2, on_step=False, on_epoch=True, sync_dist=True, reduce_fx='AVG') return {"x": loss} model = TestLoggingSyncDistModel() @@ -765,41 +526,6 @@ def validation_step(self, batch, batch_idx): assert trainer.logged_metrics['bar'] == 2 -@RunIf(min_gpus=1) -def test_logging_sync_dist_true_gpu(tmpdir): - """ - Tests to ensure that the sync_dist flag works with GPU (should just return the original value) - """ - fake_result = 1 - - class TestModel(BoringModel): - - def training_step(self, batch, batch_idx): - acc = self.step(batch[0]) - self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') - return acc - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') - return {"x": loss} - - model = TestModel() - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=1, - limit_val_batches=1, - max_epochs=2, - gpus=1, - weights_summary=None, - ) - trainer.fit(model) - - assert trainer.logged_metrics['foo'] == fake_result - assert trainer.logged_metrics['bar'] == fake_result - - def test_progress_bar_dict_contains_values_on_train_epoch_end(tmpdir): class TestModel(BoringModel): @@ -809,21 +535,20 @@ def training_step(self, *args): return super().training_step(*args) def on_train_epoch_end(self, *_): - self.on_train_epoch_end_called = True - self.epoch_end_called = True self.log( 'foo_2', torch.tensor(self.current_epoch), prog_bar=True, on_epoch=True, sync_dist=True, - sync_dist_op='sum' + reduce_fx='sum' ) + self.on_train_epoch_end_called = True def on_epoch_end(self): - self.epoch_end_called = True assert self.trainer.progress_bar_dict["foo"] == self.current_epoch assert self.trainer.progress_bar_dict["foo_2"] == self.current_epoch + self.on_epoch_end_called = True trainer = Trainer( default_root_dir=tmpdir, @@ -837,8 +562,8 @@ def on_epoch_end(self): ) model = TestModel() trainer.fit(model) - assert model.epoch_end_called assert model.on_train_epoch_end_called + assert model.on_epoch_end_called def 
test_logging_in_callbacks_with_log_function(tmpdir): @@ -934,3 +659,135 @@ def validation_step(self, batch, batch_idx): assert trainer.callback_metrics["val_acc"] == 8 / 32. assert "train_loss" in trainer.callback_metrics + + +@pytest.mark.parametrize( + 'value', + [None, dict(a=None), + dict(a=dict(b=None)), + dict(a=dict(b=1)), 'foo', [1, 2, 3], (1, 2, 3), [[1, 2], 3]] +) +def test_log_none_raises(tmpdir, value): + + class TestModel(BoringModel): + + def training_step(self, *args): + self.log("foo", value) + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + model = TestModel() + match = escape(f"self.log(foo, {value})` was called") + with pytest.raises(ValueError, match=match): + trainer.fit(model) + + +def test_logging_raises(tmpdir): + + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx): + self.log('foo/dataloader_idx_0', -1) + + trainer = Trainer(default_root_dir=tmpdir) + model = TestModel() + with pytest.raises(MisconfigurationException, match='`self.log` with the key `foo/dataloader_idx_0`'): + trainer.fit(model) + + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx): + self.log('foo', Accuracy()) + + trainer = Trainer(default_root_dir=tmpdir) + model = TestModel() + with pytest.raises(MisconfigurationException, match='fix this by setting an attribute for the metric in your'): + trainer.fit(model) + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.bar = Accuracy() + + def training_step(self, batch, batch_idx): + self.log('foo', Accuracy()) + + trainer = Trainer(default_root_dir=tmpdir) + model = TestModel() + with pytest.raises( + MisconfigurationException, + match=r"`self.log\(foo, ..., metric_attribute=name\)` where `name` is one of \['bar'\]" + ): + trainer.fit(model) + + class TestModel(BoringModel): + + def training_step(self, *args): + self.log('foo', -1, prog_bar=False) + self.log('foo', -1, prog_bar=True) + return super().training_step(*args) + + trainer = Trainer(default_root_dir=tmpdir) + model = TestModel() + with pytest.raises(MisconfigurationException, match=r'self.log\(foo, ...\)` twice in `training_step`'): + trainer.fit(model) + + class TestModel(BoringModel): + + def training_step(self, *args): + self.log('foo', -1, reduce_fx=torch.argmax) + return super().training_step(*args) + + trainer = Trainer(default_root_dir=tmpdir) + model = TestModel() + with pytest.raises(MisconfigurationException, match=r'reduce_fx={min,max,mean,sum}\)` are currently supported'): + trainer.fit(model) + + +def test_sanity_metrics_are_reset(tmpdir): + + class TestModel(BoringModel): + + def validation_step(self, batch, batch_idx): + output = super().validation_step(batch, batch_idx) + if self.trainer.sanity_checking: + self.log("val_loss", output["x"], prog_bar=True, logger=True) + return output + + def training_step(self, batch, batch_idx): + loss = super().training_step(batch, batch_idx) + if batch_idx == 0: + assert self.trainer.logger_connector._progress_bar_metrics == {} + assert self.trainer.logger_connector._logged_metrics == {} + assert self.trainer.logger_connector._callback_metrics == {} + self.log("train_loss", loss, prog_bar=True, logger=True) + return loss + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=1, + limit_val_batches=2, + num_sanity_val_steps=2, + ) + trainer.fit(TestModel()) + + assert "val_loss" not in trainer.progress_bar_metrics + + +@RunIf(min_gpus=2) +def test_log_gpu_memory_without_logging_on_step(tmpdir): + + model = 
BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=1, + limit_val_batches=0, + log_gpu_memory='all', + log_every_n_steps=1, + gpus=[1] + ) + trainer.fit(model) + + assert 'gpu_id: 1/memory.used (MB)' in trainer.logged_metrics diff --git a/tests/trainer/loops/test_evaluation_loop.py b/tests/trainer/loops/test_evaluation_loop.py index 278ed8619d0de..2a0f95a19209b 100644 --- a/tests/trainer/loops/test_evaluation_loop.py +++ b/tests/trainer/loops/test_evaluation_loop.py @@ -13,11 +13,15 @@ # limitations under the License. from unittest import mock +import torch +from torch.utils.data import DataLoader + from pytorch_lightning import Trainer -from tests.helpers.boring_model import BoringModel +from tests.helpers.boring_model import BoringModel, RandomDataset +from tests.helpers.runif import RunIf -@mock.patch("pytorch_lightning.trainer.evaluation_loop.EvaluationLoop.on_evaluation_epoch_end") +@mock.patch("pytorch_lightning.loops.dataloader.evaluation_loop.EvaluationLoop.on_evaluation_epoch_end") def test_on_evaluation_epoch_end(eval_epoch_end_mock, tmpdir): """ Tests that `on_evaluation_epoch_end` is called @@ -43,12 +47,12 @@ def test_on_evaluation_epoch_end(eval_epoch_end_mock, tmpdir): @mock.patch( - "pytorch_lightning.trainer.connectors.logger_connector.logger_connector.LoggerConnector.get_evaluate_epoch_results" + "pytorch_lightning.trainer.connectors.logger_connector.logger_connector.LoggerConnector.update_eval_epoch_metrics" ) -def test_log_epoch_metrics_before_on_evaluation_end(get_evaluate_epoch_results_mock, tmpdir): +def test_log_epoch_metrics_before_on_evaluation_end(update_eval_epoch_metrics_mock, tmpdir): """Test that the epoch metrics are logged before the `on_evalutaion_end` hook is fired""" order = [] - get_evaluate_epoch_results_mock.side_effect = lambda: order.append("log_epoch_metrics") + update_eval_epoch_metrics_mock.side_effect = lambda: order.append("log_epoch_metrics") class LessBoringModel(BoringModel): @@ -65,3 +69,52 @@ def on_validation_end(self): trainer.fit(LessBoringModel()) assert order == ["log_epoch_metrics", "on_validation_end"] + + +@RunIf(min_gpus=1) +def test_memory_consumption_validation(tmpdir): + """Test that the training batch is no longer in GPU memory when running validation""" + + initial_memory = torch.cuda.memory_allocated(0) + + class BoringLargeBatchModel(BoringModel): + + @property + def num_params(self): + return sum(p.numel() for p in self.parameters()) + + def train_dataloader(self): + # batch target memory >= 100x boring_model size + batch_size = self.num_params * 100 // 32 + 1 + return DataLoader(RandomDataset(32, 5000), batch_size=batch_size) + + def val_dataloader(self): + return self.train_dataloader() + + def training_step(self, batch, batch_idx): + # there is a batch and the boring model, but not two batches on gpu, assume 32 bit = 4 bytes + lower = 101 * self.num_params * 4 + upper = 201 * self.num_params * 4 + current = torch.cuda.memory_allocated(0) + assert lower < current + assert current - initial_memory < upper + return super().training_step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + # there is a batch and the boring model, but not two batches on gpu, assume 32 bit = 4 bytes + lower = 101 * self.num_params * 4 + upper = 201 * self.num_params * 4 + current = torch.cuda.memory_allocated(0) + assert lower < current + assert current - initial_memory < upper + return super().validation_step(batch, batch_idx) + + torch.cuda.empty_cache() + trainer = Trainer( + 
gpus=1, + default_root_dir=tmpdir, + fast_dev_run=2, + move_metrics_to_cpu=True, + weights_summary=None, + ) + trainer.fit(BoringLargeBatchModel()) diff --git a/tests/trainer/loops/test_evaluation_loop_flow.py b/tests/trainer/loops/test_evaluation_loop_flow.py index 67ed756630734..14cb4ce4ae7f8 100644 --- a/tests/trainer/loops/test_evaluation_loop_flow.py +++ b/tests/trainer/loops/test_evaluation_loop_flow.py @@ -19,6 +19,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.trainer.states import RunningStage from tests.helpers.deterministic_model import DeterministicModel @@ -65,22 +66,20 @@ def backward(self, loss, optimizer, optimizer_idx): assert not model.validation_step_end_called assert not model.validation_epoch_end_called - # make sure training outputs what is expected - for batch_idx, batch in enumerate(model.train_dataloader()): - break - - out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) + # simulate training manually + trainer.state.stage = RunningStage.TRAINING + batch_idx, batch = 0, next(iter(model.train_dataloader())) + out = trainer.fit_loop.epoch_loop.batch_loop.run(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.grad_norm_dict) == 0 and isinstance(out.grad_norm_dict, dict) - train_step_out = out.training_step_output_for_epoch_end + train_step_out = out.training_step_output assert len(train_step_out) == 1 train_step_out = train_step_out[0][0] - assert isinstance(train_step_out['minimize'], torch.Tensor) - assert train_step_out['minimize'].item() == 171 + assert isinstance(train_step_out.minimize, torch.Tensor) + assert train_step_out.minimize.item() == 171 # make sure the optimizer closure returns the correct things - opt_closure_result = trainer.train_loop.training_step_and_backward( + opt_closure_result = trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward( batch, batch_idx, 0, @@ -138,22 +137,20 @@ def backward(self, loss, optimizer, optimizer_idx): assert model.validation_step_end_called assert not model.validation_epoch_end_called + trainer.state.stage = RunningStage.TRAINING # make sure training outputs what is expected - for batch_idx, batch in enumerate(model.train_dataloader()): - break - - out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) + batch_idx, batch = 0, next(iter(model.train_dataloader())) + out = trainer.fit_loop.epoch_loop.batch_loop.run(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.grad_norm_dict) == 0 and isinstance(out.grad_norm_dict, dict) - train_step_out = out.training_step_output_for_epoch_end + train_step_out = out.training_step_output assert len(train_step_out) == 1 train_step_out = train_step_out[0][0] - assert isinstance(train_step_out['minimize'], torch.Tensor) - assert train_step_out['minimize'].item() == 171 + assert isinstance(train_step_out.minimize, torch.Tensor) + assert train_step_out.minimize.item() == 171 # make sure the optimizer closure returns the correct things - opt_closure_result = trainer.train_loop.training_step_and_backward( + opt_closure_result = trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward( batch, batch_idx, 0, trainer.optimizers[0], hiddens=None ) assert opt_closure_result['loss'].item() == 171 diff --git a/tests/trainer/loops/test_training_loop.py b/tests/trainer/loops/test_training_loop.py index da4ecbe5a9f05..c0fde2983985d 100644 --- a/tests/trainer/loops/test_training_loop.py +++ b/tests/trainer/loops/test_training_loop.py @@ -11,10 +11,13 @@ # 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
+
 import pytest
 import torch

 from pytorch_lightning import seed_everything, Trainer
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel

@@ -105,10 +108,10 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
 trainer = Trainer(max_epochs=max_epochs, limit_train_batches=10)
 trainer.fit(model)
 if batch_idx_ > trainer.num_training_batches - 1:
- assert trainer.train_loop.batch_idx == trainer.num_training_batches - 1
+ assert trainer.fit_loop.batch_idx == trainer.num_training_batches - 1
 assert trainer.global_step == trainer.num_training_batches * max_epochs
 else:
- assert trainer.train_loop.batch_idx == batch_idx_
+ assert trainer.fit_loop.batch_idx == batch_idx_
 assert trainer.global_step == batch_idx_ * max_epochs

@@ -142,3 +145,43 @@ def validation_step(self, *args):
 assert trainer.current_epoch == 0
 assert trainer.global_step == 5
 assert model.validation_called_at == (0, 4)
+
+
+@pytest.mark.parametrize(['output'], [(5., ), ({'a': 5}, )])
+def test_warning_invalid_trainstep_output(tmpdir, output):
+
+ class InvalidTrainStepModel(BoringModel):
+
+ def training_step(self, batch, batch_idx):
+ return output
+
+ model = InvalidTrainStepModel()
+
+ trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1)
+ with pytest.raises(
+ MisconfigurationException,
+ match=re.escape(
+ "In automatic optimization, `training_step` must either return a Tensor, "
+ "a dict with key 'loss' or None (where the step will be skipped)."
+ )
+ ):
+ trainer.fit(model)
+
+
+def test_warning_valid_train_step_end(tmpdir):
+
+ class ValidTrainStepEndModel(BoringModel):
+
+ def training_step(self, batch, batch_idx):
+ output = self(batch)
+ return {'output': output, 'batch': batch}
+
+ def training_step_end(self, outputs):
+ loss = self.loss(outputs['batch'], outputs['output'])
+ return loss
+
+ # No error is raised
+ model = ValidTrainStepEndModel()
+ trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1)
+
+ trainer.fit(model)
diff --git a/tests/trainer/loops/test_training_loop_flow_scalar.py b/tests/trainer/loops/test_training_loop_flow_scalar.py
index 2f503b62f56ee..9b438aea45f87 100644
--- a/tests/trainer/loops/test_training_loop_flow_scalar.py
+++ b/tests/trainer/loops/test_training_loop_flow_scalar.py
@@ -11,10 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
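The new `test_warning_invalid_trainstep_output` above pins down the return contract enforced under automatic optimization. A sketch of the three accepted shapes (`compute_loss` is a hypothetical helper, not part of `BoringModel`):

    def training_step(self, batch, batch_idx):
        loss = self.compute_loss(batch)  # hypothetical helper returning a scalar Tensor
        # under automatic optimization, any one of the following is valid:
        return loss                      # a plain Tensor
        # return {'loss': loss}          # a dict that carries the key 'loss'
        # return None                    # skip the optimization step for this batch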
-""" -Tests to ensure that the training loop works with a dict (1.0) -""" - import pytest import torch from torch.utils.data import DataLoader @@ -22,6 +18,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.trainer.states import RunningStage from tests.helpers.boring_model import BoringModel, RandomDataset from tests.helpers.deterministic_model import DeterministicModel from tests.helpers.utils import no_warning_call @@ -149,22 +146,20 @@ def backward(self, loss, optimizer, optimizer_idx): assert len(trainer.logger_connector.callback_metrics) == 0 assert len(trainer.logger_connector.progress_bar_metrics) == 0 + trainer.state.stage = RunningStage.TRAINING # make sure training outputs what is expected - for batch_idx, batch in enumerate(model.train_dataloader()): - break - - out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) + batch_idx, batch = 0, next(iter(model.train_dataloader())) + out = trainer.fit_loop.epoch_loop.batch_loop.run(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.grad_norm_dict) == 0 and isinstance(out.grad_norm_dict, dict) - train_step_out = out.training_step_output_for_epoch_end + train_step_out = out.training_step_output assert len(train_step_out) == 1 train_step_out = train_step_out[0][0] - assert isinstance(train_step_out['minimize'], torch.Tensor) - assert train_step_out['minimize'].item() == 171 + assert isinstance(train_step_out.minimize, torch.Tensor) + assert train_step_out.minimize.item() == 171 # make sure the optimizer closure returns the correct things - opt_closure_result = trainer.train_loop.training_step_and_backward( + opt_closure_result = trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward( batch, batch_idx, 0, @@ -229,22 +224,20 @@ def backward(self, loss, optimizer, optimizer_idx): assert len(trainer.logger_connector.callback_metrics) == 0 assert len(trainer.logger_connector.progress_bar_metrics) == 0 + trainer.state.stage = RunningStage.TRAINING # make sure training outputs what is expected - for batch_idx, batch in enumerate(model.train_dataloader()): - break - - out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) + batch_idx, batch = 0, next(iter(model.train_dataloader())) + out = trainer.fit_loop.epoch_loop.batch_loop.run(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.grad_norm_dict) == 0 and isinstance(out.grad_norm_dict, dict) - train_step_out = out.training_step_output_for_epoch_end + train_step_out = out.training_step_output assert len(train_step_out) == 1 train_step_out = train_step_out[0][0] - assert isinstance(train_step_out['minimize'], torch.Tensor) - assert train_step_out['minimize'].item() == 171 + assert isinstance(train_step_out.minimize, torch.Tensor) + assert train_step_out.minimize.item() == 171 # make sure the optimizer closure returns the correct things - opt_closure_result = trainer.train_loop.training_step_and_backward( + opt_closure_result = trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward( batch, batch_idx, 0, trainer.optimizers[0], hiddens=None ) assert opt_closure_result['loss'].item() == 171 @@ -316,11 +309,13 @@ def training_step(self, batch, batch_idx): with pytest.warns(UserWarning, match=r'.*training_step returned None.*'): trainer.fit(model) + trainer.state.stage = RunningStage.TRAINING + # manually check a few batches for batch_idx, batch in enumerate(model.train_dataloader()): - out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) + out = 
trainer.fit_loop.epoch_loop.batch_loop.run(batch, batch_idx, 0) if not batch_idx % 2: - assert out.training_step_output_for_epoch_end == [[]] + assert out.training_step_output == [[]] assert out.signal == 0 @@ -359,9 +354,11 @@ def train_dataloader(self): with pytest.warns(UserWarning, match=r'.*train_dataloader yielded None.*'): trainer.fit(model) + trainer.state.stage = RunningStage.TRAINING + # manually check a few batches for batch_idx, batch in enumerate(model.train_dataloader()): - out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) + out = trainer.fit_loop.epoch_loop.batch_loop.run(batch, batch_idx, 0) if not batch_idx % 2: - assert out.training_step_output_for_epoch_end == [[]] + assert out.training_step_output == [[]] assert out.signal == 0 diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index ea8732a3958b2..75a509e07c26b 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -28,54 +28,56 @@ from tests.helpers.runif import RunIf -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def test_multiple_optimizers_manual_no_return(tmpdir): - """ - Tests that only training_step can be used - """ +class ManualOptModel(BoringModel): - class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False - def __init__(self): - super().__init__() - self.automatic_optimization = False + def training_step(self, batch, batch_idx): + opt_a, opt_b = self.optimizers() - def training_step(self, batch, batch_idx): - # manual - opt_a, opt_b = self.optimizers() - loss_1 = self.step(batch[0]) + # make sure there are no grads + if batch_idx > 0: + assert torch.all(self.layer.weight.grad == 0) - # make sure there are no grads - if batch_idx > 0: - assert torch.all(self.layer.weight.grad == 0) + loss_1 = self.step(batch[0]) + self.manual_backward(loss_1, opt_a) + opt_a.step() + opt_a.zero_grad() + assert torch.all(self.layer.weight.grad == 0) - self.manual_backward(loss_1, opt_a) - opt_a.step() - opt_a.zero_grad() - assert torch.all(self.layer.weight.grad == 0) + loss_2 = self.step(batch[0]) + # ensure we forward the correct params to the optimizer + # without retain_graph we can't do multiple backward passes + self.manual_backward(loss_2, opt_b, retain_graph=True) + self.manual_backward(loss_2, opt_a) + assert self.layer.weight.grad is not None + opt_b.step() + opt_b.zero_grad() + assert torch.all(self.layer.weight.grad == 0) - # fake discriminator - loss_2 = self.step(batch[0]) + return loss_2 - # ensure we forward the correct params to the optimizer - # without retain_graph we can't do multiple backward passes - self.manual_backward(loss_2, opt_b, retain_graph=True) - self.manual_backward(loss_2, opt_a, retain_graph=True) + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return optimizer, optimizer_2 - assert self.layer.weight.grad is not None - opt_b.step() - opt_b.zero_grad() - assert torch.all(self.layer.weight.grad == 0) + +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) +def test_multiple_optimizers_manual_no_return(tmpdir): + + class TestModel(ManualOptModel): + + def training_step(self, batch, batch_idx): + # avoid returning a value + super().training_step(batch, batch_idx) def training_epoch_end(self, outputs) -> None: # outputs is empty as training_step does not return 
# and it is not automatic optimization - assert len(outputs) == 0 - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 + assert not outputs model = TestModel() model.val_dataloader = None @@ -98,53 +100,16 @@ def configure_optimizers(self): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_multiple_optimizers_manual_return(tmpdir): - """ - Tests that only training_step can be used - """ - class TestModel(BoringModel): - - def __init__(self): - super().__init__() - self.automatic_optimization = False + class TestModel(ManualOptModel): def training_step(self, batch, batch_idx): - # manual - opt_a, opt_b = self.optimizers() - loss_1 = self.step(batch[0]) - - # make sure there are no grads - if batch_idx > 0: - assert torch.all(self.layer.weight.grad == 0) - - self.manual_backward(loss_1, opt_a) - opt_a.step() - opt_a.zero_grad() - assert torch.all(self.layer.weight.grad == 0) - - # fake discriminator - loss_2 = self.step(batch[0]) - - # ensure we forward the correct params to the optimizer - # without retain_graph we can't do multiple backward passes - self.manual_backward(loss_2, opt_b, retain_graph=True) - self.manual_backward(loss_2, opt_a, retain_graph=True) - - assert self.layer.weight.grad is not None - opt_b.step() - opt_b.zero_grad() - assert torch.all(self.layer.weight.grad == 0) - + super().training_step(batch, batch_idx) return {'something': 'else'} def training_epoch_end(self, outputs) -> None: # outputs should be an array with an entry per optimizer - assert len(outputs) == 2 - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 + assert outputs == [{'something': 'else'}, {'something': 'else'}] model = TestModel() model.val_dataloader = None @@ -166,55 +131,16 @@ def configure_optimizers(self): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def test_multiple_optimizers_manual_return_and_log(tmpdir): - """ - Tests that only training_step can be used - """ - - class TestModel(BoringModel): +def test_multiple_optimizers_manual_log(tmpdir): - def __init__(self): - super().__init__() - self.automatic_optimization = False + class TestModel(ManualOptModel): def training_step(self, batch, batch_idx): - # manual - opt_a, opt_b = self.optimizers() - loss_1 = self.step(batch[0]) - - # make sure there are no grads - if batch_idx > 0: - assert torch.all(self.layer.weight.grad == 0) - - self.manual_backward(loss_1, opt_a) - opt_a.step() - opt_a.zero_grad() - assert torch.all(self.layer.weight.grad == 0) - - # fake discriminator - loss_2 = self.step(batch[0]) - - # ensure we forward the correct params to the optimizer - # without retain_graph we can't do multiple backward passes - self.manual_backward(loss_2, opt_b, retain_graph=True) - self.manual_backward(loss_2, opt_a, retain_graph=True) + loss_2 = super().training_step(batch, batch_idx) self.log('a', loss_2, on_epoch=True) - assert self.layer.weight.grad is not None - opt_b.step() - opt_b.zero_grad() - assert torch.all(self.layer.weight.grad == 0) - - return {'something': 'else'} - def training_epoch_end(self, outputs) -> None: - # outputs should be an array with an entry per optimizer - assert len(outputs) == 2 - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = 
torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 + assert not outputs model = TestModel() model.val_dataloader = None @@ -234,62 +160,13 @@ def configure_optimizers(self): num_manual_backward_calls = 3 assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls - expected = {'a_step', 'a_epoch', 'epoch'} - logged = set(trainer.logged_metrics.keys()) - assert expected == logged + assert set(trainer.logged_metrics) == {'a_step', 'a_epoch', 'epoch'} @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @RunIf(min_gpus=1) def test_multiple_optimizers_manual_native_amp(tmpdir): - """ - Tests that only training_step can be used - """ - - class TestModel(BoringModel): - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def training_step(self, batch, batch_idx): - # manual - opt_a, opt_b = self.optimizers() - loss_1 = self.step(batch[0]) - - # make sure there are no grads - if batch_idx > 0: - assert torch.all(self.layer.weight.grad == 0) - - self.manual_backward(loss_1, opt_a) - opt_a.step() - opt_a.zero_grad() - assert torch.all(self.layer.weight.grad == 0) - - # fake discriminator - loss_2 = self.step(batch[0]) - - # ensure we forward the correct params to the optimizer - # without retain_graph we can't do multiple backward passes - self.manual_backward(loss_2, opt_b, retain_graph=True) - self.manual_backward(loss_2, opt_a, retain_graph=True) - - assert self.layer.weight.grad is not None - opt_b.step() - opt_b.zero_grad() - assert torch.all(self.layer.weight.grad == 0) - - def training_epoch_end(self, outputs) -> None: - # outputs is empty as training_step does not return - # and it is not automatic optimization - assert len(outputs) == 0 - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 - - model = TestModel() + model = ManualOptModel() model.val_dataloader = None limit_train_batches = 2 @@ -313,57 +190,18 @@ def configure_optimizers(self): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @RunIf(min_gpus=1, amp_apex=True) def test_multiple_optimizers_manual_apex_no_return(tmpdir): - """ - Tests that only training_step can be used - """ - class TestModel(BoringModel): - - def __init__(self): - super().__init__() - self.automatic_optimization = False + class TestModel(ManualOptModel): def training_step(self, batch, batch_idx): - # manual - opt_a, opt_b = self.optimizers() - x = batch[0] - - loss_1 = self(x) - loss_1 = self.loss(loss_1, loss_1) - - # make sure there are no grads - if batch_idx > 0: - assert torch.all(self.layer.weight.grad == 0) - - self.manual_backward(loss_1, opt_a) - opt_a.step() - opt_a.zero_grad() - assert torch.all(self.layer.weight.grad == 0) - - # fake discriminator - loss_2 = self(x) - loss_2 = self.loss(loss_2, loss_2) - - # ensure we forward the correct params to the optimizer - # without retain_graph we can't do multiple backward passes - self.manual_backward(loss_2, retain_graph=True) - self.manual_backward(loss_2) - - assert self.layer.weight.grad is not None - opt_b.step() - opt_b.zero_grad() - assert torch.all(self.layer.weight.grad == 0) + # avoid returning a value + super().training_step(batch, batch_idx) def training_epoch_end(self, outputs) -> None: # outputs is empty as training_step does not return # and it is not automatic optimization assert len(outputs) == 0 - def configure_optimizers(self): - 
optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 - model = TestModel() model.val_dataloader = None @@ -586,7 +424,6 @@ def on_train_epoch_end(self, *_, **__): limit_val_batches=0, precision=16, amp_backend='native', - accumulate_grad_batches=4, gpus=1, ) trainer.fit(model) @@ -599,14 +436,10 @@ def test_multiple_optimizers_step(tmpdir): Tests that `step` works with several optimizers """ - class TestModel(BoringModel): + class TestModel(ManualOptModel): called = False - def __init__(self): - super().__init__() - self.automatic_optimization = False - def on_after_backward(self): self.called = True norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) @@ -641,17 +474,12 @@ def training_step(self, batch, batch_idx): opt_b.step() opt_b.zero_grad() - return {'loss1': loss_1, 'loss2': loss_2} + return {'loss1': loss_1.detach(), 'loss2': loss_2.detach()} def training_epoch_end(self, outputs) -> None: # outputs should be an array with an entry per optimizer assert len(outputs) == 2 - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 - model = TestModel() model.val_dataloader = None @@ -730,8 +558,7 @@ def optimizer_closure(): assert not torch.equal(weight_before, weight_after) def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer + return torch.optim.SGD(self.layer.parameters(), lr=0.1) model = TestModel() model.val_dataloader = None @@ -784,14 +611,13 @@ def optimizer_closure(): opt.step(closure=optimizer_closure) weight_after = self.layer.weight.clone() - if not self.trainer.train_loop.should_accumulate(): + if not self.trainer.fit_loop.should_accumulate(): assert not torch.equal(weight_before, weight_after) else: assert self.layer.weight.grad is not None def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer + return torch.optim.SGD(self.layer.parameters(), lr=0.1) model = TestModel() model.val_dataloader = None @@ -804,14 +630,12 @@ def configure_optimizers(self): limit_val_batches=2, max_epochs=1, log_every_n_steps=1, - accumulate_grad_batches=2, ) trainer.fit(model) assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * 2 -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @patch("torch.optim.SGD.step") def test_step_with_optimizer_closure_and_extra_arguments(step_mock, tmpdir): """ @@ -843,8 +667,7 @@ def optimizer_closure(): opt.zero_grad() def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer + return torch.optim.SGD(self.layer.parameters(), lr=0.1) model = TestModel() model.val_dataloader = None @@ -857,11 +680,10 @@ def configure_optimizers(self): limit_val_batches=2, max_epochs=1, log_every_n_steps=1, - accumulate_grad_batches=2, ) trainer.fit(model) - expected_calls = [call(closure=ANY) for s in range(2)] + expected_calls = [call(closure=ANY) for _ in range(2)] step_mock.assert_has_calls(expected_calls) @@ -932,7 +754,6 @@ def configure_optimizers(self): limit_val_batches=2, max_epochs=1, log_every_n_steps=1, - accumulate_grad_batches=2, ) trainer.fit(model) @@ -1042,7 +863,6 @@ def train_manual_optimization(tmpdir, accelerator, model_cls=TesManualOptimizati limit_val_batches=2, max_epochs=1, log_every_n_steps=1, - 
accumulate_grad_batches=2,
 gpus=2,
 accelerator=accelerator,
 callbacks=[TestManualOptimizationDDPCallack()]
@@ -1274,9 +1094,5 @@ def configure_optimizers(self):
 trainer.fit(model)

- expected = {'epoch', 'loss_d', 'loss_g'}
- logged = set(trainer.logged_metrics.keys())
- assert expected == logged
- expected = {'loss_d', 'loss_g'}
- logged = set(trainer.progress_bar_metrics.keys())
- assert expected == logged
+ assert set(trainer.logged_metrics) == {'epoch', 'loss_d', 'loss_g'}
+ assert set(trainer.progress_bar_metrics) == {'loss_d', 'loss_g'}
diff --git a/tests/trainer/optimization/test_multiple_optimizers.py b/tests/trainer/optimization/test_multiple_optimizers.py
index aba3b53248a57..495f51ab8d394 100644
--- a/tests/trainer/optimization/test_multiple_optimizers.py
+++ b/tests/trainer/optimization/test_multiple_optimizers.py
@@ -30,10 +30,7 @@ def configure_optimizers(self):
 def test_unbalanced_logging_with_multiple_optimizers(tmpdir):
- """
- This tests ensures reduction works in unbalanced logging settings,
- even when a Callback also logs.
- """
+ """This test ensures reduction works in unbalanced logging settings"""

 class TestModel(MultiOptModel):

@@ -49,22 +46,12 @@ def training_step(self, batch, batch_idx, optimizer_idx):
 model = TestModel()
 model.training_epoch_end = None

- class TestCallback(pl.Callback):
-
- def on_train_batch_end(self, trainer, pl_module, output, batch, batch_idx, dl_idx):
- # when this is called, the EpochResultStore state has not been reset yet because we are still
- # "INSIDE_BATCH_TRAIN_LOOP" and the LoggerConnector runs its `on_train_batch_end` after the
- # Callback (see `TrainLoop.on_train_batch_end`). For this reason, opt_idx here is the index
- # of the last optimizer updated (the second, index 1). This produced a KeyError as reported in #5459
- pl_module.log("test_train_batch_end", trainer.logger_connector.cached_results._opt_idx)
-
 # Initialize a trainer
 trainer = pl.Trainer(
 default_root_dir=tmpdir,
 max_epochs=1,
 limit_train_batches=5,
 limit_val_batches=5,
- callbacks=[TestCallback()],
 weights_summary=None,
 )
 trainer.fit(model)
@@ -74,8 +61,6 @@ def on_train_batch_end(self, trainer, pl_module, output, batch, batch_idx, dl_id
 # test loss is properly reduced
 torch.testing.assert_allclose(trainer.callback_metrics[f"loss_{k}_epoch"], torch.tensor(v).mean())

- assert trainer.callback_metrics["test_train_batch_end"] == len(model.optimizers()) - 1
-

 def test_multiple_optimizers(tmpdir):

diff --git a/tests/trainer/optimization/test_optimizers.py b/tests/trainer/optimization/test_optimizers.py
index a81e0eecf5c61..6165aa132153b 100644
--- a/tests/trainer/optimization/test_optimizers.py
+++ b/tests/trainer/optimization/test_optimizers.py
@@ -18,6 +18,7 @@
 from torch import optim

 from pytorch_lightning import Callback, Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import EvalModelTemplate
 from tests.helpers.boring_model import BoringModel
@@ -620,3 +621,87 @@ def test_lr_scheduler_epoch_step_frequency(mocked_sched, check_val_every_n_epoch
 )
 trainer.fit(model)
 assert mocked_sched.call_count == expected_steps
+
+
+@pytest.mark.parametrize('every_n_train_steps, epoch_interval', [(None, True), (2, False), (2, True)])
+def test_lr_scheduler_state_updated_before_saving(tmpdir, every_n_train_steps, epoch_interval):
+ batches = 2
+ max_epochs = 1
+ lr, gamma = 1, 10
+ trainer = Trainer(
+ default_root_dir=tmpdir,
+ progress_bar_refresh_rate=0,
+ logger=False,
+ max_epochs=max_epochs, + limit_train_batches=batches, + limit_val_batches=1, + callbacks=[ModelCheckpoint(dirpath=tmpdir, every_n_train_steps=every_n_train_steps)] + ) + + class TestModel(BoringModel): + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.parameters(), lr=lr) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma) + lr_dict = {'scheduler': lr_scheduler} + if not epoch_interval: + lr_dict['interval'] = 'step' + return [optimizer], [lr_dict] + + def on_save_checkpoint(self, checkpoint): + lr_dict = checkpoint['lr_schedulers'][0] + # 2 batches ran. since the lr_dict interval is `step`, the step count should be 2 + assert self.trainer.global_step + 1 == batches # the global step hasn't been increased yet + compare_to = max_epochs if epoch_interval else batches + assert lr_dict['_step_count'] - 1 == compare_to # step count starts at 1 + assert lr_dict['_last_lr'] == [lr * gamma**compare_to] + self.on_save_checkpoint_called = True + + model = TestModel() + trainer.fit(model) + assert model.on_save_checkpoint_called + + +def test_plateau_scheduler_lr_step_interval_updated_after_saving(tmpdir): + batches = 4 + trainer = Trainer( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + logger=False, + max_epochs=1, + limit_train_batches=batches, + limit_val_batches=1, + callbacks=[ModelCheckpoint(dirpath=tmpdir)] + ) + + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx, optimizer_idx): + self.log("foo", batch_idx) + return super().training_step(batch, batch_idx) + + def configure_optimizers(self): + optimizer_1 = torch.optim.Adam(self.parameters()) + optimizer_2 = torch.optim.Adam(self.parameters()) + + lr_scheduler1 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_1) + lr_dict_1 = {'scheduler': lr_scheduler1, 'interval': 'step', 'monitor': 'foo'} + + lr_scheduler2 = torch.optim.lr_scheduler.StepLR(optimizer_2, step_size=1) + lr_dict_2 = {'scheduler': lr_scheduler2, 'interval': 'step'} + return [optimizer_1, optimizer_2], [lr_dict_1, lr_dict_2] + + def on_save_checkpoint(self, checkpoint): + lr_dict_1 = checkpoint['lr_schedulers'][0] + # since plateau schedulers are updated after saving checkpoint, last_epoch should be 3 + assert lr_dict_1['last_epoch'] == batches - 1 # last epoch starts at 0 + + lr_dict_2 = checkpoint['lr_schedulers'][1] + assert lr_dict_2['_step_count'] - 1 == batches # step count starts at 1 + + self.on_save_checkpoint_called = True + + model = TestModel() + model.training_epoch_end = None + trainer.fit(model) + assert model.on_save_checkpoint_called diff --git a/tests/trainer/test_config_validator.py b/tests/trainer/test_config_validator.py index 9fccd9b36440a..6762d65f41bab 100644 --- a/tests/trainer/test_config_validator.py +++ b/tests/trainer/test_config_validator.py @@ -128,17 +128,13 @@ def test_dataloader(self): def predict_dataloader(self): return self._dataloaders - dataloaders = [torch.utils.data.DataLoader(RandomDataset(32, 2)), torch.utils.data.DataLoader(RandomDataset(32, 2))] + data = [torch.utils.data.DataLoader(RandomDataset(32, 2)), torch.utils.data.DataLoader(RandomDataset(32, 2))] + if datamodule: + data = TestLightningDataModule(data) model = TestModel() - trainer = Trainer(default_root_dir=tmpdir) - - if datamodule: - datamodule = TestLightningDataModule(dataloaders) - results = trainer.predict(model, datamodule=datamodule) - else: - results = trainer.predict(model, dataloaders=dataloaders) + results = trainer.predict(model, data) assert len(results) == 2 
assert results[0][0].shape == torch.Size([1, 2]) @@ -147,3 +143,17 @@ def predict_dataloader(self): with pytest.raises(MisconfigurationException, match="Dataloader not found for `Trainer.predict`"): trainer.predict(model) + + +def test_trainer_manual_optimization_config(tmpdir): + """ Test error message when requesting Trainer features unsupported with manual optimization """ + model = BoringModel() + model.automatic_optimization = False + + trainer = Trainer(gradient_clip_val=1.0) + with pytest.raises(MisconfigurationException, match="Automatic gradient clipping is not supported"): + trainer.fit(model) + + trainer = Trainer(accumulate_grad_batches=2) + with pytest.raises(MisconfigurationException, match="Automatic gradient accumulation is not supported"): + trainer.fit(model) diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 831fc474336b6..5d4da1be7ddbe 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -98,9 +98,13 @@ def check_replace_distributed_sampler(tmpdir, save_preds_on_dl_idx, accelerator, @RunIf(min_gpus=2, special=True) -@pytest.mark.parametrize("mode", [1, 2]) -def test_replace_distributed_sampler_custom_dataloader_custom_batch_sampler(tmpdir, mode): - check_replace_distributed_sampler(tmpdir, True, "ddp", 2, 2, mode) +def test_replace_distributed_sampler_custom_dataloader_custom_batch_sampler_0(tmpdir): + check_replace_distributed_sampler(tmpdir, True, "ddp", 2, 2, mode=1) + + +@RunIf(min_gpus=2, special=True) +def test_replace_distributed_sampler_custom_dataloader_custom_batch_sampler_1(tmpdir): + check_replace_distributed_sampler(tmpdir, True, "ddp", 2, 2, mode=2) @pytest.mark.parametrize("num_workers", [0, 1]) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index c2e5e1c24ac78..14f47a2558eff 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -813,11 +813,11 @@ def test_missing_worker_init_fn(): seed_everything(0) dataloader = DataLoader(dataset, batch_size=2, num_workers=2, shuffle=False) - batches0 = torch.cat([batch for batch in dataloader]) + batches0 = torch.cat(list(dataloader)) seed_everything(0) dataloader = DataLoader(dataset, batch_size=2, num_workers=2, shuffle=False) - batches1 = torch.cat([batch for batch in dataloader]) + batches1 = torch.cat(list(dataloader)) is_duplicated = len(torch.unique(batches1, dim=0)) < len(dataset) is_deterministic = torch.eq(batches0, batches1).all() @@ -895,6 +895,25 @@ def test_auto_add_worker_init_fn_distributed(tmpdir, monkeypatch): trainer.fit(model, train_dataloader=dataloader) +def test_warning_with_small_dataloader_and_logging_interval(tmpdir): + """ Test that a warning message is shown if the dataloader length is too short for the chosen logging interval. 
""" + model = BoringModel() + dataloader = DataLoader(RandomDataset(32, length=10)) + model.train_dataloader = lambda: dataloader + + with pytest.warns(UserWarning, match=r"The number of training samples \(10\) is smaller than the logging interval"): + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + log_every_n_steps=11, + ) + trainer.fit(model) + + with pytest.warns(UserWarning, match=r"The number of training samples \(1\) is smaller than the logging interval"): + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, log_every_n_steps=2, limit_train_batches=1) + trainer.fit(model) + + def test_warning_with_iterable_dataset_and_len(tmpdir): """ Tests that a warning message is shown when an IterableDataset defines `__len__`. """ model = BoringModel() diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 7db052218eb39..8c287e8cb37d1 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -13,7 +13,15 @@ # limitations under the License. import pytest -from pytorch_lightning.trainer.progress import LoopProgress, Progress, Tracker +from pytorch_lightning.trainer.progress import ( + BatchProgress, + EpochLoopProgress, + EpochProgress, + FitLoopProgress, + OptimizerProgress, + Progress, + Tracker, +) def test_progress_geattr_setattr(): @@ -60,51 +68,151 @@ def test_base_progress_from_defaults(): assert actual == expected -def test_loop_progress_increment_epoch(): - p = LoopProgress() +def test_epoch_loop_progress_increment_epoch(): + p = EpochLoopProgress() p.increment_epoch_completed() p.increment_epoch_completed() assert p.epoch.total == Tracker(completed=2) assert p.epoch.current == Tracker() - assert p.batch.current == Tracker() - - -def test_loop_progress_increment_sequence(): - """ Test sequences for incrementing batches reads and epochs. 
""" - p = LoopProgress(batch=Progress(total=Tracker(started=None))) - - p.batch.increment_ready() - assert p.batch.total == Tracker(ready=1, started=None) - assert p.batch.current == Tracker(ready=1) - - p.batch.increment_started() - assert p.batch.total == Tracker(ready=1, started=None) - assert p.batch.current == Tracker(ready=1) - - p.batch.increment_processed() - assert p.batch.total == Tracker(ready=1, started=None, processed=1) - assert p.batch.current == Tracker(ready=1, processed=1) - - p.batch.increment_completed() - assert p.batch.total == Tracker(ready=1, started=None, processed=1, completed=1) - assert p.batch.current == Tracker(ready=1, processed=1, completed=1) - - assert p.epoch.total == Tracker() - assert p.epoch.current == Tracker() - p.increment_epoch_completed() - assert p.batch.total == Tracker(ready=1, started=None, processed=1, completed=1) - assert p.batch.current == Tracker() - assert p.epoch.total == Tracker(completed=1) - assert p.epoch.current == Tracker() - - p.batch.increment_ready() - assert p.batch.total == Tracker(ready=2, started=None, processed=1, completed=1) - assert p.batch.current == Tracker(ready=1) - assert p.epoch.total == Tracker(completed=1) - assert p.epoch.current == Tracker() - - p.reset_on_epoch() - assert p.batch.total == Tracker(ready=2, started=None, processed=1, completed=1) - assert p.batch.current == Tracker() - assert p.epoch.total == Tracker(completed=1) - assert p.epoch.current == Tracker() + assert p.epoch.batch.current == Tracker() + + +def test_epoch_loop_progress_increment_sequence(): + """Test sequences for incrementing batches reads and epochs.""" + batch = BatchProgress(total=Tracker(started=None)) + epoch = EpochProgress(batch=batch) + loop = EpochLoopProgress(epoch=epoch) + + batch.increment_ready() + assert batch.total == Tracker(ready=1, started=None) + assert batch.current == Tracker(ready=1) + + batch.increment_started() + assert batch.total == Tracker(ready=1, started=None) + assert batch.current == Tracker(ready=1) + + batch.increment_processed() + assert batch.total == Tracker(ready=1, started=None, processed=1) + assert batch.current == Tracker(ready=1, processed=1) + + batch.increment_completed() + assert batch.total == Tracker(ready=1, started=None, processed=1, completed=1) + assert batch.current == Tracker(ready=1, processed=1, completed=1) + + assert epoch.total == Tracker() + assert epoch.current == Tracker() + loop.increment_epoch_completed() + assert batch.total == Tracker(ready=1, started=None, processed=1, completed=1) + assert batch.current == Tracker() + assert epoch.total == Tracker(completed=1) + assert epoch.current == Tracker() + + batch.increment_ready() + assert batch.total == Tracker(ready=2, started=None, processed=1, completed=1) + assert batch.current == Tracker(ready=1) + assert epoch.total == Tracker(completed=1) + assert epoch.current == Tracker() + + loop.reset_on_epoch() + assert batch.total == Tracker(ready=2, started=None, processed=1, completed=1) + assert batch.current == Tracker() + assert epoch.total == Tracker(completed=1) + assert epoch.current == Tracker() + + +def test_optimizer_progress_default_factory(): + """ + Ensure that the defaults are created appropiately. If `default_factory` was not used, the default would + be shared between instances. 
+ """ + p1 = OptimizerProgress() + p2 = OptimizerProgress() + p1.step.increment_completed() + assert p1.step.total.completed == p1.step.current.completed + assert p1.step.total.completed == 1 + assert p2.step.total.completed == 0 + + +def test_fit_loop_progress_serialization(): + fit_loop = FitLoopProgress() + state_dict = fit_loop.state_dict() + # yapf: disable + assert state_dict == { + 'epoch': { + # number of epochs across `fit` calls + 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + # number of epochs this `fit` call + 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + 'batch': { + # number of batches across `fit` calls + 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + # number of batches this epoch + 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + }, + # `fit` optimization progress + 'optim': { + # optimizers progress + 'optimizer': { + 'step': { + # `optimizer.step` calls across `fit` calls + 'total': {'completed': 0, 'processed': None, 'ready': 0, 'started': 0}, + # `optimizer.step` calls this epoch + 'current': {'completed': 0, 'processed': None, 'ready': 0, 'started': 0}, + }, + 'zero_grad': { + # `optimizer.zero_grad` calls across `fit` calls + 'total': {'completed': 0, 'processed': None, 'ready': 0, 'started': 0}, + # `optimizer.zero_grad` calls this epoch + 'current': {'completed': 0, 'processed': None, 'ready': 0, 'started': 0}, + }, + }, + 'scheduler': { + # `scheduler.step` calls across `fit` calls + 'total': {'completed': 0, 'processed': None, 'ready': 0, 'started': None}, + # `scheduler.step` calls this epoch + 'current': {'completed': 0, 'processed': None, 'ready': 0, 'started': None}, + }, + }, + # `fit` validation progress + 'val': { + 'epoch': { + # number of `validation` calls across `fit` calls + 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + # number of `validation` calls this `fit` call + 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + 'batch': { + # number of batches across `fit` `validation` calls + 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + # number of batches this `fit` `validation` call + 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + }, + } + }, + } + } + # yapf: enable + new_loop = FitLoopProgress.from_state_dict(state_dict) + assert fit_loop == new_loop + + +def test_epoch_loop_progress_serialization(): + loop = EpochLoopProgress() + state_dict = loop.state_dict() + # yapf: disable + assert state_dict == { + 'epoch': { + # number of times `validate` has been called + 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + # either 0 or 1 as `max_epochs` does not apply to the `validate` loop + 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + 'batch': { + # number of batches across `validate` calls + 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + # number of batches this `validate` call + 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, + }, + } + } + # yapf: enable + new_loop = EpochLoopProgress.from_state_dict(state_dict) + assert loop == new_loop diff --git a/tests/trainer/test_states.py b/tests/trainer/test_states.py index c9fb50e8501dd..2614eda6d4634 100644 --- a/tests/trainer/test_states.py +++ b/tests/trainer/test_states.py @@ -37,25 +37,28 @@ class TestModel(BoringModel): def __init__(self, expected_fn, expected_stage): super().__init__() - self.expected_state = expected_fn 
+ self.expected_fn = expected_fn self.expected_stage = expected_stage self.lr = 0.1 - def on_batch_start(self, *_): - assert self.trainer.state == TrainerState( - status=TrainerStatus.RUNNING, fn=self.expected_fn, stage=self.expected_stage - ) - def on_train_batch_start(self, *_): + assert self.trainer.state.status == TrainerStatus.RUNNING + assert self.trainer.state.fn == self.expected_fn assert self.trainer.training def on_sanity_check_start(self, *_): + assert self.trainer.state.status == TrainerStatus.RUNNING + assert self.trainer.state.fn == self.expected_fn assert self.trainer.sanity_checking def on_validation_batch_start(self, *_): + assert self.trainer.state.status == TrainerStatus.RUNNING + assert self.trainer.state.fn == self.expected_fn assert self.trainer.validating or self.trainer.sanity_checking def on_test_batch_start(self, *_): + assert self.trainer.state.status == TrainerStatus.RUNNING + assert self.trainer.state.fn == self.expected_fn assert self.trainer.testing model = TestModel(TrainerFn.TUNING, RunningStage.TRAINING) diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py index 169c8cb80b04d..2e4d6bcc5b833 100644 --- a/tests/trainer/test_supporters.py +++ b/tests/trainer/test_supporters.py @@ -91,7 +91,7 @@ def __iter__(self): dataset = IterDataset() iterator = prefetch_iterator(dataset) - assert [item for item in iterator] == [(1, False), (2, False), (3, True)] + assert list(iterator) == [(1, False), (2, False), (3, True)] class EmptyIterDataset(IterableDataset): @@ -100,7 +100,7 @@ def __iter__(self): dataset = EmptyIterDataset() iterator = prefetch_iterator(dataset) - assert [item for item in iterator] == [] + assert list(iterator) == [] @pytest.mark.parametrize( diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index a8567db70d0a6..0a4cfc42ffd3a 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -39,7 +39,7 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import DeviceType, DistributedType from pytorch_lightning.utilities.cloud_io import load as pl_load -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything from tests.base import EvalModelTemplate from tests.helpers import BoringModel, RandomDataset @@ -232,29 +232,61 @@ def test_trainer_accumulate_grad_batches_zero_grad(tmpdir, accumulate_grad_batch def test_gradient_accumulation_scheduling_last_batch(tmpdir, accumulate_grad_batches, limit_train_batches): """ Verify optimizer.step() applied to last batch while grad accumulation """ - class CurrentModel(BoringModel): + class TestModel(BoringModel): - def on_batch_start(self, *_): - self.on_train_batch_start_state_dict = self.state_dict() + def state_dict(self, *args, **kwargs): + return deepcopy(super().state_dict(*args, **kwargs)) - def on_batch_end(self, outputs, batch, batch_idx, *_): - self.on_train_batch_start_end_dict = self.state_dict() - for key in self.on_train_batch_start_end_dict.keys(): - equal = torch.equal(self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key]) - if (batch_idx + 1) == self.trainer.num_training_batches: - assert equal - else: - assert not equal + def check(self, d1, d2, equal=True): + keys = d1.keys() | d2.keys() + values = [torch.equal(d1[k], d2[k]) for k in keys] + return all(values) if equal else 
not any(values) - model = CurrentModel() + def backward(self, *args, **kwargs) -> None: + pre_bwd_state_dict = self.state_dict() + assert self.check(self.start_state_dict, pre_bwd_state_dict) + + out = super().backward(*args, **kwargs) + + # state dict is equal, just the gradients changed + assert self.check(pre_bwd_state_dict, self.state_dict()) + + return out + + def optimizer_step(self, *args, **kwargs): + pre_opt_step_state_dict = self.state_dict() + assert self.check(self.start_state_dict, pre_opt_step_state_dict) + + # this calls `backward` and `on_after_backward` inside the closure + out = super().optimizer_step(*args, **kwargs) + # the state dict changed + assert self.check(pre_opt_step_state_dict, self.state_dict(), equal=False) + + self.opt_step_called = True + return out + + def on_train_batch_start(self, *_): + self.start_state_dict = self.state_dict() + self.opt_step_called = False + + def on_train_batch_end(self, outputs, batch, batch_idx, *_): + end_state_dict = self.state_dict() + is_last_batch = (batch_idx + 1) == self.trainer.num_training_batches + + if is_last_batch or self.opt_step_called: + assert self.check(self.start_state_dict, end_state_dict, equal=False) + else: + assert self.check(self.start_state_dict, end_state_dict) + + model = TestModel() trainer = Trainer( accumulate_grad_batches=accumulate_grad_batches, max_epochs=2, limit_train_batches=limit_train_batches, limit_val_batches=0, - limit_test_batches=0, default_root_dir=tmpdir, + progress_bar_refresh_rate=0, ) trainer.fit(model) @@ -339,9 +371,9 @@ def mock_save_function(filepath, *args): # emulate callback's calls during the training for i, loss in enumerate(losses): - trainer.train_loop.current_epoch = i - trainer.train_loop.global_step = i - trainer.logger_connector.callback_metrics = {"checkpoint_on": torch.tensor(loss)} + trainer.fit_loop.current_epoch = i + trainer.fit_loop.global_step = i + trainer.logger_connector.callback_metrics.update({"checkpoint_on": loss}) checkpoint_callback.on_validation_end(trainer, trainer.lightning_module) file_lists = set(os.listdir(tmpdir)) @@ -391,7 +423,7 @@ def test_model_checkpoint_only_weights(tmpdir): # assert restoring train state fails with pytest.raises(KeyError, match="checkpoint contains only the model"): - trainer.checkpoint_connector.restore_training_state(checkpoint) + trainer.checkpoint_connector.restore(new_weights_path) def test_model_freeze_unfreeze(): @@ -894,21 +926,21 @@ def test_gradient_clipping(tmpdir): default_root_dir=tmpdir, ) - trainer.train_loop.old_training_step_and_backward = trainer.train_loop.training_step_and_backward + old_training_step_and_backward = trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens): """ wrap the forward step in a closure so second order methods work """ # test that gradient is clipped correctly - ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) + ret_val = old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() grad_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), 2) for p in parameters]), 2) assert (grad_norm - 1.0).abs() < 0.01, "Gradient norm != 1.0: {grad_norm}".format(grad_norm=grad_norm) return ret_val - trainer.train_loop.training_step_and_backward = training_step_and_backward + trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward = training_step_and_backward # for the 
test model.prev_called_batch_idx = 0 @@ -932,14 +964,14 @@ def test_gradient_clipping_by_value(tmpdir): default_root_dir=tmpdir ) - trainer.train_loop.old_training_step_and_backward = trainer.train_loop.training_step_and_backward + old_training_step_and_backward = trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens): """ wrap the forward step in a closure so second order methods work """ # test that gradient is clipped correctly - ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) + ret_val = old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters] grad_max = torch.max(torch.stack(grad_max_list)) @@ -948,7 +980,7 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde return ret_val - trainer.train_loop.training_step_and_backward = training_step_and_backward + trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward = training_step_and_backward # for the test model.prev_called_batch_idx = 0 @@ -973,21 +1005,21 @@ def test_gradient_clipping_fp16(tmpdir): default_root_dir=tmpdir, ) - trainer.train_loop.old_training_step_and_backward = trainer.train_loop.training_step_and_backward + old_training_step_and_backward = trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens): """ wrap the forward step in a closure so second order methods work """ # test that gradient is clipped correctly - ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) + ret_val = old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() grad_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), 2) for p in parameters]), 2) assert (grad_norm - 1.0).abs() < 0.01, "Gradient norm != 1.0: {grad_norm}".format(grad_norm=grad_norm) return ret_val - trainer.train_loop.training_step_and_backward = training_step_and_backward + trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward = training_step_and_backward model.prev_called_batch_idx = 0 trainer.fit(model) @@ -1012,14 +1044,14 @@ def test_gradient_clipping_by_value_fp16(tmpdir): default_root_dir=tmpdir, ) - trainer.train_loop.old_training_step_and_backward = trainer.train_loop.training_step_and_backward + old_training_step_and_backward = trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens): """ wrap the forward step in a closure so second order methods work """ # test that gradient is clipped correctly - ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) + ret_val = old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters] grad_max = torch.max(torch.stack(grad_max_list)) @@ -1028,7 +1060,7 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde return ret_val - trainer.train_loop.training_step_and_backward = training_step_and_backward + trainer.fit_loop.epoch_loop.batch_loop.training_step_and_backward = 
training_step_and_backward model.prev_called_batch_idx = 0 trainer.fit(model) @@ -1069,7 +1101,9 @@ def test_num_sanity_val_steps(tmpdir, limit_val_batches): assert trainer.num_sanity_val_steps == num_sanity_val_steps with patch.object( - trainer.evaluation_loop, "evaluation_step", wraps=trainer.evaluation_loop.evaluation_step + trainer.fit_loop.epoch_loop.val_loop.epoch_loop, + "evaluation_step", + wraps=trainer.fit_loop.epoch_loop.val_loop.epoch_loop.evaluation_step ) as mocked: val_dataloaders = model.val_dataloader__multiple_mixed_length() trainer.fit(model, val_dataloaders=val_dataloaders) @@ -1097,7 +1131,9 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): assert trainer.num_sanity_val_steps == float("inf") with patch.object( - trainer.evaluation_loop, "evaluation_step", wraps=trainer.evaluation_loop.evaluation_step + trainer.fit_loop.epoch_loop.val_loop.epoch_loop, + "evaluation_step", + wraps=trainer.fit_loop.epoch_loop.val_loop.epoch_loop.evaluation_step ) as mocked: val_dataloaders = model.val_dataloader__multiple() trainer.fit(model, val_dataloaders=val_dataloaders) @@ -1733,7 +1769,7 @@ def compare_optimizers(): trainer.fit(model) compare_optimizers() - trainer.train_loop.max_epochs = 2 # simulate multiple fit calls + trainer.fit_loop.max_epochs = 2 # simulate multiple fit calls trainer.fit(model) compare_optimizers() @@ -1901,3 +1937,35 @@ def test_exception_when_lightning_module_is_not_set_on_trainer(): trainer.test() with pytest.raises(MisconfigurationException, match=r"`model` must be provided.*predict"): trainer.predict() + + +@RunIf(min_gpus=2, special=True) +def test_ddp_terminate_when_deadlock_is_detected(tmpdir): + """ Test that DDP kills the remaining processes when only one rank is throwing an exception. """ + + class CustomException(Exception): + pass + + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx): + if batch_idx == 1 and self.trainer.is_global_zero: + # rank 0: raises an exception + # rank 1: continues training but will hang on the next barrier in the training loop + raise CustomException + return super().training_step(batch, batch_idx) + + model = TestModel() + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=5, + num_sanity_val_steps=0, + gpus=2, + accelerator="ddp", + ) + + # simulate random failure in training_step on rank 0 + with pytest.raises(DeadlockDetectedException, match="CustomException"): + trainer.fit(model) diff --git a/tests/tuner/test_auto_gpu_select.py b/tests/tuner/test_auto_gpu_select.py index 32ec0282c8ce4..2d13855f93239 100644 --- a/tests/tuner/test_auto_gpu_select.py +++ b/tests/tuner/test_auto_gpu_select.py @@ -51,7 +51,7 @@ def test_trainer_with_gpus_options_combination_at_available_gpus_env(auto_select ["nb", "expected_gpu_idxs", "expected_error"], [ (0, [], MisconfigurationException), - (-1, [i for i in range(torch.cuda.device_count())], None), + (-1, list(range(torch.cuda.device_count())), None), (1, [0], None), ], ) diff --git a/tests/utilities/distributed.py b/tests/utilities/distributed.py index 80c0246ce6c57..0a1a6dbc5badd 100644 --- a/tests/utilities/distributed.py +++ b/tests/utilities/distributed.py @@ -20,12 +20,13 @@ import pytorch_lightning -def call_training_script(module_file, cli_args, method, tmpdir, timeout=60): +def call_training_script(module_file, cli_args, method, tmpdir, timeout=60, as_module=False): file = Path(module_file.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--tmpdir', 
str(tmpdir)] cli_args += ['--trainer_method', method] - command = [sys.executable, str(file)] + cli_args + file_args = ["-m", module_file.__spec__.name] if as_module else [str(file)] + command = [sys.executable] + file_args + cli_args # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment env = os.environ.copy() diff --git a/tests/utilities/test_apply_func.py b/tests/utilities/test_apply_func.py index a7eea3a749f26..5b8d8c3596e7c 100644 --- a/tests/utilities/test_apply_func.py +++ b/tests/utilities/test_apply_func.py @@ -11,18 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import dataclasses import numbers -from collections import namedtuple +from collections import namedtuple, OrderedDict +from typing import List import numpy as np +import pytest import torch -from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections def test_recursive_application_to_collection(): ntc = namedtuple('Foo', ['bar']) + @dataclasses.dataclass + class Feature: + input_ids: torch.Tensor + segment_ids: np.ndarray + + @dataclasses.dataclass + class ModelExample: + example_ids: List[str] + feature: Feature + label: torch.Tensor + to_reduce = { 'a': torch.tensor([1.]), # Tensor 'b': [torch.tensor([2.])], # list @@ -30,7 +44,13 @@ def test_recursive_application_to_collection(): 'd': ntc(bar=5.), # named tuple 'e': np.array([10.]), # numpy array 'f': 'this_is_a_dummy_str', # string - 'g': 12. # number + 'g': 12., # number + 'h': Feature(input_ids=torch.tensor([1., 2., 3.]), segment_ids=np.array([4., 5., 6.])), # dataclass + 'i': ModelExample( + example_ids=['i-1', 'i-2', 'i-3'], + feature=Feature(input_ids=torch.tensor([1., 2., 3.]), segment_ids=np.array([4., 5., 6.])), + label=torch.tensor([7., 8., 9.]) + ) # nested dataclass } expected_result = { @@ -40,14 +60,20 @@ def test_recursive_application_to_collection(): 'd': ntc(bar=torch.tensor([10.])), 'e': np.array([20.]), 'f': 'this_is_a_dummy_str', - 'g': 24. 
+ 'g': 24., + 'h': Feature(input_ids=torch.tensor([2., 4., 6.]), segment_ids=np.array([8., 10., 12.])), + 'i': ModelExample( + example_ids=['i-1', 'i-2', 'i-3'], + feature=Feature(input_ids=torch.tensor([2., 4., 6.]), segment_ids=np.array([8., 10., 12.])), + label=torch.tensor([14., 16., 18.]) + ) } reduced = apply_to_collection(to_reduce, (torch.Tensor, numbers.Number, np.ndarray), lambda x: x * 2) assert isinstance(reduced, dict), ' Type Consistency of dict not preserved' - assert all([x in reduced for x in to_reduce.keys()]), 'Not all entries of the dict were preserved' - assert all([isinstance(reduced[k], type(expected_result[k])) for k in to_reduce.keys()]), \ + assert all([x in reduced for x in to_reduce]), 'Not all entries of the dict were preserved' + assert all([isinstance(reduced[k], type(expected_result[k])) for k in to_reduce]), \ 'At least one type was not correctly preserved' assert isinstance(reduced['a'], torch.Tensor), 'Reduction Result of a Tensor should be a Tensor' @@ -74,5 +100,112 @@ def test_recursive_application_to_collection(): assert isinstance(reduced['f'], str), 'A string should not be reduced' assert reduced['f'] == expected_result['f'], 'String not preserved during reduction' - assert isinstance(reduced['g'], numbers.Number), 'Reduction of a number should result in a tensor' + assert isinstance(reduced['g'], numbers.Number), 'Reduction of a number should result in a number' assert reduced['g'] == expected_result['g'], 'Reduction of a number did not yield the desired result' + + assert dataclasses.is_dataclass(reduced['h']) and not isinstance(reduced['h'], type), \ + 'Reduction of a dataclass should result in a dataclass' + assert torch.allclose(reduced['h'].input_ids, expected_result['h'].input_ids), \ + 'Reduction of a dataclass did not yield the desired result' + assert np.allclose(reduced['h'].segment_ids, expected_result['h'].segment_ids), \ + 'Reduction of a dataclass did not yield the desired result' + + assert dataclasses.is_dataclass(reduced['i']) and not isinstance(reduced['i'], type), \ + 'Reduction of a dataclass should result in a dataclass' + assert dataclasses.is_dataclass(reduced['i'].feature) and not isinstance(reduced['i'].feature, type), \ + 'Reduction of a nested dataclass should result in a nested dataclass' + assert reduced['i'].example_ids == expected_result['i'].example_ids, \ + 'Reduction of a nested dataclass did not yield the desired result' + assert torch.allclose(reduced['i'].label, expected_result['i'].label), \ + 'Reduction of a nested dataclass did not yield the desired result' + assert torch.allclose(reduced['i'].feature.input_ids, expected_result['i'].feature.input_ids), \ + 'Reduction of a nested dataclass did not yield the desired result' + assert np.allclose(reduced['i'].feature.segment_ids, expected_result['i'].feature.segment_ids), \ + 'Reduction of a nested dataclass did not yield the desired result' + + # mapping support + reduced = apply_to_collection({'a': 1, 'b': 2}, int, lambda x: str(x)) + assert reduced == {'a': '1', 'b': '2'} + reduced = apply_to_collection(OrderedDict([('b', 2), ('a', 1)]), int, lambda x: str(x)) + assert reduced == OrderedDict([('b', '2'), ('a', '1')]) + + # custom mappings + class _CustomCollection(dict): + + def __init__(self, initial_dict): + super().__init__(initial_dict) + + to_reduce = _CustomCollection({'a': 1, 'b': 2, 'c': 3}) + reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) + assert reduced == _CustomCollection({'a': '1', 'b': '2', 'c': '3'}) + + +def 
test_apply_to_collection_include_none(): + to_reduce = [1, 2, 3.4, 5.6, 7] + + def fn(x): + if isinstance(x, float): + return x + + reduced = apply_to_collection(to_reduce, (int, float), fn) + assert reduced == [None, None, 3.4, 5.6, None] + + reduced = apply_to_collection(to_reduce, (int, float), fn, include_none=False) + assert reduced == [3.4, 5.6] + + +def test_apply_to_collections(): + to_reduce_1 = {'a': {'b': [1, 2]}, 'c': 5} + to_reduce_2 = {'a': {'b': [3, 4]}, 'c': 6} + + def fn(a, b): + return a + b + + # basic test + reduced = apply_to_collections(to_reduce_1, to_reduce_2, int, fn) + assert reduced == {'a': {'b': [4, 6]}, 'c': 11} + + with pytest.raises(KeyError): + # strict mode - if a key does not exist in both we fail + apply_to_collections({**to_reduce_2, 'd': 'foo'}, to_reduce_1, float, fn) + + # multiple dtypes + reduced = apply_to_collections(to_reduce_1, to_reduce_2, (list, int), fn) + assert reduced == {'a': {'b': [1, 2, 3, 4]}, 'c': 11} + + # wrong dtype + reduced = apply_to_collections(to_reduce_1, to_reduce_2, (list, int), fn, wrong_dtype=int) + assert reduced == {'a': {'b': [1, 2, 3, 4]}, 'c': 5} + + # list takes precedence because it is the type of data1 + reduced = apply_to_collections([1, 2, 3], [4], (int, list), fn) + assert reduced == [1, 2, 3, 4] + + # different sizes + with pytest.raises(AssertionError, match='Sequence collections have different sizes'): + apply_to_collections([[1, 2], [3]], [4], int, fn) + + def fn(a, b): + return a.keys() | b.keys() + + # base case + reduced = apply_to_collections(to_reduce_1, to_reduce_2, dict, fn) + assert reduced == {'a', 'c'} + + # type conversion + to_reduce = [(1, 2), (3, 4)] + reduced = apply_to_collections(to_reduce, to_reduce, int, lambda *x: sum(x)) + assert reduced == [(2, 4), (6, 8)] + + # named tuple + foo = namedtuple('Foo', ['bar']) + to_reduce = [foo(1), foo(2), foo(3)] + reduced = apply_to_collections(to_reduce, to_reduce, int, lambda *x: sum(x)) + assert reduced == [foo(2), foo(4), foo(6)] + + # passing none + reduced1 = apply_to_collections([1, 2, 3], None, int, lambda x: x * x) + reduced2 = apply_to_collections(None, [1, 2, 3], int, lambda x: x * x) + assert reduced1 == reduced2 == [1, 4, 9] + reduced = apply_to_collections(None, None, int, lambda x: x * x) + assert reduced is None diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 5780a83e75db8..67866219a76bf 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -20,18 +20,26 @@ from argparse import Namespace from contextlib import redirect_stdout from io import StringIO +from typing import List, Optional from unittest import mock import pytest +import torch import yaml +from packaging import version from pytorch_lightning import LightningDataModule, LightningModule, Trainer from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.utilities import _TPU_AVAILABLE -from pytorch_lightning.utilities.cli import LightningArgumentParser, LightningCLI, SaveConfigCallback +from pytorch_lightning.utilities.cli import instantiate_class, LightningArgumentParser, LightningCLI, SaveConfigCallback +from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE from tests.helpers import BoringDataModule, BoringModel +torchvision_version = version.parse('0') +if _TORCHVISION_AVAILABLE: + torchvision_version = version.parse(__import__('torchvision').__version__) + 
@mock.patch('argparse.ArgumentParser.parse_args') def test_default_args(mock_argparse, tmpdir): @@ -281,6 +289,27 @@ def on_fit_start(self): assert cli.trainer.ran_asserts +def test_lightning_cli_configurable_callbacks(tmpdir): + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_lightning_class_args(LearningRateMonitor, 'learning_rate_monitor') + + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.max_epochs=1', + '--learning_rate_monitor.logging_interval=epoch', + ] + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = MyLightningCLI(BoringModel) + + callback = [c for c in cli.trainer.callbacks if isinstance(c, LearningRateMonitor)] + assert len(callback) == 1 + assert callback[0].logging_interval == 'epoch' + + def test_lightning_cli_args_cluster_environments(tmpdir): plugins = [dict(class_path='pytorch_lightning.plugins.environments.SLURMEnvironment')] @@ -320,6 +349,31 @@ def test_lightning_cli_args(tmpdir): assert config['trainer'] == cli.config['trainer'] +def test_lightning_cli_save_config_cases(tmpdir): + + config_path = tmpdir / 'config.yaml' + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.logger=False', + '--trainer.fast_dev_run=1', + ] + + # With fast_dev_run!=False config should not be saved + with mock.patch('sys.argv', ['any.py'] + cli_args): + LightningCLI(BoringModel) + assert not os.path.isfile(config_path) + + # With fast_dev_run==False config should be saved + cli_args[-1] = '--trainer.max_epochs=1' + with mock.patch('sys.argv', ['any.py'] + cli_args): + LightningCLI(BoringModel) + assert os.path.isfile(config_path) + + # If run again on same directory exception should be raised since config file already exists + with mock.patch('sys.argv', ['any.py'] + cli_args), pytest.raises(RuntimeError): + LightningCLI(BoringModel) + + def test_lightning_cli_config_and_subclass_mode(tmpdir): config = dict( @@ -438,8 +492,245 @@ def __init__( with mock.patch('sys.argv', ['any.py'] + cli_args): cli = LightningCLI(MainModule) - assert cli.config_init['model']['main_param'] == 2 - assert cli.model.submodule1 == cli.config_init['model']['submodule1'] - assert cli.model.submodule2 == cli.config_init['model']['submodule2'] - assert isinstance(cli.config_init['model']['submodule1'], BoringModel) - assert isinstance(cli.config_init['model']['submodule2'], BoringModel) + assert cli.config['model']['main_param'] == 2 + assert isinstance(cli.model.submodule1, BoringModel) + assert isinstance(cli.model.submodule2, BoringModel) + + +@pytest.mark.skipif(torchvision_version < version.parse('0.8.0'), reason='torchvision>=0.8.0 is required') +def test_lightning_cli_torch_modules(tmpdir): + + class TestModule(BoringModel): + + def __init__( + self, + activation: torch.nn.Module = None, + transform: Optional[List[torch.nn.Module]] = None, + ): + super().__init__() + self.activation = activation + self.transform = transform + + config = """model: + activation: + class_path: torch.nn.LeakyReLU + init_args: + negative_slope: 0.2 + transform: + - class_path: torchvision.transforms.Resize + init_args: + size: 64 + - class_path: torchvision.transforms.CenterCrop + init_args: + size: 64 + """ + config_path = tmpdir / 'config.yaml' + with open(config_path, 'w') as f: + f.write(config) + + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.max_epochs=1', + f'--config={str(config_path)}', + ] + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = LightningCLI(TestModule) + + assert 
isinstance(cli.model.activation, torch.nn.LeakyReLU) + assert cli.model.activation.negative_slope == 0.2 + assert len(cli.model.transform) == 2 + assert all(isinstance(v, torch.nn.Module) for v in cli.model.transform) + + +class BoringModelRequiredClasses(BoringModel): + + def __init__( + self, + num_classes: int, + batch_size: int = 8, + ): + super().__init__() + self.num_classes = num_classes + self.batch_size = batch_size + + +class BoringDataModuleBatchSizeAndClasses(BoringDataModule): + + def __init__( + self, + batch_size: int = 8, + ): + super().__init__() + self.batch_size = batch_size + self.num_classes = 5 # only available after instantiation + + +def test_lightning_cli_link_arguments(tmpdir): + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.link_arguments('data.batch_size', 'model.batch_size') + parser.link_arguments('data.num_classes', 'model.num_classes', apply_on='instantiate') + + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.max_epochs=1', + '--data.batch_size=12', + ] + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = MyLightningCLI(BoringModelRequiredClasses, BoringDataModuleBatchSizeAndClasses) + + assert cli.model.batch_size == 12 + assert cli.model.num_classes == 5 + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.link_arguments('data.batch_size', 'model.init_args.batch_size') + parser.link_arguments('data.num_classes', 'model.init_args.num_classes', apply_on='instantiate') + + cli_args[-1] = '--model=tests.utilities.test_cli.BoringModelRequiredClasses' + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = MyLightningCLI( + BoringModelRequiredClasses, + BoringDataModuleBatchSizeAndClasses, + subclass_mode_model=True, + ) + + assert cli.model.batch_size == 8 + assert cli.model.num_classes == 5 + + +def test_cli_config_overwrite(tmpdir): + trainer_defaults = {'default_root_dir': str(tmpdir), 'logger': False, 'max_steps': 1, 'max_epochs': 1} + + with mock.patch('sys.argv', ['any.py']): + LightningCLI(BoringModel, trainer_defaults=trainer_defaults) + with mock.patch('sys.argv', ['any.py']), pytest.raises(RuntimeError, match='Aborting to avoid overwriting'): + LightningCLI(BoringModel, trainer_defaults=trainer_defaults) + with mock.patch('sys.argv', ['any.py']): + LightningCLI(BoringModel, save_config_overwrite=True, trainer_defaults=trainer_defaults) + + +def test_lightning_cli_optimizer(tmpdir): + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_optimizer_args(torch.optim.Adam) + + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.max_epochs=1', + ] + + match = ( + 'BoringModel.configure_optimizers` will be overridden by ' + '`MyLightningCLI.add_configure_optimizers_method_to_model`' + ) + with mock.patch('sys.argv', ['any.py'] + cli_args), pytest.warns(UserWarning, match=match): + cli = MyLightningCLI(BoringModel) + + assert cli.model.configure_optimizers is not BoringModel.configure_optimizers + assert len(cli.trainer.optimizers) == 1 + assert isinstance(cli.trainer.optimizers[0], torch.optim.Adam) + assert len(cli.trainer.lr_schedulers) == 0 + + +def test_lightning_cli_optimizer_and_lr_scheduler(tmpdir): + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_optimizer_args(torch.optim.Adam) + parser.add_lr_scheduler_args(torch.optim.lr_scheduler.ExponentialLR) + + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + 
'--trainer.max_epochs=1', + '--lr_scheduler.gamma=0.8', + ] + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = MyLightningCLI(BoringModel) + + assert cli.model.configure_optimizers is not BoringModel.configure_optimizers + assert len(cli.trainer.optimizers) == 1 + assert isinstance(cli.trainer.optimizers[0], torch.optim.Adam) + assert len(cli.trainer.lr_schedulers) == 1 + assert isinstance(cli.trainer.lr_schedulers[0]['scheduler'], torch.optim.lr_scheduler.ExponentialLR) + assert cli.trainer.lr_schedulers[0]['scheduler'].gamma == 0.8 + + +def test_lightning_cli_optimizer_and_lr_scheduler_subclasses(tmpdir): + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_optimizer_args((torch.optim.SGD, torch.optim.Adam)) + parser.add_lr_scheduler_args((torch.optim.lr_scheduler.StepLR, torch.optim.lr_scheduler.ExponentialLR)) + + optimizer_arg = dict( + class_path='torch.optim.Adam', + init_args=dict(lr=0.01), + ) + lr_scheduler_arg = dict( + class_path='torch.optim.lr_scheduler.StepLR', + init_args=dict(step_size=50), + ) + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.max_epochs=1', + f'--optimizer={json.dumps(optimizer_arg)}', + f'--lr_scheduler={json.dumps(lr_scheduler_arg)}', + ] + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = MyLightningCLI(BoringModel) + + assert len(cli.trainer.optimizers) == 1 + assert isinstance(cli.trainer.optimizers[0], torch.optim.Adam) + assert len(cli.trainer.lr_schedulers) == 1 + assert isinstance(cli.trainer.lr_schedulers[0]['scheduler'], torch.optim.lr_scheduler.StepLR) + assert cli.trainer.lr_schedulers[0]['scheduler'].step_size == 50 + + +def test_lightning_cli_optimizers_and_lr_scheduler_with_link_to(tmpdir): + + class MyLightningCLI(LightningCLI): + + def add_arguments_to_parser(self, parser): + parser.add_optimizer_args(torch.optim.Adam, nested_key='optim1', link_to='model.optim1') + parser.add_optimizer_args((torch.optim.ASGD, torch.optim.SGD), nested_key='optim2', link_to='model.optim2') + parser.add_lr_scheduler_args(torch.optim.lr_scheduler.ExponentialLR, link_to='model.scheduler') + + class TestModel(BoringModel): + + def __init__( + self, + optim1: dict, + optim2: dict, + scheduler: dict, + ): + super().__init__() + self.optim1 = instantiate_class(self.parameters(), optim1) + self.optim2 = instantiate_class(self.parameters(), optim2) + self.scheduler = instantiate_class(self.optim1, scheduler) + + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.max_epochs=1', + '--optim2={"class_path": "torch.optim.SGD", "init_args": {"lr": 0.01}}', + '--lr_scheduler.gamma=0.2', + ] + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = MyLightningCLI(TestModel) + + assert isinstance(cli.model.optim1, torch.optim.Adam) + assert isinstance(cli.model.optim2, torch.optim.SGD) + assert isinstance(cli.model.scheduler, torch.optim.lr_scheduler.ExponentialLR) diff --git a/tests/utilities/test_model_helpers.py b/tests/utilities/test_model_helpers.py new file mode 100644 index 0000000000000..f63d46bdb67b9 --- /dev/null +++ b/tests/utilities/test_model_helpers.py @@ -0,0 +1,67 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import partial +from unittest.mock import Mock + +import pytest + +from pytorch_lightning import LightningDataModule, Trainer +from pytorch_lightning.utilities.model_helpers import is_overridden +from tests.helpers import BoringDataModule, BoringModel + + +def test_is_overridden(): + model = BoringModel() + datamodule = BoringDataModule() + + # edge cases + assert not is_overridden("whatever", None) + with pytest.raises(ValueError, match="Expected a parent"): + is_overridden("whatever", object()) + assert not is_overridden("whatever", model) + assert not is_overridden("whatever", model, parent=LightningDataModule) + + class TestModel(BoringModel): + + def foo(self): + pass + + with pytest.raises(ValueError, match="The parent should define the method"): + is_overridden("foo", TestModel()) + + # normal usage + assert is_overridden("training_step", model) + assert is_overridden("train_dataloader", datamodule) + + # `Mock` support + mock = Mock(spec=BoringModel, wraps=model) + assert is_overridden("training_step", mock) + mock = Mock(spec=BoringDataModule, wraps=datamodule) + assert is_overridden("train_dataloader", mock) + + # `partial` support + model.training_step = partial(model.training_step) + assert is_overridden("training_step", model) + + # `_PatchDataLoader.patch_loader_code` support + class TestModel(BoringModel): + + def on_fit_start(self): + assert is_overridden("train_dataloader", self) + self.on_fit_start_called = True + + model = TestModel() + trainer = Trainer(fast_dev_run=1) + trainer.fit(model, train_dataloader=model.train_dataloader()) + assert model.on_fit_start_called diff --git a/tests/utilities/test_warnings.py b/tests/utilities/test_warnings.py new file mode 100644 index 0000000000000..2e0c372e5c39f --- /dev/null +++ b/tests/utilities/test_warnings.py @@ -0,0 +1,52 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test that the warnings actually appear and they have the correct `stacklevel` + +Needs to be run outside of `pytest` as it captures all the warnings. 
+""" +import os +from contextlib import redirect_stderr +from io import StringIO + +from pytorch_lightning.utilities.warnings import _warn, rank_zero_deprecation, rank_zero_warn, WarningCache + +running_special = os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") == "1" +if running_special: + + stderr = StringIO() + with redirect_stderr(stderr): + _warn("test1") + _warn("test2", DeprecationWarning) + + rank_zero_warn("test3") + rank_zero_warn("test4", DeprecationWarning) + + rank_zero_deprecation("test5") + + cache = WarningCache() + cache.warn("test6") + cache.deprecation("test7") + + output = stderr.getvalue() + assert "test_warnings.py:30: UserWarning: test1" in output + assert "test_warnings.py:31: DeprecationWarning: test2" in output + + assert "test_warnings.py:33: UserWarning: test3" in output + assert "test_warnings.py:34: DeprecationWarning: test4" in output + + assert "test_warnings.py:36: LightningDeprecationWarning: test5" in output + + assert "test_warnings.py:39: UserWarning: test6" in output + assert "test_warnings.py:40: LightningDeprecationWarning: test7" in output From cbc1136bf28eb9d60b596ccb6bb6dcc97ac5814e Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sun, 4 Jul 2021 17:37:43 +0200 Subject: [PATCH 05/26] Fix test --- pytorch_lightning/callbacks/early_stopping.py | 1 - tests/callbacks/test_early_stopping.py | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 6f9ea07c0716d..719607b718bb1 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -175,7 +175,6 @@ def on_train_epoch_end(self, trainer, pl_module) -> None: def on_validation_end(self, trainer, pl_module) -> None: if self._check_on_train_epoch_end or self._should_skip_check(trainer): return - self._run_early_stopping_check(trainer) def _run_early_stopping_check(self, trainer) -> None: diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index d7a6f15459912..1582a8ed90c91 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -45,8 +45,8 @@ def on_train_start(self, trainer, pl_module): if self.expected_state: assert self.on_save_checkpoint(trainer, pl_module, {}) == self.expected_state - def on_validation_end(self, trainer, pl_module): - super().on_validation_end(trainer, pl_module) + def on_train_epoch_end(self, trainer, pl_module): + super().on_train_epoch_end(trainer, pl_module) self.saved_states.append(self.on_save_checkpoint(trainer, pl_module, {}).copy()) @@ -69,12 +69,13 @@ def test_resume_early_stopping_from_checkpoint(tmpdir): ) trainer.fit(model, datamodule=dm) + assert len(early_stop_callback.saved_states) == 4 + checkpoint_filepath = checkpoint_callback.kth_best_model_path # ensure state is persisted properly checkpoint = torch.load(checkpoint_filepath) # the checkpoint saves "epoch + 1" early_stop_callback_state = early_stop_callback.saved_states[checkpoint["epoch"] - 1] - assert 4 == len(early_stop_callback.saved_states) assert checkpoint["callbacks"][type(early_stop_callback)] == early_stop_callback_state # ensure state is reloaded properly (assertion in the callback) From 56e9d893a42e4cd85f91976730793d6f13347a6e Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sun, 4 Jul 2021 17:55:35 +0200 Subject: [PATCH 06/26] Fix test --- tests/trainer/optimization/test_optimizers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff 
--git a/tests/trainer/optimization/test_optimizers.py b/tests/trainer/optimization/test_optimizers.py index 6165aa132153b..faf5434d6ba5a 100644 --- a/tests/trainer/optimization/test_optimizers.py +++ b/tests/trainer/optimization/test_optimizers.py @@ -662,7 +662,8 @@ def on_save_checkpoint(self, checkpoint): assert model.on_save_checkpoint_called -def test_plateau_scheduler_lr_step_interval_updated_after_saving(tmpdir): +@pytest.mark.parametrize("save_on_train_epoch_end", (False, True)) +def test_plateau_scheduler_lr_step_interval_updated_after_saving(tmpdir, save_on_train_epoch_end): batches = 4 trainer = Trainer( default_root_dir=tmpdir, @@ -671,7 +672,7 @@ def test_plateau_scheduler_lr_step_interval_updated_after_saving(tmpdir): max_epochs=1, limit_train_batches=batches, limit_val_batches=1, - callbacks=[ModelCheckpoint(dirpath=tmpdir)] + callbacks=[ModelCheckpoint(dirpath=tmpdir, save_on_train_epoch_end=save_on_train_epoch_end)] ) class TestModel(BoringModel): @@ -693,8 +694,8 @@ def configure_optimizers(self): def on_save_checkpoint(self, checkpoint): lr_dict_1 = checkpoint['lr_schedulers'][0] - # since plateau schedulers are updated after saving checkpoint, last_epoch should be 3 - assert lr_dict_1['last_epoch'] == batches - 1 # last epoch starts at 0 + last_epoch = lr_dict_1['last_epoch'] + assert last_epoch == batches - (not save_on_train_epoch_end) # last epoch starts at 0 lr_dict_2 = checkpoint['lr_schedulers'][1] assert lr_dict_2['_step_count'] - 1 == batches # step count starts at 1 From bbac98b4dda6466bf9a2c0071a02de61f9e7da8f Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 5 Jul 2021 00:29:20 +0200 Subject: [PATCH 07/26] Fix tests --- tests/core/test_metric_result_integration.py | 2 +- tests/models/test_hooks.py | 10 +++++----- tests/trainer/test_dataloaders.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 7471914886a27..86cfa35746cda 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -329,7 +329,7 @@ def on_save_checkpoint(self, checkpoint) -> None: assert new_results['validation_step.v'].value.device.type == 'cpu' model = LoggingModel() - ckpt = ModelCheckpoint(dirpath=tmpdir, save_last=True) + ckpt = ModelCheckpoint(dirpath=tmpdir, save_on_train_epoch_end=False) trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 9a689fe9d725a..630b3f1f8b250 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -450,14 +450,14 @@ def test_trainer_model_hook_system_fit(tmpdir): dict(name='on_validation_start'), *model._eval_epoch('validation', trainer, model, val_batches, 'x'), dict(name='Callback.on_validation_end', args=(trainer, model)), - # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_validation_end` - dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)), - dict(name='on_save_checkpoint', args=(saved_ckpt, )), dict(name='on_validation_end'), dict(name='train'), dict(name='on_validation_model_train'), dict(name='training_epoch_end', args=([dict(loss=ANY)] * train_batches, )), dict(name='Callback.on_train_epoch_end', args=(trainer, model, [dict(loss=ANY)] * train_batches)), + # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_train_epoch_end` + dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)), + 
From bbac98b4dda6466bf9a2c0071a02de61f9e7da8f Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 5 Jul 2021 00:29:20 +0200
Subject: [PATCH 07/26] Fix tests

---
 tests/core/test_metric_result_integration.py |  2 +-
 tests/models/test_hooks.py                   | 10 +++++-----
 tests/trainer/test_dataloaders.py            |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py
index 7471914886a27..86cfa35746cda 100644
--- a/tests/core/test_metric_result_integration.py
+++ b/tests/core/test_metric_result_integration.py
@@ -329,7 +329,7 @@ def on_save_checkpoint(self, checkpoint) -> None:
             assert new_results['validation_step.v'].value.device.type == 'cpu'

     model = LoggingModel()
-    ckpt = ModelCheckpoint(dirpath=tmpdir, save_last=True)
+    ckpt = ModelCheckpoint(dirpath=tmpdir, save_on_train_epoch_end=False)
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=2,
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 9a689fe9d725a..630b3f1f8b250 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -450,14 +450,14 @@ def test_trainer_model_hook_system_fit(tmpdir):
         dict(name='on_validation_start'),
         *model._eval_epoch('validation', trainer, model, val_batches, 'x'),
         dict(name='Callback.on_validation_end', args=(trainer, model)),
-        # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_validation_end`
-        dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)),
-        dict(name='on_save_checkpoint', args=(saved_ckpt, )),
         dict(name='on_validation_end'),
         dict(name='train'),
         dict(name='on_validation_model_train'),
         dict(name='training_epoch_end', args=([dict(loss=ANY)] * train_batches, )),
         dict(name='Callback.on_train_epoch_end', args=(trainer, model, [dict(loss=ANY)] * train_batches)),
+        # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_train_epoch_end`
+        dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)),
+        dict(name='on_save_checkpoint', args=(saved_ckpt, )),
         dict(name='on_train_epoch_end', args=([dict(loss=ANY)] * train_batches, )),
         dict(name='Callback.on_epoch_end', args=(trainer, model)),
         dict(name='on_epoch_end'),
@@ -562,11 +562,11 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir):
             model,
             [dict(loss=ANY)] * train_batches,
         )),
+        dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)),
+        dict(name='on_save_checkpoint', args=(saved_ckpt, )),
         dict(name='on_train_epoch_end', args=([dict(loss=ANY)] * train_batches, )),
         dict(name='Callback.on_epoch_end', args=(trainer, model)),
         dict(name='on_epoch_end'),
-        dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)),
-        dict(name='on_save_checkpoint', args=(saved_ckpt, )),
         dict(name='Callback.on_train_end', args=(trainer, model)),
         dict(name='on_train_end'),
         dict(name='Callback.on_fit_end', args=(trainer, model)),
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 14f47a2558eff..ff4d74183d9b1 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -582,7 +582,7 @@ def test_dataloaders_with_fast_dev_run(tmpdir, fast_dev_run):
         assert trainer.max_epochs == 1

         trainer.fit(model)
-        assert not trainer.disable_validation
+        assert trainer.enable_validation
         assert trainer.num_training_batches == fast_dev_run
         assert trainer.num_val_batches == [fast_dev_run] * len(trainer.val_dataloaders)
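The reordered hook expectations above are the observable contract of this PR: ``on_save_checkpoint`` now fires inside the train-epoch-end phase rather than right after validation. A small sketch of how one could observe the order directly — an illustration only, using v1.4-era callback hook names:

    import pytorch_lightning as pl

    class HookRecorder(pl.Callback):
        """Append each relevant hook name as it fires."""

        def __init__(self):
            self.calls = []

        def on_validation_end(self, trainer, pl_module):
            self.calls.append("on_validation_end")

        def on_train_epoch_end(self, trainer, pl_module, *args):
            self.calls.append("on_train_epoch_end")

        def on_save_checkpoint(self, trainer, pl_module, checkpoint):
            self.calls.append("on_save_checkpoint")

    # With save_on_train_epoch_end=True (the new default), the expected order per epoch is:
    #   on_validation_end -> on_train_epoch_end -> on_save_checkpoint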
From a0afd133a24c7fbd2f06759993e07860cc077fb4 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 5 Jul 2021 01:43:12 +0200
Subject: [PATCH 08/26] Fix test

---
 pytorch_lightning/callbacks/pruning.py |  1 +
 tests/callbacks/test_pruning.py        | 23 +++++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/callbacks/pruning.py b/pytorch_lightning/callbacks/pruning.py
index ced8d29c14424..a15073ed719f0 100644
--- a/pytorch_lightning/callbacks/pruning.py
+++ b/pytorch_lightning/callbacks/pruning.py
@@ -417,6 +417,7 @@ def on_save_checkpoint(
         rank_zero_debug("`ModelPruning.on_save_checkpoint`. Pruning is made permanent for this checkpoint")
         prev_device = pl_module.device
         # prune a copy so training can continue with the same buffers
+        rank_zero_debug(f"{pl_module.layer.mlp_3.weight.grad_fn=}")
         copy = deepcopy(pl_module.to("cpu"))
         self.make_pruning_permanent(copy)
         checkpoint["state_dict"] = copy.state_dict()
diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py
index 1a5ddad64106e..3e14425a036ed 100644
--- a/tests/callbacks/test_pruning.py
+++ b/tests/callbacks/test_pruning.py
@@ -304,22 +304,27 @@ def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool
     assert not has_pruning if make_pruning_permanent else has_pruning


-@pytest.mark.parametrize("on_train_epoch_end", (False, True))
-def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog, on_train_epoch_end):
+@pytest.mark.parametrize("prune_on_train_epoch_end", (False, True))
+@pytest.mark.parametrize("save_on_train_epoch_end", (False, True))
+def test_permanent_when_model_is_saved_multiple_times(
+    tmpdir, caplog, prune_on_train_epoch_end, save_on_train_epoch_end
+):
     """
     When a model is saved multiple times and make_permanent=True, we need to make sure a copy is pruned
     and not the trained model if we want to continue with the same pruning buffers.
     """
+    if prune_on_train_epoch_end and save_on_train_epoch_end:
+        # TODO: is this expected?
+        pytest.xfail("`pytorch_prune.global_unstructured` sets the `grad_fn` so we can't deepcopy on save")

     class TestPruning(ModelPruning):

         def on_save_checkpoint(self, trainer, pl_module, checkpoint):
+            had_buffers = hasattr(pl_module.layer.mlp_3, "weight_orig")
             super().on_save_checkpoint(trainer, pl_module, checkpoint)
-            if not on_train_epoch_end:
-                # these checks only work if pruning on `validation_epoch_end`
-                # because `on_save_checkpoint` is called before `on_train_epoch_end`
-                assert "layer.mlp_3.weight_orig" not in checkpoint["state_dict"]
+            assert "layer.mlp_3.weight_orig" not in checkpoint["state_dict"]
+            if had_buffers:
                 assert hasattr(pl_module.layer.mlp_3, "weight_orig")

     model = TestModel()
@@ -328,9 +333,11 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint):
         parameters_to_prune=[(model.layer.mlp_3, "weight")],
         verbose=1,
         make_pruning_permanent=True,
-        prune_on_train_epoch_end=on_train_epoch_end,
+        prune_on_train_epoch_end=prune_on_train_epoch_end,
+    )
+    ckpt_callback = ModelCheckpoint(
+        monitor="test", save_top_k=2, save_last=True, save_on_train_epoch_end=save_on_train_epoch_end
     )
-    ckpt_callback = ModelCheckpoint(monitor="test", save_top_k=2, save_last=True)
     trainer = Trainer(callbacks=[pruning_callback, ckpt_callback], max_epochs=3, progress_bar_refresh_rate=0)
     with caplog.at_level(INFO):
         trainer.fit(model)

From 5241864f2cf02ac59351da4f1b1f14b13f21e6e9 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 5 Jul 2021 01:58:22 +0200
Subject: [PATCH 09/26] Fix test

---
 tests/checkpointing/test_checkpoint_callback_frequency.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index 8617a9f8f7050..2b135c0a5b915 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -50,7 +50,7 @@ def test_mc_called(tmpdir):
 @mock.patch('torch.save')
 @pytest.mark.parametrize(
     ['epochs', 'val_check_interval', 'expected'],
-    [(1, 1.0, 1), (2, 1.0, 2), (1, 0.25, 4), (2, 0.3, 7)],
+    [(1, 1.0, 1), (2, 1.0, 2), (1, 0.25, 4), (2, 0.3, 6)],
 )
 def test_default_checkpoint_freq(save_mock, tmpdir, epochs: int, val_check_interval: float, expected: int):

@@ -60,6 +60,7 @@ def test_default_checkpoint_freq(save_mock, tmpdir, epochs: int, val_check_inter
         max_epochs=epochs,
         weights_summary=None,
         val_check_interval=val_check_interval,
+        limit_val_batches=1,
         progress_bar_refresh_rate=0,
     )
     trainer.fit(model)
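The change from 7 to 6 expected saves in PATCH 09 follows from the new scheme: with ``val_check_interval=0.3`` validation runs ``int(1 / 0.3) == 3`` times per epoch, and each run triggers exactly one save, with no extra end-of-epoch duplicate. As a counting sketch (assuming one save per validation run, which is what the test exercises):

    def expected_saves(epochs: int, val_check_interval: float) -> int:
        # validation runs int(1 / interval) times per epoch; one save per run
        val_runs_per_epoch = int(1 / val_check_interval)
        return epochs * val_runs_per_epoch

    assert expected_saves(1, 1.0) == 1
    assert expected_saves(2, 1.0) == 2
    assert expected_saves(1, 0.25) == 4
    assert expected_saves(2, 0.3) == 6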
From 9f5d886c43bcc688bd1bb7597645955d891ff12e Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 5 Jul 2021 02:01:25 +0200
Subject: [PATCH 10/26] Remove debug statement

---
 pytorch_lightning/callbacks/pruning.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/pruning.py b/pytorch_lightning/callbacks/pruning.py
index a15073ed719f0..ced8d29c14424 100644
--- a/pytorch_lightning/callbacks/pruning.py
+++ b/pytorch_lightning/callbacks/pruning.py
@@ -417,7 +417,6 @@ def on_save_checkpoint(
         rank_zero_debug("`ModelPruning.on_save_checkpoint`. Pruning is made permanent for this checkpoint")
         prev_device = pl_module.device
         # prune a copy so training can continue with the same buffers
-        rank_zero_debug(f"{pl_module.layer.mlp_3.weight.grad_fn=}")
         copy = deepcopy(pl_module.to("cpu"))
         self.make_pruning_permanent(copy)
         checkpoint["state_dict"] = copy.state_dict()

From 45156ee89f81e1a6d48d55da84d4482fee2b8f3b Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 5 Jul 2021 12:26:53 +0200
Subject: [PATCH 11/26] Fix test

---
 tests/checkpointing/test_checkpoint_callback_frequency.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index 2b135c0a5b915..2d18aacde4489 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -74,7 +74,7 @@ def test_default_checkpoint_freq(save_mock, tmpdir, epochs: int, val_check_inter
     (1, 1, 1.0, 1),
     (2, 2, 1.0, 2),
     (2, 1, 0.25, 4),
-    (2, 2, 0.3, 7),
+    (2, 2, 0.3, 6),
 ])
 def test_top_k(save_mock, tmpdir, k: int, epochs: int, val_check_interval: float, expected: int):
From 76c6be731fe45e01a6833a20c9c4c84dd5343a2c Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Thu, 8 Jul 2021 01:15:21 +0200
Subject: [PATCH 12/26] Docs and deprecation

---
 pytorch_lightning/callbacks/early_stopping.py   |  2 +-
 .../callbacks/model_checkpoint.py               | 86 +++++++++++--------
 tests/deprecated_api/test_remove_1-6.py         |  6 ++
 tests/loggers/test_wandb.py                     |  1 -
 4 files changed, 56 insertions(+), 39 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index c28e5cec5b982..0015ac47f0d41 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -61,7 +61,7 @@ class EarlyStopping(Callback):
         stopping_threshold: Stop training immediately once the monitored quantity reaches this threshold.
         divergence_threshold: Stop training as soon as the monitored quantity becomes worse than this threshold.
         check_on_train_epoch_end: whether to run early stopping at the end of the training epoch.
-            If this is ``False``, then the check runs at the end of the validation epoch.
+            If this is ``False``, then the check runs at the end of validation.

     Raises:
         MisconfigurationException:
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 829fcc0df8dcd..3a1317a4cb724 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -104,28 +104,35 @@ class ModelCheckpoint(Callback):
         every_n_train_steps: Number of training steps between checkpoints.
             If ``every_n_train_steps == None or every_n_train_steps == 0``, we skip saving during training.
             To disable, set ``every_n_train_steps = 0``. This value must be ``None`` or non-negative.
-            This must be mutually exclusive with ``train_time_interval`` and ``every_n_val_epochs``.
+            This must be mutually exclusive with ``train_time_interval`` and ``every_n_epochs``.
         train_time_interval: Checkpoints are monitored at the specified time interval.
             For all practical purposes, this cannot be smaller than the amount
             of time it takes to process a single training batch. This is not
            guaranteed to execute at the exact time specified, but should be close.
-            This must be mutually exclusive with ``every_n_train_steps`` and ``every_n_val_epochs``.
-            FIXME
-        every_n_val_epochs: Number of validation epochs between checkpoints.
-            If ``every_n_val_epochs == None or every_n_val_epochs == 0``, we skip saving on validation end.
-            To disable, set ``every_n_val_epochs = 0``. This value must be ``None`` or non-negative.
+            This must be mutually exclusive with ``every_n_train_steps`` and ``every_n_epochs``.
+        every_n_epochs: Number of epochs between checkpoints.
+            If ``every_n_epochs == None or every_n_epochs == 0``, we skip saving when the epoch ends.
+            To disable, set ``every_n_epochs = 0``. This value must be ``None`` or non-negative.
             This must be mutually exclusive with ``every_n_train_steps`` and ``train_time_interval``.
-            Setting both ``ModelCheckpoint(..., every_n_val_epochs=V)`` and
+            Setting both ``ModelCheckpoint(..., every_n_epochs=V, save_on_train_epoch_end=False)`` and
             ``Trainer(max_epochs=N, check_val_every_n_epoch=M)``
             will only save checkpoints at epochs 0 < E <= N
-            where both values for ``every_n_val_epochs`` and ``check_val_every_n_epoch`` evenly divide E.
+            where both values for ``every_n_epochs`` and ``check_val_every_n_epoch`` evenly divide E.
+        save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch.
+            If this is ``False``, then the check runs at the end of validation.
         period: Interval (number of epochs) between checkpoints.
-        save_on_train_epoch_end: FIXME

             .. warning::
                 This argument has been deprecated in v1.3 and will be removed in v1.5.

-                Use ``every_n_val_epochs`` instead.
+                Use ``every_n_epochs`` instead.
+
+        every_n_val_epochs: Number of epochs between checkpoints.
+
+            .. warning::
+                This argument has been deprecated in v1.4 and will be removed in v1.6.
+
+                Use ``every_n_epochs`` instead.
+
     Note:
         For extra customization, ModelCheckpoint includes the following attributes:
@@ -203,9 +210,10 @@ def __init__(
         auto_insert_metric_name: bool = True,
         every_n_train_steps: Optional[int] = None,
         train_time_interval: Optional[timedelta] = None,
-        every_n_val_epochs: Optional[int] = None,
-        period: Optional[int] = None,
+        every_n_epochs: Optional[int] = None,
         save_on_train_epoch_end: Optional[bool] = None,
+        period: Optional[int] = None,
+        every_n_val_epochs: Optional[int] = None,
     ):
         super().__init__()
         self.monitor = monitor
""" if ( self._should_skip_saving_checkpoint(trainer) or not self._save_on_train_epoch_end - # FIXME: repurpose every_n_val_epochs to work for this hook - or self._every_n_val_epochs < 1 or (trainer.current_epoch + 1) % self._every_n_val_epochs != 0 + or self._every_n_epochs < 1 or (trainer.current_epoch + 1) % self._every_n_epochs != 0 ): return # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates @@ -295,8 +309,8 @@ def on_train_epoch_end( def on_validation_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None: """ Save a checkpoint at the end of the validation stage. """ if ( - self._should_skip_saving_checkpoint(trainer) or self._save_on_train_epoch_end - or self._every_n_val_epochs < 1 or (trainer.current_epoch + 1) % self._every_n_val_epochs != 0 + self._should_skip_saving_checkpoint(trainer) or self._save_on_train_epoch_end or self._every_n_epochs < 1 + or (trainer.current_epoch + 1) % self._every_n_epochs != 0 ): return self.save_checkpoint(trainer) @@ -391,18 +405,16 @@ def __validate_init_configuration(self) -> None: raise MisconfigurationException( f'Invalid value for every_n_train_steps={self._every_n_train_steps}. Must be >= 0' ) - if self._every_n_val_epochs < 0: - raise MisconfigurationException( - f'Invalid value for every_n_val_epochs={self._every_n_val_epochs}. Must be >= 0' - ) + if self._every_n_epochs < 0: + raise MisconfigurationException(f'Invalid value for every_n_epochs={self._every_n_epochs}. Must be >= 0') every_n_train_steps_triggered = self._every_n_train_steps >= 1 - every_n_val_epochs_triggered = self._every_n_val_epochs >= 1 + every_n_epochs_triggered = self._every_n_epochs >= 1 train_time_interval_triggered = self._train_time_interval is not None - if every_n_train_steps_triggered + every_n_val_epochs_triggered + train_time_interval_triggered > 1: + if every_n_train_steps_triggered + every_n_epochs_triggered + train_time_interval_triggered > 1: raise MisconfigurationException( f"Combination of parameters every_n_train_steps={self._every_n_train_steps}, " - f"every_n_val_epochs={self._every_n_val_epochs} and train_time_interval={self._train_time_interval} " + f"every_n_epochs={self._every_n_epochs} and train_time_interval={self._train_time_interval} " "should be mutually exclusive." ) @@ -451,39 +463,39 @@ def __init_monitor_mode(self, mode: str) -> None: def __init_triggers( self, every_n_train_steps: Optional[int], - every_n_val_epochs: Optional[int], + every_n_epochs: Optional[int], train_time_interval: Optional[timedelta], period: Optional[int], ) -> None: # Default to running once after each validation epoch if neither - # every_n_train_steps nor every_n_val_epochs is set - if every_n_train_steps is None and every_n_val_epochs is None and train_time_interval is None: - every_n_val_epochs = 1 + # every_n_train_steps nor every_n_epochs is set + if every_n_train_steps is None and every_n_epochs is None and train_time_interval is None: + every_n_epochs = 1 every_n_train_steps = 0 - log.debug("Both every_n_train_steps and every_n_val_epochs are not set. Setting every_n_val_epochs=1") + log.debug("Both every_n_train_steps and every_n_epochs are not set. 
@@ -451,39 +463,39 @@ def __init_monitor_mode(self, mode: str) -> None:
     def __init_triggers(
         self,
         every_n_train_steps: Optional[int],
-        every_n_val_epochs: Optional[int],
+        every_n_epochs: Optional[int],
         train_time_interval: Optional[timedelta],
         period: Optional[int],
     ) -> None:
         # Default to running once after each validation epoch if neither
-        # every_n_train_steps nor every_n_val_epochs is set
-        if every_n_train_steps is None and every_n_val_epochs is None and train_time_interval is None:
-            every_n_val_epochs = 1
+        # every_n_train_steps nor every_n_epochs is set
+        if every_n_train_steps is None and every_n_epochs is None and train_time_interval is None:
+            every_n_epochs = 1
             every_n_train_steps = 0
-            log.debug("Both every_n_train_steps and every_n_val_epochs are not set. Setting every_n_val_epochs=1")
+            log.debug("Both every_n_train_steps and every_n_epochs are not set. Setting every_n_epochs=1")
         else:
-            every_n_val_epochs = every_n_val_epochs or 0
+            every_n_epochs = every_n_epochs or 0
             every_n_train_steps = every_n_train_steps or 0

         self._train_time_interval: Optional[timedelta] = train_time_interval
-        self._every_n_val_epochs: int = every_n_val_epochs
+        self._every_n_epochs: int = every_n_epochs
         self._every_n_train_steps: int = every_n_train_steps

-        # period takes precedence over every_n_val_epochs for backwards compatibility
+        # period takes precedence over every_n_epochs for backwards compatibility
         if period is not None:
             rank_zero_deprecation(
                 'Argument `period` in `ModelCheckpoint` is deprecated in v1.3 and will be removed in v1.5.'
-                ' Please use `every_n_val_epochs` instead.'
+                ' Please use `every_n_epochs` instead.'
             )
-            self._every_n_val_epochs = period
-        self._period = self._every_n_val_epochs
+            self._every_n_epochs = period
+        self._period = self._every_n_epochs

     @property
     def period(self) -> Optional[int]:
         rank_zero_deprecation(
             'Property `period` in `ModelCheckpoint` is deprecated in v1.3 and will be removed in v1.5.'
-            ' Please use `every_n_val_epochs` instead.'
+            ' Please use `every_n_epochs` instead.'
         )
         return self._period
@@ -491,7 +503,7 @@ def period(self, value: Optional[int]) -> None:
         rank_zero_deprecation(
             'Property `period` in `ModelCheckpoint` is deprecated in v1.3 and will be removed in v1.5.'
-            ' Please use `every_n_val_epochs` instead.'
+            ' Please use `every_n_epochs` instead.'
         )
         self._period = value
diff --git a/tests/deprecated_api/test_remove_1-6.py b/tests/deprecated_api/test_remove_1-6.py
index 69d2a45530607..ddb551631cb57 100644
--- a/tests/deprecated_api/test_remove_1-6.py
+++ b/tests/deprecated_api/test_remove_1-6.py
@@ -15,6 +15,7 @@
 import pytest

 from pytorch_lightning import Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 from pytorch_lightning.core.memory import ModelSummary
 from pytorch_lightning.plugins.training_type import DDPPlugin, DDPSpawnPlugin
@@ -303,3 +304,8 @@ def test_v1_6_0_deprecated_disable_validation():
     trainer = Trainer()
     with pytest.deprecated_call(match="disable_validation` is deprecated in v1.4"):
         _ = trainer.disable_validation
+
+
+def test_v1_6_0_every_n_val_epochs():
+    with pytest.deprecated_call(match="use `every_n_epochs` instead"):
+        _ = ModelCheckpoint(every_n_val_epochs=1)
diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py
index 27b83b75c24b9..4956a08c2fd35 100644
--- a/tests/loggers/test_wandb.py
+++ b/tests/loggers/test_wandb.py
@@ -213,7 +213,6 @@ def test_wandb_log_model(wandb, tmpdir):
             'save_top_k': 1,
             'save_weights_only': False,
             '_every_n_train_steps': 0,
-            '_every_n_val_epochs': 1
         }
     }
 )
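The deprecation path added in PATCH 12 is purely additive: the old keyword still works for one release and is forwarded to the new one. A usage sketch (the values are illustrative; ``_every_n_epochs`` is a private attribute shown only to illustrate the forwarding):

    from pytorch_lightning.callbacks import ModelCheckpoint

    # New spelling: counts epochs, whether the save triggers at train-epoch
    # end or at validation end.
    ckpt = ModelCheckpoint(every_n_epochs=2, save_on_train_epoch_end=False)

    # Old spelling: still accepted in v1.4, but emits a
    # LightningDeprecationWarning and is re-routed to `every_n_epochs`.
    ckpt_old = ModelCheckpoint(every_n_val_epochs=2)
    assert ckpt_old._every_n_epochs == 2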
From f9ee8b81f3d71a2cd6d9347f3b0fe5fdaad8702e Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Thu, 8 Jul 2021 01:16:00 +0200
Subject: [PATCH 13/26] fix test

---
 tests/checkpointing/test_model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index f0dc9907a26f4..d7aef3414dc8e 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -590,7 +590,7 @@ def test_invalid_trigger_combination(tmpdir):
 def test_none_every_n_train_steps_val_epochs(tmpdir):
     checkpoint_callback = ModelCheckpoint(dirpath=tmpdir)
     assert checkpoint_callback.period == 1
-    assert checkpoint_callback._every_n_val_epochs == 1
+    assert checkpoint_callback._every_n_epochs == 1
     assert checkpoint_callback._every_n_train_steps == 0

From 15a8575ac0625cadec0d8a60c646ba1c187fc8d3 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 13 Jul 2021 17:47:11 +0200
Subject: [PATCH 14/26] Docs

---
 pytorch_lightning/callbacks/model_checkpoint.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 78f57f6f5137f..9ee151d571497 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -319,11 +319,10 @@ def on_validation_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModul
     def on_train_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None:
         """
-        Save a checkpoint at the very end of training.
+        Save a checkpoint when training stops.

-        This will only save a checkpoint if `save_last` is also enabled
-        as the monitor metrics logged during training/validation steps or end of epochs
-        are not guaranteed to be available at this stage.
+        This will only save a checkpoint if `save_last` is also enabled as the monitor metrics logged during
+        training/validation steps or end of epochs are not guaranteed to be available at this stage.
         """

From e14a80db9db89e939563af7e9dda7f5897fcdf55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Wed, 14 Jul 2021 12:13:48 +0200
Subject: [PATCH 15/26] Update pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: thomas chaton
---
 pytorch_lightning/callbacks/model_checkpoint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 9ee151d571497..1a883f82dff45 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -324,9 +324,9 @@ def on_train_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -
         This will only save a checkpoint if `save_last` is also enabled as the monitor metrics logged during
         training/validation steps or end of epochs are not guaranteed to be available at this stage.
         """
-        if self._should_skip_saving_checkpoint(trainer):
+        if self._should_skip_saving_checkpoint(trainer) or not self.save_last:
             return
-        if self.save_last and self.verbose:
+        if self.verbose:
             rank_zero_info("Saving latest checkpoint...")
         # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates
         monitor_candidates = self._monitor_candidates(trainer, trainer.current_epoch, trainer.global_step - 1)
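After PATCH 15, the end-of-training save is gated on ``save_last`` up front, so configurations without it skip the hook entirely. A sketch of the resulting behavior — the directory path and epoch count are illustrative, not from the PR:

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint

    # `save_last=True` rewrites `last.ckpt` one final time in `on_train_end`;
    # with `save_last=False` (the default), `on_train_end` now returns immediately.
    ckpt = ModelCheckpoint(dirpath="checkpoints/", save_last=True, verbose=True)
    trainer = Trainer(max_epochs=3, callbacks=[ckpt])
    # after trainer.fit(model):
    #   ckpt.last_model_path -> ".../last.ckpt"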
""" - if self._should_skip_saving_checkpoint(trainer): + if self._should_skip_saving_checkpoint(trainer) or not self.save_last: return - if self.save_last and self.verbose: + if self.verbose: rank_zero_info("Saving latest checkpoint...") # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates monitor_candidates = self._monitor_candidates(trainer, trainer.current_epoch, trainer.global_step - 1) From 6a0f13c1d5f10537a4e37d2a642029cdd55bdac9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 12:32:38 +0200 Subject: [PATCH 16/26] Parametrize with save last --- .../checkpointing/test_checkpoint_callback_frequency.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 2d18aacde4489..6e1b4ece6df97 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -76,7 +76,8 @@ def test_default_checkpoint_freq(save_mock, tmpdir, epochs: int, val_check_inter (2, 1, 0.25, 4), (2, 2, 0.3, 6), ]) -def test_top_k(save_mock, tmpdir, k: int, epochs: int, val_check_interval: float, expected: int): +@pytest.mark.parametrize("save_last", (False, True)) +def test_top_k(save_mock, tmpdir, k: int, epochs: int, val_check_interval: float, expected: int, save_last: bool): class TestModel(BoringModel): @@ -94,7 +95,7 @@ def training_step(self, batch, batch_idx): model = TestModel() trainer = Trainer( - callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss', save_top_k=k)], + callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss', save_top_k=k, save_last=save_last)], default_root_dir=tmpdir, max_epochs=epochs, weights_summary=None, @@ -102,7 +103,9 @@ def training_step(self, batch, batch_idx): ) trainer.fit(model) - # make sure types are correct + if save_last: + # last epochs are saved every step (so double the save calls) and once `on_train_end` + expected = expected * 2 + 1 assert save_mock.call_count == expected From 206eefc8da86618e36d852a8d33d397f231fdf3d Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 19:04:23 +0200 Subject: [PATCH 17/26] Fix ddp test --- tests/checkpointing/test_checkpoint_callback_frequency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 6e1b4ece6df97..4aae78e622ee7 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -118,7 +118,7 @@ def test_top_k_ddp_0(save_mock, tmpdir): @mock.patch('torch.save') @RunIf(special=True, min_gpus=2) def test_top_k_ddp_1(save_mock, tmpdir): - _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) + _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=4) def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): From 238022820ecfb2aad4f342cfd5a4635f54b60d76 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 19:16:16 +0200 Subject: [PATCH 18/26] Fix pre-commit --- .../trainer/connectors/accelerator_connector.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 
From b709a8fdac5979ba511315a0ef9618c504490e50 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 16 Jul 2021 03:01:53 +0200
Subject: [PATCH 19/26] Avoid file not found

---
 pl_examples/__init__.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py
index f22cb5b8e4805..093b1fd11650b 100644
--- a/pl_examples/__init__.py
+++ b/pl_examples/__init__.py
@@ -14,7 +14,7 @@
 _PACKAGE_ROOT = os.path.dirname(_EXAMPLES_ROOT)
 _DATASETS_PATH = os.path.join(_PACKAGE_ROOT, 'Datasets')

-_TORCHVISION_MNIST_AVAILABLE = not bool(os.environ.get("PL_USE_MOCKED_MNIST", False))
+_TORCHVISION_MNIST_AVAILABLE = not bool(os.getenv("PL_USE_MOCKED_MNIST", False))
 _DALI_AVAILABLE = _module_available("nvidia.dali")

 if _TORCHVISION_MNIST_AVAILABLE:
@@ -23,6 +23,13 @@
         MNIST(_DATASETS_PATH, download=True)
     except HTTPError:
         _TORCHVISION_MNIST_AVAILABLE = False
+    except RuntimeError as e:
+        # `torchvision` can produce the following error randomly.
+        # File "/usr/local/lib/python3.7/dist-packages/torchvision/datasets/utils.py", line 145, in download_url
+        #   raise RuntimeError("File not found or corrupted.")
+        if "File not found" not in str(e):
+            raise
+        _TORCHVISION_MNIST_AVAILABLE = False

 LIGHTNING_LOGO = """
                     ####

From 5fcd3d7dad0b4f65ae02852875f05b52c7ee9271 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Sun, 18 Jul 2021 22:31:30 +0200
Subject: [PATCH 20/26] Debug

---
 .azure-pipelines/gpu-tests.yml | 10 ----------
 tests/special_tests.sh         |  2 +-
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 6190d27362c52..5f31d45ab8976 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -61,16 +61,6 @@ jobs:
       python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
     displayName: 'Env details'

-  - bash: |
-      wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/
-      unzip -o legacy/checkpoints.zip -d legacy/
-      ls -l legacy/checkpoints/
-    displayName: 'Get legacy checkpoints'
-
-  - bash: |
-      python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
-    displayName: 'Testing: standard'
-
   - bash: |
       bash tests/special_tests.sh
     displayName: 'Testing: special'
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index 96d1e3ba4affb..9ee7bd3d1bcb9 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -62,7 +62,7 @@ for i in "${!files_arr[@]}"; do

     # run the test
     report+="Ran\t$file:$lineno::$test_name\n"
-    python ${defaults} "${file}::${test_name}"
+    NCCL_DEBUG=INFO python ${defaults} "${file}::${test_name}"
     break
   fi
 done < <(echo "$test_code")
"$test_code") From 8d978ccba3d209081de955fb76fd112124d0836a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sun, 18 Jul 2021 22:44:26 +0200 Subject: [PATCH 21/26] Increase SHM size --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5f31d45ab8976..c531aa0a8f2ad 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -32,7 +32,7 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=256m" + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" workspace: clean: all From d9118c5aea14504e3a146dff3d0280c76a176924 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 19 Jul 2021 02:05:03 +0200 Subject: [PATCH 22/26] Debug --- tests/special_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 9ee7bd3d1bcb9..977c37c15bac8 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -62,7 +62,7 @@ for i in "${!files_arr[@]}"; do # run the test report+="Ran\t$file:$lineno::$test_name\n" - NCCL_DEBUG=INFO python ${defaults} "${file}::${test_name}" + #python ${defaults} "${file}::${test_name}" break fi done < <(echo "$test_code") From b3748c45b89a00eab62d6717fb130d022055b53f Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 19 Jul 2021 02:23:31 +0200 Subject: [PATCH 23/26] Refactor MNIST imports --- .azure-pipelines/gpu-tests.yml | 2 ++ pl_examples/__init__.py | 9 --------- pl_examples/basic_examples/autoencoder.py | 7 ++----- .../basic_examples/backbone_image_classifier.py | 7 ++----- .../basic_examples/dali_image_classifier.py | 7 ++----- pl_examples/basic_examples/mnist_datamodule.py | 16 +++++++++++++--- .../generative_adversarial_net.py | 7 ++----- 7 files changed, 23 insertions(+), 32 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index c531aa0a8f2ad..d239579e6960e 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -63,6 +63,8 @@ jobs: - bash: | bash tests/special_tests.sh + env: + PL_USE_MOCKED_MNIST: "1" displayName: 'Testing: special' - bash: | diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py index f22cb5b8e4805..22d2946db8f8c 100644 --- a/pl_examples/__init__.py +++ b/pl_examples/__init__.py @@ -1,5 +1,4 @@ import os -from urllib.error import HTTPError from six.moves import urllib @@ -14,16 +13,8 @@ _PACKAGE_ROOT = os.path.dirname(_EXAMPLES_ROOT) _DATASETS_PATH = os.path.join(_PACKAGE_ROOT, 'Datasets') -_TORCHVISION_MNIST_AVAILABLE = not bool(os.environ.get("PL_USE_MOCKED_MNIST", False)) _DALI_AVAILABLE = _module_available("nvidia.dali") -if _TORCHVISION_MNIST_AVAILABLE: - try: - from torchvision.datasets.mnist import MNIST - MNIST(_DATASETS_PATH, download=True) - except HTTPError: - _TORCHVISION_MNIST_AVAILABLE = False - LIGHTNING_LOGO = """ #### ########### diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py index 94e4fbfcf7ae2..8278e695af452 100644 --- a/pl_examples/basic_examples/autoencoder.py +++ b/pl_examples/basic_examples/autoencoder.py @@ -24,16 +24,13 @@ from torch.utils.data import DataLoader, random_split import pytorch_lightning as pl -from pl_examples import 
diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py
index 94e4fbfcf7ae2..8278e695af452 100644
--- a/pl_examples/basic_examples/autoencoder.py
+++ b/pl_examples/basic_examples/autoencoder.py
@@ -24,16 +24,13 @@
 from torch.utils.data import DataLoader, random_split

 import pytorch_lightning as pl
-from pl_examples import _DATASETS_PATH, _TORCHVISION_MNIST_AVAILABLE, cli_lightning_logo
+from pl_examples import _DATASETS_PATH, cli_lightning_logo
+from pl_examples.basic_examples.mnist_datamodule import MNIST
 from pytorch_lightning.utilities.cli import LightningCLI
 from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE

 if _TORCHVISION_AVAILABLE:
     from torchvision import transforms
-if _TORCHVISION_MNIST_AVAILABLE:
-    from torchvision.datasets import MNIST
-else:
-    from tests.helpers.datasets import MNIST


 class LitAutoEncoder(pl.LightningModule):
diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py
index 381cda088ea9d..c25d27bc4288d 100644
--- a/pl_examples/basic_examples/backbone_image_classifier.py
+++ b/pl_examples/basic_examples/backbone_image_classifier.py
@@ -24,16 +24,13 @@
 from torch.utils.data import DataLoader, random_split

 import pytorch_lightning as pl
-from pl_examples import _DATASETS_PATH, _TORCHVISION_MNIST_AVAILABLE, cli_lightning_logo
+from pl_examples import _DATASETS_PATH, cli_lightning_logo
+from pl_examples.basic_examples.mnist_datamodule import MNIST
 from pytorch_lightning.utilities.cli import LightningCLI
 from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE

 if _TORCHVISION_AVAILABLE:
     from torchvision import transforms
-if _TORCHVISION_MNIST_AVAILABLE:
-    from torchvision.datasets import MNIST
-else:
-    from tests.helpers.datasets import MNIST


 class Backbone(torch.nn.Module):
diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py
index 572f9a3a0c596..70c63c234c9ca 100644
--- a/pl_examples/basic_examples/dali_image_classifier.py
+++ b/pl_examples/basic_examples/dali_image_classifier.py
@@ -22,16 +22,13 @@
 from torch.utils.data import random_split

 import pytorch_lightning as pl
-from pl_examples import _DALI_AVAILABLE, _DATASETS_PATH, _TORCHVISION_MNIST_AVAILABLE, cli_lightning_logo
+from pl_examples import _DALI_AVAILABLE, _DATASETS_PATH, cli_lightning_logo
+from pl_examples.basic_examples.mnist_datamodule import MNIST
 from pytorch_lightning.utilities.cli import LightningCLI
 from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE

 if _TORCHVISION_AVAILABLE:
     from torchvision import transforms
-if _TORCHVISION_MNIST_AVAILABLE:
-    from torchvision.datasets import MNIST
-else:
-    from tests.helpers.datasets import MNIST

 if _DALI_AVAILABLE:
     from nvidia.dali import __version__ as dali_version
diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py
index ffb507a9404e6..6da3c768f5fb6 100644
--- a/pl_examples/basic_examples/mnist_datamodule.py
+++ b/pl_examples/basic_examples/mnist_datamodule.py
@@ -11,21 +11,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 import platform
 from typing import Optional
+from urllib.error import HTTPError
 from warnings import warn

 from torch.utils.data import DataLoader, random_split

-from pl_examples import _DATASETS_PATH, _TORCHVISION_MNIST_AVAILABLE
+from pl_examples import _DATASETS_PATH
 from pytorch_lightning import LightningDataModule
 from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE

 if _TORCHVISION_AVAILABLE:
     from torchvision import transforms as transform_lib
+
+_TORCHVISION_MNIST_AVAILABLE = not bool(os.getenv("PL_USE_MOCKED_MNIST", False))
 if _TORCHVISION_MNIST_AVAILABLE:
-    from torchvision.datasets import MNIST
-else:
+    try:
+        from torchvision.datasets.mnist import MNIST
+        MNIST(_DATASETS_PATH, download=True)
+    except HTTPError as e:
+        print(f"Error {e} downloading torchvision.MNIST")
+        _TORCHVISION_MNIST_AVAILABLE = False
+if not _TORCHVISION_MNIST_AVAILABLE:
+    print("torchvision MNIST not available. Using our own")
     from tests.helpers.datasets import MNIST
diff --git a/pl_examples/domain_templates/generative_adversarial_net.py b/pl_examples/domain_templates/generative_adversarial_net.py
index 19bce65746f65..70524bab3e845 100644
--- a/pl_examples/domain_templates/generative_adversarial_net.py
+++ b/pl_examples/domain_templates/generative_adversarial_net.py
@@ -28,7 +28,8 @@
 import torch.nn.functional as F  # noqa
 from torch.utils.data import DataLoader

-from pl_examples import _TORCHVISION_MNIST_AVAILABLE, cli_lightning_logo
+from pl_examples import cli_lightning_logo
+from pl_examples.basic_examples.mnist_datamodule import MNIST
 from pytorch_lightning.core import LightningDataModule, LightningModule
 from pytorch_lightning.trainer import Trainer
 from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE
@@ -36,10 +37,6 @@
 if _TORCHVISION_AVAILABLE:
     import torchvision
     from torchvision import transforms
-if _TORCHVISION_MNIST_AVAILABLE:
-    from torchvision.datasets import MNIST
-else:
-    from tests.helpers.datasets import MNIST


 class Generator(nn.Module):
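PATCH 23 centralizes the try-import-or-fall-back dance in one module, so every example just does ``from pl_examples.basic_examples.mnist_datamodule import MNIST``. The general pattern, as a self-contained sketch (the fallback module path comes from the Lightning test suite; the probe root ``"."`` is illustrative):

    import os
    from urllib.error import HTTPError

    _USE_REAL_MNIST = not bool(os.getenv("PL_USE_MOCKED_MNIST", False))
    if _USE_REAL_MNIST:
        try:
            from torchvision.datasets import MNIST
            MNIST(".", download=True)  # probe the download once, at import time
        except HTTPError:
            _USE_REAL_MNIST = False
    if not _USE_REAL_MNIST:
        from tests.helpers.datasets import MNIST  # self-hosted stand-in

    # downstream code imports MNIST from this module and never needs to know which one it got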
From 45b0d5120bd0e8fedec3f9d117eff5b00d2dda5e Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 19 Jul 2021 02:31:25 +0200
Subject: [PATCH 24/26] Undo debugging

---
 .azure-pipelines/gpu-tests.yml | 17 +++++++++++++----
 tests/special_tests.sh         |  2 +-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index d239579e6960e..dec51b7cf8dd1 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -61,6 +61,15 @@ jobs:
       python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
     displayName: 'Env details'

+  - bash: |
+      wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/
+      unzip -o legacy/checkpoints.zip -d legacy/
+      ls -l legacy/checkpoints/
+    displayName: 'Get legacy checkpoints'
+  - bash: |
+      python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+    displayName: 'Testing: standard'
+
   - bash: |
       bash tests/special_tests.sh
     env:
@@ -91,10 +100,6 @@ jobs:
       testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
     condition: succeededOrFailed()

-  - bash: |
-      python -m pytest benchmarks -v --maxfail=2 --durations=0
-    displayName: 'Testing: benchmarks'
-
   - script: |
       set -e
       python -m pytest pl_examples -v --maxfail=2 --durations=0
@@ -104,3 +109,7 @@
     env:
       PL_USE_MOCKED_MNIST: "1"
     displayName: 'Testing: examples'
+
+  - bash: |
+      python -m pytest benchmarks -v --maxfail=2 --durations=0
+    displayName: 'Testing: benchmarks'
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index 977c37c15bac8..96d1e3ba4affb 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -62,7 +62,7 @@ for i in "${!files_arr[@]}"; do

     # run the test
     report+="Ran\t$file:$lineno::$test_name\n"
-    #python ${defaults} "${file}::${test_name}"
+    python ${defaults} "${file}::${test_name}"
     break
   fi
 done < <(echo "$test_code")

From bdae378f2ef7ea9c349eee5c29b101f2c15809ee Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 19 Jul 2021 02:45:16 +0200
Subject: [PATCH 25/26] Prints

---
 .azure-pipelines/gpu-tests.yml                 | 1 +
 pl_examples/basic_examples/mnist_datamodule.py | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index dec51b7cf8dd1..bdf32cb45adce 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -66,6 +66,7 @@ jobs:
       unzip -o legacy/checkpoints.zip -d legacy/
       ls -l legacy/checkpoints/
     displayName: 'Get legacy checkpoints'
+
   - bash: |
       python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
     displayName: 'Testing: standard'
diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py
index 6da3c768f5fb6..90053c04e79ee 100644
--- a/pl_examples/basic_examples/mnist_datamodule.py
+++ b/pl_examples/basic_examples/mnist_datamodule.py
@@ -29,13 +29,13 @@
 _TORCHVISION_MNIST_AVAILABLE = not bool(os.getenv("PL_USE_MOCKED_MNIST", False))
 if _TORCHVISION_MNIST_AVAILABLE:
     try:
-        from torchvision.datasets.mnist import MNIST
+        from torchvision.datasets import MNIST
         MNIST(_DATASETS_PATH, download=True)
     except HTTPError as e:
-        print(f"Error {e} downloading torchvision.MNIST")
+        print(f"Error {e} downloading `torchvision.datasets.MNIST`")
         _TORCHVISION_MNIST_AVAILABLE = False
 if not _TORCHVISION_MNIST_AVAILABLE:
-    print("torchvision MNIST not available. Using our own")
+    print("`torchvision.datasets.MNIST` not available. Using our hosted version")
     from tests.helpers.datasets import MNIST

From 4df2ac2cd0b78aa02ba01c1b4a1c56cc47bf3ae6 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 19 Jul 2021 03:00:33 +0200
Subject: [PATCH 26/26] Revert "Avoid file not found"

This reverts commit b709a8fdac5979ba511315a0ef9618c504490e50.
---
 pl_examples/__init__.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py
index 093b1fd11650b..f22cb5b8e4805 100644
--- a/pl_examples/__init__.py
+++ b/pl_examples/__init__.py
@@ -14,7 +14,7 @@
 _PACKAGE_ROOT = os.path.dirname(_EXAMPLES_ROOT)
 _DATASETS_PATH = os.path.join(_PACKAGE_ROOT, 'Datasets')

-_TORCHVISION_MNIST_AVAILABLE = not bool(os.getenv("PL_USE_MOCKED_MNIST", False))
+_TORCHVISION_MNIST_AVAILABLE = not bool(os.environ.get("PL_USE_MOCKED_MNIST", False))
 _DALI_AVAILABLE = _module_available("nvidia.dali")

 if _TORCHVISION_MNIST_AVAILABLE:
@@ -23,13 +23,6 @@
         MNIST(_DATASETS_PATH, download=True)
     except HTTPError:
         _TORCHVISION_MNIST_AVAILABLE = False
-    except RuntimeError as e:
-        # `torchvision` can produce the following error randomly.
-        # File "/usr/local/lib/python3.7/dist-packages/torchvision/datasets/utils.py", line 145, in download_url
-        #   raise RuntimeError("File not found or corrupted.")
-        if "File not found" not in str(e):
-            raise
-        _TORCHVISION_MNIST_AVAILABLE = False

 LIGHTNING_LOGO = """
                     ####