diff --git a/CHANGELOG.md b/CHANGELOG.md
index 400fe58b4eaee..15658ab8d810d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -72,7 +72,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Removed the deprecated `prepare_data_per_node` argument from the `Trainer` constructor ([#12536](https://github.com/PyTorchLightning/pytorch-lightning/pull/12536))
 
--
+- Removed the deprecated `terminate_on_nan` argument from the `Trainer` constructor ([#12553](https://github.com/PyTorchLightning/pytorch-lightning/pull/12553))
 
 -
 
diff --git a/pytorch_lightning/loops/optimization/optimizer_loop.py b/pytorch_lightning/loops/optimization/optimizer_loop.py
index f9068b87b653d..5cd81aa30f48f 100644
--- a/pytorch_lightning/loops/optimization/optimizer_loop.py
+++ b/pytorch_lightning/loops/optimization/optimizer_loop.py
@@ -27,12 +27,10 @@
     _block_parallel_sync_behavior,
     _build_training_step_kwargs,
     _extract_hiddens,
-    check_finite_loss,
 )
 from pytorch_lightning.trainer.progress import OptimizationProgress
 from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.finite_checks import detect_nan_parameters
 from pytorch_lightning.utilities.types import STEP_OUTPUT
 from pytorch_lightning.utilities.warnings import WarningCache
 
@@ -310,10 +308,6 @@ def _make_backward_fn(self, optimizer: Optimizer, opt_idx: int) -> Optional[Call
         def backward_fn(loss: Tensor) -> None:
             self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
 
-            # check if model weights are nan
-            if self.trainer._terminate_on_nan:
-                detect_nan_parameters(self.trainer.lightning_module)
-
         return backward_fn
 
     def _run_optimization_start(self, opt_idx: int, optimizer: torch.optim.Optimizer) -> None:
@@ -437,9 +431,6 @@ def _training_step(self, split_batch: Any, batch_idx: int, opt_idx: int) -> Clos
                 training_step_output, self.trainer.accumulate_grad_batches
             )
 
-            if self.trainer._terminate_on_nan:
-                check_finite_loss(result.closure_loss)
-
             if self.trainer.move_metrics_to_cpu:
                 # hiddens and the training step output are not moved as they are not considered "metrics"
                 assert self.trainer._results is not None
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index add2b64c59fdc..743f2dd389c4c 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -186,7 +186,6 @@ def __init__(
         amp_level: Optional[str] = None,
         move_metrics_to_cpu: bool = False,
         multiple_trainloader_mode: str = "max_size_cycle",
-        terminate_on_nan: Optional[bool] = None,
     ) -> None:
         r"""
         Customize every aspect of training via flags.
@@ -386,16 +385,6 @@ def __init__(
             sync_batchnorm: Synchronize batch norm layers between process groups/whole world.
                 Default: ``False``.
 
-            terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`) at the
-                end of each training batch, if any of the parameters or the loss are NaN or +/-inf.
-
-                .. deprecated:: v1.5
-                    Trainer argument ``terminate_on_nan`` was deprecated in v1.5 and will be removed in 1.7.
-                    Please use ``detect_anomaly`` instead.
-
-            detect_anomaly: Enable anomaly detection for the autograd engine.
-                Default: ``False``.
-
             tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on (1)
                 Default: ``None``.
 
@@ -535,14 +524,6 @@ def __init__(
             reload_dataloaders_every_n_epochs,
         )
 
-        if terminate_on_nan is not None:
-            rank_zero_deprecation(
-                "Trainer argument `terminate_on_nan` was deprecated in v1.5 and will be removed in 1.7."
-                " Please use `Trainer(detect_anomaly=True)` instead."
-            )
-            if not isinstance(terminate_on_nan, bool):
-                raise TypeError(f"`terminate_on_nan` should be a bool, got {terminate_on_nan}.")
-
         # gradient clipping
         if gradient_clip_val is not None and not isinstance(gradient_clip_val, (int, float)):
             raise TypeError(f"`gradient_clip_val` should be an int or a float. Got {gradient_clip_val}.")
@@ -563,7 +544,6 @@ def __init__(
                 f"`track_grad_norm` must be a positive number or 'inf' (infinity norm). Got {track_grad_norm}."
             )
 
-        self._terminate_on_nan = terminate_on_nan
         self.gradient_clip_val: Union[int, float] = gradient_clip_val
         self.gradient_clip_algorithm: Optional[GradClipAlgorithmType] = (
             GradClipAlgorithmType(gradient_clip_algorithm.lower()) if gradient_clip_algorithm is not None else None
@@ -2797,19 +2777,6 @@ def configure_optimizers(self):
         max_estimated_steps = min(max_estimated_steps, self.max_steps) if self.max_steps != -1 else max_estimated_steps
         return max_estimated_steps
 
-    @property
-    def terminate_on_nan(self) -> bool:
-        rank_zero_deprecation("`Trainer.terminate_on_nan` is deprecated in v1.5 and will be removed in 1.7.")
-        return self._terminate_on_nan
-
-    @terminate_on_nan.setter
-    def terminate_on_nan(self, val: bool) -> None:
-        rank_zero_deprecation(
-            f"Setting `Trainer.terminate_on_nan = {val}` is deprecated in v1.5 and will be removed in 1.7."
-            f" Please set `Trainer(detect_anomaly={val})` instead."
-        )
-        self._terminate_on_nan = val
-
 
 def _determine_batch_limits(batches: Optional[Union[int, float]], name: str) -> Union[int, float]:
     if batches is None:
diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py
index 167b3095d91b0..8e575224680e8 100644
--- a/tests/deprecated_api/test_remove_1-7.py
+++ b/tests/deprecated_api/test_remove_1-7.py
@@ -119,23 +119,6 @@ def get_progress_bar_dict(self):
         _ = trainer.progress_bar_dict
 
 
-@pytest.mark.parametrize("terminate_on_nan", [True, False])
-def test_v1_7_0_trainer_terminate_on_nan(tmpdir, terminate_on_nan):
-    with pytest.deprecated_call(
-        match="Trainer argument `terminate_on_nan` was deprecated in v1.5 and will be removed in 1.7"
-    ):
-        trainer = Trainer(terminate_on_nan=terminate_on_nan)
-        assert trainer.terminate_on_nan is terminate_on_nan
-        assert trainer._detect_anomaly is False
-
-    trainer = Trainer()
-    with pytest.deprecated_call(match=r"`Trainer.terminate_on_nan` is deprecated in v1.5"):
-        _ = trainer.terminate_on_nan
-
-    with pytest.deprecated_call(match=r"Setting `Trainer.terminate_on_nan = True` is deprecated in v1.5"):
-        trainer.terminate_on_nan = True
-
-
 def test_v1_7_0_deprecated_on_task_dataloader(tmpdir):
     class CustomBoringModel(BoringModel):
         def on_train_dataloader(self):
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 7ac94538356fa..de495dd4c12d6 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -904,72 +904,12 @@ def validation_epoch_end(self, *args, **kwargs):
     assert model.validation_epoch_end_invoked, "did not run `validation_epoch_end` with `fast_dev_run=True`"
 
 
-@mock.patch("torch.Tensor.backward")
-def test_nan_loss_detection(backward_mock, tmpdir):
-    class CurrentModel(BoringModel):
-        test_batch_inf = 3
-
-        def training_step(self, batch, batch_idx):
-            output = super().training_step(batch, batch_idx)
-            if batch_idx == self.test_batch_inf:
-                if isinstance(output, dict):
-                    output["loss"] *= torch.tensor(math.inf)  # make loss infinite
-                else:
-                    output /= 0
-            return output
-
-    model = CurrentModel()
-
-    with pytest.deprecated_call(match="terminate_on_nan` was deprecated in v1.5"):
-        trainer = Trainer(default_root_dir=tmpdir, max_steps=(model.test_batch_inf + 1), terminate_on_nan=True)
-
-    with pytest.raises(ValueError, match=r".*The loss returned in `training_step` is.*"):
-        trainer.fit(model)
-        assert trainer.global_step == model.test_batch_inf
-        assert backward_mock.call_count == model.test_batch_inf
-
-    for param in model.parameters():
-        assert torch.isfinite(param).all()
-
-
-def test_invalid_terminate_on_nan(tmpdir):
-    with pytest.raises(TypeError, match="`terminate_on_nan` should be a bool"), pytest.deprecated_call(
-        match="terminate_on_nan` was deprecated in v1.5"
-    ):
-        Trainer(default_root_dir=tmpdir, terminate_on_nan="False")
-
-
 @pytest.mark.parametrize("track_grad_norm", [0, torch.tensor(1), "nan"])
 def test_invalid_track_grad_norm(tmpdir, track_grad_norm):
     with pytest.raises(MisconfigurationException, match="`track_grad_norm` must be a positive number or 'inf'"):
         Trainer(default_root_dir=tmpdir, track_grad_norm=track_grad_norm)
 
 
-@mock.patch("torch.Tensor.backward")
-def test_nan_params_detection(backward_mock, tmpdir):
-    class CurrentModel(BoringModel):
-        test_batch_nan = 3
-
-        def on_after_backward(self):
-            if self.global_step == self.test_batch_nan:
-                # simulate parameter that became nan
-                torch.nn.init.constant_(self.layer.bias, math.nan)
-
-    model = CurrentModel()
-
-    with pytest.deprecated_call(match="terminate_on_nan` was deprecated in v1.5"):
-        trainer = Trainer(default_root_dir=tmpdir, max_steps=(model.test_batch_nan + 1), terminate_on_nan=True)
-
-    with pytest.raises(ValueError, match=r".*Detected nan and/or inf values in `layer.bias`.*"):
-        trainer.fit(model)
-        assert trainer.global_step == model.test_batch_nan
-        assert backward_mock.call_count == model.test_batch_nan + 1
-
-    # after aborting the training loop, model still has nan-valued params
-    params = torch.cat([param.view(-1) for param in model.parameters()])
-    assert not torch.isfinite(params).all()
-
-
 def test_on_exception_hook(tmpdir):
     """Test the on_exception callback hook and the trainer interrupted flag."""
diff --git a/tests/utilities/test_finite_checks.py b/tests/utilities/test_finite_checks.py
new file mode 100644
index 0000000000000..95a990306a9c7
--- /dev/null
+++ b/tests/utilities/test_finite_checks.py
@@ -0,0 +1,20 @@
+import math
+
+import pytest
+import torch
+import torch.nn as nn
+
+from pytorch_lightning.utilities.finite_checks import detect_nan_parameters
+
+
+@pytest.mark.parametrize("value", (math.nan, math.inf, -math.inf))
+def test_detect_nan_parameters(value):
+    model = nn.Linear(2, 3)
+
+    detect_nan_parameters(model)
+
+    nn.init.constant_(model.bias, value)
+    assert not torch.isfinite(model.bias).all()
+
+    with pytest.raises(ValueError, match=r".*Detected nan and/or inf values in `bias`.*"):
+        detect_nan_parameters(model)
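
For code that still passes the removed flag, here is a minimal migration sketch. Assumptions: the `NanParameterCheck` callback name is hypothetical; `Trainer(detect_anomaly=...)` is the replacement named in the removed deprecation messages, and `detect_nan_parameters` is the utility this diff keeps and now tests directly in `tests/utilities/test_finite_checks.py`.

```python
# Migration sketch, not part of this diff. `NanParameterCheck` is a hypothetical
# callback name used for illustration only.
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.utilities.finite_checks import detect_nan_parameters


class NanParameterCheck(Callback):
    """Raise a ValueError after the backward pass if any parameter is NaN or +/-inf."""

    def on_after_backward(self, trainer, pl_module):
        # Same check the removed `terminate_on_nan` path ran on the LightningModule.
        detect_nan_parameters(pl_module)


# Before (removed by this PR):  trainer = Trainer(terminate_on_nan=True)
# After: enable autograd anomaly detection, optionally keeping the parameter check.
trainer = Trainer(detect_anomaly=True, callbacks=[NanParameterCheck()])
```

`detect_anomaly=True` enables `torch.autograd` anomaly detection, which errors out when NaNs are produced during the backward pass; the callback above approximates the parameter-level check that `terminate_on_nan=True` used to perform after each backward call.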