diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3d8fbf05573f6..25529d2692ed9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -163,6 +163,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `pl_legacy_patch` load utility for loading old checkpoints that have pickled legacy Lightning attributes ([#9166](https://github.com/PyTorchLightning/pytorch-lightning/pull/9166))
 
 
+- Added support for `torch.use_deterministic_algorithms` ([#9121](https://github.com/PyTorchLightning/pytorch-lightning/pull/9121))
+
+
 ### Changed
 
 - `pytorch_lightning.loggers.neptune.NeptuneLogger` is now consistent with new [neptune-client](https://github.com/neptune-ai/neptune-client) API ([#6867](https://github.com/PyTorchLightning/pytorch-lightning/pull/6867)).
@@ -225,9 +228,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Executing the `optimizer_closure` is now required when overriding the `optimizer_step` hook ([#9360](https://github.com/PyTorchLightning/pytorch-lightning/pull/9360))
 
 
-- Removed `TrainerProperties` mixin and moved property definitions directly into `Trainer` ([#9495](https://github.com/PyTorchLightning/pytorch-lightning/pull/9495))
-
-
 - Changed logging of `LightningModule` and `LightningDataModule` hyperparameters to raise an exception only if there are colliding keys with different values ([#9496](https://github.com/PyTorchLightning/pytorch-lightning/pull/9496))
 
 
@@ -394,6 +394,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed `call_configure_sharded_model_hook` property from `Accelerator` and `TrainingTypePlugin` ([#9612](https://github.com/PyTorchLightning/pytorch-lightning/pull/9612))
 
 
+- Removed `TrainerProperties` mixin and moved property definitions directly into `Trainer` ([#9495](https://github.com/PyTorchLightning/pytorch-lightning/pull/9495))
+
+
 ### Fixed
 
 
diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py
index 6612f76280076..e9442dd26e65b 100644
--- a/benchmarks/test_basic_parity.py
+++ b/benchmarks/test_basic_parity.py
@@ -151,6 +151,7 @@ def vanilla_loop(cls_model, idx, device_type: str = "cuda", num_epochs=10):
 
 def lightning_loop(cls_model, idx, device_type: str = "cuda", num_epochs=10):
     seed_everything(idx)
+    torch.backends.cudnn.deterministic = True
     model = cls_model()
 
     # init model parts
@@ -161,7 +162,6 @@ def lightning_loop(cls_model, idx, device_type: str = "cuda", num_epochs=10):
         weights_summary=None,
         gpus=1 if device_type == "cuda" else 0,
         checkpoint_callback=False,
-        deterministic=True,
         logger=False,
         replace_sampler_ddp=False,
     )
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 61e87a67c7fac..dd411da7bc995 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -60,10 +60,6 @@
     TorchElasticEnvironment,
 )
 from pytorch_lightning.utilities import (
-    _APEX_AVAILABLE,
-    _HOROVOD_AVAILABLE,
-    _IPU_AVAILABLE,
-    _TPU_AVAILABLE,
     AMPType,
     device_parser,
     DeviceType,
@@ -74,6 +70,14 @@
 )
 from pytorch_lightning.utilities.enums import PrecisionType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import (
+    _APEX_AVAILABLE,
+    _HOROVOD_AVAILABLE,
+    _IPU_AVAILABLE,
+    _TORCH_GREATER_EQUAL_1_7,
+    _TORCH_GREATER_EQUAL_1_8,
+    _TPU_AVAILABLE,
+)
 
 if _HOROVOD_AVAILABLE:
     import horovod.torch as hvd
@@ -96,7 +100,7 @@ def __init__(
         sync_batchnorm,
         benchmark,
         replace_sampler_ddp,
-        deterministic,
+        deterministic: bool,
         precision,
         amp_type,
         amp_level,
@@ -113,6 +117,7 @@ def __init__(
                 f" Use `Trainer(accelerator={distributed_backend})` instead."
            )
         distributed_backend = distributed_backend or accelerator
+        self._init_deterministic(deterministic)
 
         self.num_processes = num_processes
         self.devices = devices
@@ -126,7 +131,6 @@ def __init__(
         self.sync_batchnorm = sync_batchnorm
         self.benchmark = benchmark
         self.replace_sampler_ddp = replace_sampler_ddp
-        self.deterministic = deterministic
         self.precision = precision
         self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None
         self.amp_level = amp_level
@@ -177,15 +181,22 @@ def __init__(
         # TODO: should this be moved to GPU accelerator?
         torch.backends.cudnn.benchmark = self.benchmark
 
-        # determinism for cudnn
-        # TODO: should this be moved to GPU accelerator?
-        torch.backends.cudnn.deterministic = deterministic
+        self.replace_sampler_ddp = replace_sampler_ddp
+
+    def _init_deterministic(self, deterministic: bool) -> None:
+        self.deterministic = deterministic
+        if _TORCH_GREATER_EQUAL_1_8:
+            torch.use_deterministic_algorithms(deterministic)
+        elif _TORCH_GREATER_EQUAL_1_7:
+            torch.set_deterministic(deterministic)
+        else:  # the minimum version Lightning supports is PyTorch 1.6
+            torch._set_deterministic(deterministic)
         if deterministic:
             # fixing non-deterministic part of horovod
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
-
-        self.replace_sampler_ddp = replace_sampler_ddp
+            # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
+            os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 
     def select_accelerator_type(self) -> None:
         if self.distributed_backend == "auto":
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index f49c892e37191..7a85afeb70928 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -222,7 +222,8 @@ def __init__(
                 Default: ``os.getcwd()``.
                 Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/'
 
-            deterministic: If true enables cudnn.deterministic.
+            deterministic: If ``True``, forces PyTorch operations to use deterministic algorithms.
+                Default: ``False``.
 
             devices: Will be mapped to either `gpus`, `tpu_cores`, `num_processes` or `ipus`,
                 based on the accelerator type.
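From the user side, the flag documented above now drives the version-gated toggle in `_init_deterministic` rather than only `torch.backends.cudnn.deterministic`. A minimal sketch of the intended usage follows; it is illustrative only and not part of this patch, and `TinyModel` is a placeholder module rather than anything defined in the repository.

import torch
from torch.utils.data import DataLoader, TensorDataset

from pytorch_lightning import LightningModule, Trainer, seed_everything


class TinyModel(LightningModule):
    """Placeholder model, only here to exercise the flag."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


seed_everything(1)  # fix the python, numpy and torch RNGs

# With this change, `deterministic=True` calls `torch.use_deterministic_algorithms(True)`
# on PyTorch >= 1.8 and exports CUBLAS_WORKSPACE_CONFIG=":4096:8" for cuBLAS reproducibility.
trainer = Trainer(deterministic=True, max_epochs=1, logger=False, checkpoint_callback=False)
trainer.fit(TinyModel(), DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=8))

On PyTorch 1.6 and 1.7 the same call path falls back to `torch._set_deterministic` / `torch.set_deterministic`, as shown in the connector change above.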
diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py
index 93564e27defa9..cb1560d2af355 100644
--- a/tests/accelerators/test_common.py
+++ b/tests/accelerators/test_common.py
@@ -16,6 +16,7 @@
 
 import tests.helpers.utils as tutils
 from pytorch_lightning import Trainer
+from pytorch_lightning.utilities.seed import seed_everything
 from tests.accelerators.test_dp import CustomClassificationModelDP
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.datamodules import ClassifDataModule
@@ -32,16 +33,11 @@
 )
 def test_evaluate(tmpdir, trainer_kwargs):
     tutils.set_random_master_port()
-
+    seed_everything(1)
     dm = ClassifDataModule()
     model = CustomClassificationModelDP()
     trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=2,
-        limit_train_batches=10,
-        limit_val_batches=10,
-        deterministic=True,
-        **trainer_kwargs
+        default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, **trainer_kwargs
     )
 
     trainer.fit(model, datamodule=dm)
@@ -49,11 +45,8 @@ def test_evaluate(tmpdir, trainer_kwargs):
 
     old_weights = model.layer_0.weight.clone().detach().cpu()
 
-    result = trainer.validate(datamodule=dm)
-    assert result[0]["val_acc"] > 0.55
-
-    result = trainer.test(datamodule=dm)
-    assert result[0]["test_acc"] > 0.55
+    trainer.validate(datamodule=dm)
+    trainer.test(datamodule=dm)
 
     # make sure weights didn't change
     new_weights = model.layer_0.weight.clone().detach().cpu()
diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py
index 040cd642556cf..0910959fc7e7c 100644
--- a/tests/checkpointing/test_legacy_checkpoints.py
+++ b/tests/checkpointing/test_legacy_checkpoints.py
@@ -83,9 +83,9 @@ def test_resume_legacy_checkpoints(tmpdir, pl_version: str):
         callbacks=[es, stop],
         max_epochs=21,
         accumulate_grad_batches=2,
-        deterministic=True,
         resume_from_checkpoint=path_ckpt,
     )
+    torch.backends.cudnn.deterministic = True
     trainer.fit(model, datamodule=dm)
     res = trainer.test(model, datamodule=dm)
     assert res[0]["test_loss"] <= 0.7
diff --git a/tests/conftest.py b/tests/conftest.py
index 36110e6c57c37..860f9357e4636 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -22,6 +22,7 @@
 import torch.distributed
 
 from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8
 from tests import _PATH_DATASETS
 
 
@@ -52,6 +53,7 @@ def restore_env_variables():
     os.environ.update(env_backup)
     # these are currently known leakers - ideally these would not be allowed
     allowlist = {
+        "CUBLAS_WORKSPACE_CONFIG",  # enabled with deterministic flag
         "CUDA_DEVICE_ORDER",
         "LOCAL_RANK",
         "NODE_RANK",
@@ -87,6 +89,18 @@ def teardown_process_group():
         torch.distributed.destroy_process_group()
 
 
+@pytest.fixture(scope="function", autouse=True)
+def reset_deterministic_algorithm():
+    """Ensures that torch determinism settings are reset before the next test runs."""
+    yield
+    if _TORCH_GREATER_EQUAL_1_8:
+        torch.use_deterministic_algorithms(False)
+    elif _TORCH_GREATER_EQUAL_1_7:
+        torch.set_deterministic(False)
+    else:  # the minimum version Lightning supports is PyTorch 1.6
+        torch._set_deterministic(False)
+
+
 @pytest.fixture
 def tmpdir_server(tmpdir):
     if sys.version_info >= (3, 7):
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index bb74040b2c37f..e58bd391feb82 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -78,7 +78,6 @@ def test_horovod_cpu(tmpdir):
         limit_train_batches=0.4,
         limit_val_batches=0.2,
         accelerator="horovod",
-        deterministic=True,
     )
     _run_horovod(trainer_options)
 
@@ -96,7 +95,6 @@ def test_horovod_cpu_clip_grad_by_value(tmpdir):
         limit_train_batches=0.4,
         limit_val_batches=0.2,
         accelerator="horovod",
-        deterministic=True,
     )
     _run_horovod(trainer_options)
 
@@ -112,7 +110,6 @@ def test_horovod_cpu_implicit(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        deterministic=True,
     )
     _run_horovod(trainer_options)
 
@@ -129,7 +126,6 @@ def test_horovod_multi_gpu(tmpdir):
         limit_train_batches=0.4,
         limit_val_batches=0.2,
         gpus=2,
-        deterministic=True,
         accelerator="horovod",
     )
     _run_horovod(trainer_options, on_gpu=True)
@@ -148,7 +144,6 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir):
         limit_train_batches=0.4,
         limit_val_batches=0.2,
         gpus=2,
-        deterministic=True,
         accelerator="horovod",
     )
     _run_horovod(trainer_options, on_gpu=True)
@@ -170,7 +165,6 @@ def test_horovod_apex(tmpdir):
         limit_train_batches=0.4,
         limit_val_batches=0.2,
         gpus=2,
-        deterministic=True,
         accelerator="horovod",
         amp_backend="apex",
         precision=16,
@@ -190,7 +184,6 @@ def test_horovod_amp(tmpdir):
         limit_train_batches=0.4,
         limit_val_batches=0.2,
         gpus=2,
-        deterministic=True,
         accelerator="horovod",
         amp_backend="native",
         precision=16,
@@ -210,7 +203,6 @@ def test_horovod_gather(tmpdir):
         limit_train_batches=0.4,
         limit_val_batches=0.2,
         gpus=2,
-        deterministic=True,
         accelerator="horovod",
     )
     _run_horovod(trainer_options, on_gpu=True)
@@ -236,7 +228,6 @@ def validation_step(self, batch, *args, **kwargs):
         limit_train_batches=0.4,
         limit_val_batches=0.2,
         gpus=1,
-        deterministic=True,
         accelerator="horovod",
     )
     tpipes.run_model_test_without_loggers(trainer_options, model)
@@ -253,7 +244,6 @@ def test_horovod_multi_optimizer(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        deterministic=True,
         accelerator="horovod",
     )
     trainer.fit(model)
diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py
index bab00943ef691..46cdcc7cf7e23 100644
--- a/tests/overrides/test_data_parallel.py
+++ b/tests/overrides/test_data_parallel.py
@@ -85,8 +85,11 @@ def training_step(self, batch, batch_idx):
             return {"loss": loss}
 
     model = TestModel()
-    model.trainer = Mock()
-    model.trainer.state.stage = RunningStage.TRAINING
+    trainer = MagicMock()
+    trainer.state.stage = RunningStage.TRAINING
+    trainer.accelerator_connector._init_deterministic(False)
+
+    model.trainer = trainer
     batch = torch.rand(2, 32).cuda()
     batch_idx = 0
 
@@ -123,8 +126,10 @@ def training_step(self, batch, batch_idx):
             return output
 
     model = TestModel().to(device)
-    model.trainer = Mock()
-    model.trainer.state.stage = RunningStage.TRAINING
+    trainer = MagicMock()
+    trainer.state.stage = RunningStage.TRAINING
+    trainer.accelerator_connector._init_deterministic(False)
+    model.trainer = trainer
     batch = torch.rand(2, 32).to(device)
     batch_idx = 0
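The same version-gated branching now appears in both `AcceleratorConnector._init_deterministic` and the autouse conftest fixture. For tests that want to scope determinism more tightly than an autouse fixture allows, it could be factored into a small context manager; a minimal sketch under that assumption follows (the `deterministic_algorithms` helper is hypothetical and not added by this patch; only the `_TORCH_GREATER_EQUAL_1_*` flags it imports exist in the codebase).

from contextlib import contextmanager

import torch

from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8


def _set_deterministic(enabled: bool) -> None:
    # Same branching as AcceleratorConnector._init_deterministic and the conftest fixture.
    if _TORCH_GREATER_EQUAL_1_8:
        torch.use_deterministic_algorithms(enabled)
    elif _TORCH_GREATER_EQUAL_1_7:
        torch.set_deterministic(enabled)
    else:  # the minimum version Lightning supports is PyTorch 1.6
        torch._set_deterministic(enabled)


@contextmanager
def deterministic_algorithms(enabled: bool = True):
    """Toggle torch determinism for a block, resetting to the non-deterministic default on exit."""
    _set_deterministic(enabled)
    try:
        yield
    finally:
        _set_deterministic(False)

A test could then wrap a single run in `with deterministic_algorithms():` and rely on the `finally` block, rather than on the fixture, to restore the non-deterministic default.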