From e95d5c6523c3d57288f960cc5bf809c9a8491f8e Mon Sep 17 00:00:00 2001 From: ananthsub Date: Fri, 10 Sep 2021 15:52:50 -0700 Subject: [PATCH 01/16] re-add changes --- CHANGELOG.md | 3 ++ benchmarks/test_basic_parity.py | 2 +- .../connectors/accelerator_connector.py | 32 ++++++++++++------- pytorch_lightning/trainer/trainer.py | 3 +- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d8fbf05573f6..3c50fb1ee2408 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -163,6 +163,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `pl_legacy_patch` load utility for loading old checkpoints that have pickled legacy Lightning attributes ([#9166](https://github.com/PyTorchLightning/pytorch-lightning/pull/9166)) +- Added support for `torch.use_deterministic_algorithms` ([#9121](https://github.com/PyTorchLightning/pytorch-lightning/pull/9121)) + + ### Changed - `pytorch_lightning.loggers.neptune.NeptuneLogger` is now consistent with new [neptune-client](https://github.com/neptune-ai/neptune-client) API ([#6867](https://github.com/PyTorchLightning/pytorch-lightning/pull/6867)). diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py index 6612f76280076..e9442dd26e65b 100644 --- a/benchmarks/test_basic_parity.py +++ b/benchmarks/test_basic_parity.py @@ -151,6 +151,7 @@ def vanilla_loop(cls_model, idx, device_type: str = "cuda", num_epochs=10): def lightning_loop(cls_model, idx, device_type: str = "cuda", num_epochs=10): seed_everything(idx) + torch.backends.cudnn.deterministic = True model = cls_model() # init model parts @@ -161,7 +162,6 @@ def lightning_loop(cls_model, idx, device_type: str = "cuda", num_epochs=10): weights_summary=None, gpus=1 if device_type == "cuda" else 0, checkpoint_callback=False, - deterministic=True, logger=False, replace_sampler_ddp=False, ) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 61e87a67c7fac..5d5d5d3a348ff 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -60,10 +60,6 @@ TorchElasticEnvironment, ) from pytorch_lightning.utilities import ( - _APEX_AVAILABLE, - _HOROVOD_AVAILABLE, - _IPU_AVAILABLE, - _TPU_AVAILABLE, AMPType, device_parser, DeviceType, @@ -74,6 +70,15 @@ ) from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import ( + _APEX_AVAILABLE, + _HOROVOD_AVAILABLE, + _IPU_AVAILABLE, + _NATIVE_AMP_AVAILABLE, + _TORCH_GREATER_EQUAL_1_7, + _TORCH_GREATER_EQUAL_1_8, + _TPU_AVAILABLE, +) if _HOROVOD_AVAILABLE: import horovod.torch as hvd @@ -96,7 +101,7 @@ def __init__( sync_batchnorm, benchmark, replace_sampler_ddp, - deterministic, + deterministic: bool, precision, amp_type, amp_level, @@ -113,6 +118,7 @@ def __init__( f" Use `Trainer(accelerator={distributed_backend})` instead." ) distributed_backend = distributed_backend or accelerator + self._init_deterministic(deterministic) self.num_processes = num_processes self.devices = devices @@ -126,7 +132,6 @@ def __init__( self.sync_batchnorm = sync_batchnorm self.benchmark = benchmark self.replace_sampler_ddp = replace_sampler_ddp - self.deterministic = deterministic self.precision = precision self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None self.amp_level = amp_level @@ -177,16 +182,21 @@ def __init__( # TODO: should this be moved to GPU accelerator? torch.backends.cudnn.benchmark = self.benchmark - # determinism for cudnn - # TODO: should this be moved to GPU accelerator? - torch.backends.cudnn.deterministic = deterministic + self.replace_sampler_ddp = replace_sampler_ddp + + def _init_deterministic(self, deterministic: bool) -> None: + self.deterministic = deterministic + if _TORCH_GREATER_EQUAL_1_8: + torch.use_deterministic_algorithms(deterministic) + elif _TORCH_GREATER_EQUAL_1_7: + torch.set_deterministic(deterministic) + else: # the minimum version Lightning supports is PyTorch 1.6 + torch._set_deterministic(deterministic) if deterministic: # fixing non-deterministic part of horovod # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0) - self.replace_sampler_ddp = replace_sampler_ddp - def select_accelerator_type(self) -> None: if self.distributed_backend == "auto": if self.has_tpu: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f49c892e37191..7a85afeb70928 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -222,7 +222,8 @@ def __init__( Default: ``os.getcwd()``. Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' - deterministic: If true enables cudnn.deterministic. + deterministic: If ``True``, sets whether PyTorch operations must use deterministic algorithms. + Default: ``False``. devices: Will be mapped to either `gpus`, `tpu_cores`, `num_processes` or `ipus`, based on the accelerator type. From af9d303dbe829680297b474b030aa616b53e1b3d Mon Sep 17 00:00:00 2001 From: ananthsub Date: Fri, 10 Sep 2021 16:39:24 -0700 Subject: [PATCH 02/16] Update test_data_parallel.py --- tests/overrides/test_data_parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py index bab00943ef691..5af2a164c459e 100644 --- a/tests/overrides/test_data_parallel.py +++ b/tests/overrides/test_data_parallel.py @@ -87,6 +87,7 @@ def training_step(self, batch, batch_idx): model = TestModel() model.trainer = Mock() model.trainer.state.stage = RunningStage.TRAINING + model.trainer.accelerator_connector.deterministic = False batch = torch.rand(2, 32).cuda() batch_idx = 0 @@ -125,6 +126,7 @@ def training_step(self, batch, batch_idx): model = TestModel().to(device) model.trainer = Mock() model.trainer.state.stage = RunningStage.TRAINING + model.trainer.accelerator_connector.deterministic = False batch = torch.rand(2, 32).to(device) batch_idx = 0 From ad5e2f7035808958d4b852c7622005f6a2dfd62b Mon Sep 17 00:00:00 2001 From: ananthsub Date: Fri, 24 Sep 2021 21:31:58 -0700 Subject: [PATCH 03/16] Update CHANGELOG.md --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c50fb1ee2408..25529d2692ed9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -228,9 +228,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Executing the `optimizer_closure` is now required when overriding the `optimizer_step` hook ([#9360](https://github.com/PyTorchLightning/pytorch-lightning/pull/9360)) -- Removed `TrainerProperties` mixin and moved property definitions directly into `Trainer` ([#9495](https://github.com/PyTorchLightning/pytorch-lightning/pull/9495)) - - - Changed logging of `LightningModule` and `LightningDataModule` hyperparameters to raise an exception only if there are colliding keys with different values ([#9496](https://github.com/PyTorchLightning/pytorch-lightning/pull/9496)) @@ -397,6 +394,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed `call_configure_sharded_model_hook` property from `Accelerator` and `TrainingTypePlugin` ([#9612](https://github.com/PyTorchLightning/pytorch-lightning/pull/9612)) +- Removed `TrainerProperties` mixin and moved property definitions directly into `Trainer` ([#9495](https://github.com/PyTorchLightning/pytorch-lightning/pull/9495)) + + ### Fixed From ab1c3286d2a84d442ac2de118e12f8e989a7d274 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Fri, 24 Sep 2021 22:55:49 -0700 Subject: [PATCH 04/16] Update test_legacy_checkpoints.py --- tests/checkpointing/test_legacy_checkpoints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py index 040cd642556cf..0910959fc7e7c 100644 --- a/tests/checkpointing/test_legacy_checkpoints.py +++ b/tests/checkpointing/test_legacy_checkpoints.py @@ -83,9 +83,9 @@ def test_resume_legacy_checkpoints(tmpdir, pl_version: str): callbacks=[es, stop], max_epochs=21, accumulate_grad_batches=2, - deterministic=True, resume_from_checkpoint=path_ckpt, ) + torch.backends.cudnn.deterministic = True trainer.fit(model, datamodule=dm) res = trainer.test(model, datamodule=dm) assert res[0]["test_loss"] <= 0.7 From da175005cbab2376156f08cb04f5c5d21b5c24aa Mon Sep 17 00:00:00 2001 From: ananthsub Date: Fri, 24 Sep 2021 23:22:51 -0700 Subject: [PATCH 05/16] Update test_horovod.py --- tests/models/test_horovod.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index bb74040b2c37f..18fbcfbc09d42 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -78,7 +78,6 @@ def test_horovod_cpu(tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, accelerator="horovod", - deterministic=True, ) _run_horovod(trainer_options) @@ -170,7 +169,6 @@ def test_horovod_apex(tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, gpus=2, - deterministic=True, accelerator="horovod", amp_backend="apex", precision=16, @@ -190,7 +188,6 @@ def test_horovod_amp(tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, gpus=2, - deterministic=True, accelerator="horovod", amp_backend="native", precision=16, @@ -210,7 +207,6 @@ def test_horovod_gather(tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, gpus=2, - deterministic=True, accelerator="horovod", ) _run_horovod(trainer_options, on_gpu=True) @@ -236,7 +232,6 @@ def validation_step(self, batch, *args, **kwargs): limit_train_batches=0.4, limit_val_batches=0.2, gpus=1, - deterministic=True, accelerator="horovod", ) tpipes.run_model_test_without_loggers(trainer_options, model) @@ -253,7 +248,6 @@ def test_horovod_multi_optimizer(tmpdir): max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, - deterministic=True, accelerator="horovod", ) trainer.fit(model) From 3b2ad552333c91fdd06ff18e74ad6a20c53e66eb Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 15:31:07 -0700 Subject: [PATCH 06/16] Update test_horovod.py --- tests/models/test_horovod.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 18fbcfbc09d42..e58bd391feb82 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -95,7 +95,6 @@ def test_horovod_cpu_clip_grad_by_value(tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, accelerator="horovod", - deterministic=True, ) _run_horovod(trainer_options) @@ -111,7 +110,6 @@ def test_horovod_cpu_implicit(tmpdir): max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, - deterministic=True, ) _run_horovod(trainer_options) @@ -128,7 +126,6 @@ def test_horovod_multi_gpu(tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, gpus=2, - deterministic=True, accelerator="horovod", ) _run_horovod(trainer_options, on_gpu=True) @@ -147,7 +144,6 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, gpus=2, - deterministic=True, accelerator="horovod", ) _run_horovod(trainer_options, on_gpu=True) From a75d07ed0fa3fed8b54cf69d3c56867779efadee Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 15:34:10 -0700 Subject: [PATCH 07/16] Update accelerator_connector.py --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 5d5d5d3a348ff..11de9909f388e 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -74,7 +74,6 @@ _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _IPU_AVAILABLE, - _NATIVE_AMP_AVAILABLE, _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE, From c4e5ee666930b2db40d1a44a95898a5fbad5e0f8 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 16:09:39 -0700 Subject: [PATCH 08/16] update tests --- tests/accelerators/test_common.py | 10 +++------- tests/overrides/test_data_parallel.py | 5 +++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py index 93564e27defa9..d40222d3a974c 100644 --- a/tests/accelerators/test_common.py +++ b/tests/accelerators/test_common.py @@ -16,6 +16,7 @@ import tests.helpers.utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.utilities.seed import seed_everything from tests.accelerators.test_dp import CustomClassificationModelDP from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule @@ -32,16 +33,11 @@ ) def test_evaluate(tmpdir, trainer_kwargs): tutils.set_random_master_port() - + seed_everything(1) dm = ClassifDataModule() model = CustomClassificationModelDP() trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=2, - limit_train_batches=10, - limit_val_batches=10, - deterministic=True, - **trainer_kwargs + default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, **trainer_kwargs ) trainer.fit(model, datamodule=dm) diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py index 5af2a164c459e..41ed25f778d97 100644 --- a/tests/overrides/test_data_parallel.py +++ b/tests/overrides/test_data_parallel.py @@ -87,7 +87,8 @@ def training_step(self, batch, batch_idx): model = TestModel() model.trainer = Mock() model.trainer.state.stage = RunningStage.TRAINING - model.trainer.accelerator_connector.deterministic = False + model.trainer.accelerator._init_deterministic(False) + batch = torch.rand(2, 32).cuda() batch_idx = 0 @@ -126,7 +127,7 @@ def training_step(self, batch, batch_idx): model = TestModel().to(device) model.trainer = Mock() model.trainer.state.stage = RunningStage.TRAINING - model.trainer.accelerator_connector.deterministic = False + model.trainer.accelerator._init_deterministic(False) batch = torch.rand(2, 32).to(device) batch_idx = 0 From 421d044ab5e34a290ead8d20d7a2190c69af9f8a Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 16:39:20 -0700 Subject: [PATCH 09/16] update tests --- tests/accelerators/test_common.py | 7 ++----- tests/overrides/test_data_parallel.py | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py index d40222d3a974c..cb1560d2af355 100644 --- a/tests/accelerators/test_common.py +++ b/tests/accelerators/test_common.py @@ -45,11 +45,8 @@ def test_evaluate(tmpdir, trainer_kwargs): old_weights = model.layer_0.weight.clone().detach().cpu() - result = trainer.validate(datamodule=dm) - assert result[0]["val_acc"] > 0.55 - - result = trainer.test(datamodule=dm) - assert result[0]["test_acc"] > 0.55 + trainer.validate(datamodule=dm) + trainer.test(datamodule=dm) # make sure weights didn't change new_weights = model.layer_0.weight.clone().detach().cpu() diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py index 41ed25f778d97..c6e575558bab3 100644 --- a/tests/overrides/test_data_parallel.py +++ b/tests/overrides/test_data_parallel.py @@ -87,7 +87,7 @@ def training_step(self, batch, batch_idx): model = TestModel() model.trainer = Mock() model.trainer.state.stage = RunningStage.TRAINING - model.trainer.accelerator._init_deterministic(False) + model.trainer.accelerator_connector._init_deterministic(False) batch = torch.rand(2, 32).cuda() batch_idx = 0 @@ -127,7 +127,7 @@ def training_step(self, batch, batch_idx): model = TestModel().to(device) model.trainer = Mock() model.trainer.state.stage = RunningStage.TRAINING - model.trainer.accelerator._init_deterministic(False) + model.trainer.accelerator_connector._init_deterministic(False) batch = torch.rand(2, 32).to(device) batch_idx = 0 From 68434f5dbe34d79bf7f9ff1c75faa67be772c44a Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 17:25:58 -0700 Subject: [PATCH 10/16] Update test_data_parallel.py --- tests/overrides/test_data_parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py index c6e575558bab3..c43d9fe322a71 100644 --- a/tests/overrides/test_data_parallel.py +++ b/tests/overrides/test_data_parallel.py @@ -85,7 +85,7 @@ def training_step(self, batch, batch_idx): return {"loss": loss} model = TestModel() - model.trainer = Mock() + model.trainer = Mock(deterministic=False) model.trainer.state.stage = RunningStage.TRAINING model.trainer.accelerator_connector._init_deterministic(False) @@ -125,7 +125,7 @@ def training_step(self, batch, batch_idx): return output model = TestModel().to(device) - model.trainer = Mock() + model.trainer = Mock(deterministic=False) model.trainer.state.stage = RunningStage.TRAINING model.trainer.accelerator_connector._init_deterministic(False) batch = torch.rand(2, 32).to(device) From 54dc9fec5ead531a69f9ac0b46e6d0edeefb0dfa Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 17:58:56 -0700 Subject: [PATCH 11/16] Update test_data_parallel.py --- tests/overrides/test_data_parallel.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py index c43d9fe322a71..2a4bd21aed87e 100644 --- a/tests/overrides/test_data_parallel.py +++ b/tests/overrides/test_data_parallel.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock from unittest.mock import MagicMock, Mock import pytest @@ -85,7 +86,7 @@ def training_step(self, batch, batch_idx): return {"loss": loss} model = TestModel() - model.trainer = Mock(deterministic=False) + model.trainer = Mock() model.trainer.state.stage = RunningStage.TRAINING model.trainer.accelerator_connector._init_deterministic(False) @@ -113,8 +114,9 @@ def test_python_scalar_to_tensor(inp, expected): @RunIf(min_gpus=1) +@mock.patch("pytorch_lightning.trainer.Trainer", autospec=True) @pytest.mark.parametrize("device", [torch.device("cpu"), torch.device("cuda", 0)]) -def test_lightning_parallel_module_python_scalar_conversion(device): +def test_lightning_parallel_module_python_scalar_conversion(mock_trainer, device): """Test that LightningParallelModule can convert Python scalars to tensors.""" class TestModel(BoringModel): @@ -125,9 +127,9 @@ def training_step(self, batch, batch_idx): return output model = TestModel().to(device) - model.trainer = Mock(deterministic=False) + model.trainer = mock_trainer.return_value model.trainer.state.stage = RunningStage.TRAINING - model.trainer.accelerator_connector._init_deterministic(False) + # model.trainer.accelerator_connector._init_deterministic(False) batch = torch.rand(2, 32).to(device) batch_idx = 0 From c6f288bcba829725255daa824329800e8f4a6a03 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 18:36:36 -0700 Subject: [PATCH 12/16] Update test_data_parallel.py --- tests/overrides/test_data_parallel.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py index 2a4bd21aed87e..c415eca674c51 100644 --- a/tests/overrides/test_data_parallel.py +++ b/tests/overrides/test_data_parallel.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from unittest import mock from unittest.mock import MagicMock, Mock import pytest @@ -86,10 +85,11 @@ def training_step(self, batch, batch_idx): return {"loss": loss} model = TestModel() - model.trainer = Mock() - model.trainer.state.stage = RunningStage.TRAINING - model.trainer.accelerator_connector._init_deterministic(False) + trainer = MagicMock() + trainer.state.stage = RunningStage.TRAINING + trainer.accelerator_connector._init_deterministic(False) + model.trainer = trainer batch = torch.rand(2, 32).cuda() batch_idx = 0 @@ -114,9 +114,8 @@ def test_python_scalar_to_tensor(inp, expected): @RunIf(min_gpus=1) -@mock.patch("pytorch_lightning.trainer.Trainer", autospec=True) @pytest.mark.parametrize("device", [torch.device("cpu"), torch.device("cuda", 0)]) -def test_lightning_parallel_module_python_scalar_conversion(mock_trainer, device): +def test_lightning_parallel_module_python_scalar_conversion(device): """Test that LightningParallelModule can convert Python scalars to tensors.""" class TestModel(BoringModel): @@ -127,9 +126,9 @@ def training_step(self, batch, batch_idx): return output model = TestModel().to(device) - model.trainer = mock_trainer.return_value - model.trainer.state.stage = RunningStage.TRAINING - # model.trainer.accelerator_connector._init_deterministic(False) + trainer = MagicMock() + trainer.state.stage = RunningStage.TRAINING + trainer.accelerator_connector._init_deterministic(False) batch = torch.rand(2, 32).to(device) batch_idx = 0 From bb8bdc09496299d65ca942f73bbb951c1fa4339a Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 18:41:12 -0700 Subject: [PATCH 13/16] Update test_data_parallel.py --- tests/overrides/test_data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py index c415eca674c51..46cdcc7cf7e23 100644 --- a/tests/overrides/test_data_parallel.py +++ b/tests/overrides/test_data_parallel.py @@ -129,6 +129,7 @@ def training_step(self, batch, batch_idx): trainer = MagicMock() trainer.state.stage = RunningStage.TRAINING trainer.accelerator_connector._init_deterministic(False) + model.trainer = trainer batch = torch.rand(2, 32).to(device) batch_idx = 0 From 1986bb9b05117bd3b123f39bdce2c879eb847f8f Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 19:05:24 -0700 Subject: [PATCH 14/16] Update conftest.py --- tests/conftest.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 36110e6c57c37..d4596008f65b4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,6 +22,7 @@ import torch.distributed from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8 from tests import _PATH_DATASETS @@ -87,6 +88,18 @@ def teardown_process_group(): torch.distributed.destroy_process_group() +@pytest.fixture(scope="function", autouse=True) +def reset_deterministic_algorithm(): + """Ensures that torch determinism settings are reset before the next test runs.""" + yield + if _TORCH_GREATER_EQUAL_1_8: + torch.use_deterministic_algorithms(False) + elif _TORCH_GREATER_EQUAL_1_7: + torch.set_deterministic(False) + else: # the minimum version Lightning supports is PyTorch 1.6 + torch._set_deterministic(False) + + @pytest.fixture def tmpdir_server(tmpdir): if sys.version_info >= (3, 7): From fa269b3f731e1279bb0c20cb4696b6e8f2d017b7 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 20:28:17 -0700 Subject: [PATCH 15/16] Update accelerator_connector.py --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 11de9909f388e..dd411da7bc995 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -195,6 +195,8 @@ def _init_deterministic(self, deterministic: bool) -> None: # fixing non-deterministic part of horovod # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0) + # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" def select_accelerator_type(self) -> None: if self.distributed_backend == "auto": From 9ab9bbd76fb57fd79adcda5586a56b8182ae9b59 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 29 Sep 2021 21:05:26 -0700 Subject: [PATCH 16/16] Update conftest.py --- tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/conftest.py b/tests/conftest.py index d4596008f65b4..860f9357e4636 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,6 +53,7 @@ def restore_env_variables(): os.environ.update(env_backup) # these are currently known leakers - ideally these would not be allowed allowlist = { + "CUBLAS_WORKSPACE_CONFIG", # enabled with deterministic flag "CUDA_DEVICE_ORDER", "LOCAL_RANK", "NODE_RANK",