From 1ffc159b69e6812ceb6a4e10f6d6bc9d12239af6 Mon Sep 17 00:00:00 2001
From: s-rog
Date: Tue, 6 Apr 2021 09:26:40 +0800
Subject: [PATCH 01/11] init test

---
 tests/trainer/test_data_loading.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index ec7f020faa4c3..3599723f954b2 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -102,3 +102,14 @@ def check_replace_distrubuted_sampler(tmpdir, save_preds_on_dl_idx, accelerator,
 @pytest.mark.parametrize("mode", [1, 2])
 def test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler(tmpdir, mode):
     check_replace_distrubuted_sampler(tmpdir, True, "ddp", 2, 2, mode)
+
+@RunIf(min_gpus=2, special=True)
+def test_dataloader_warnings():
+    trainer = Trainer(accelerator="ddp_spawn")
+    dl = DataLoader(RandomDataset(32, 64), num_workers=1)
+    if hasattr(dl, "persistent_workers"):
+        warn_str = "Consider setting persistent_workers=True"
+    else:
+        warn_str = "Consider setting accelerator=ddp"
+    with pytest.warns(UserWarning, match=warn_str):
+        trainer.test(BoringModel(), test_dataloaders=dl)

From 8bc4683c45e7a573bb074ebc5bb0739a5b7e792d Mon Sep 17 00:00:00 2001
From: Roger Shieh
Date: Wed, 31 Mar 2021 13:50:57 +0800
Subject: [PATCH 02/11] Update data_loading.py

---
 pytorch_lightning/trainer/data_loading.py | 33 ++++++++++++++++-------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 59ec40c3df2e8..98e0584601564 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -61,18 +61,31 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None:
         using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn"
         if is_dataloader and not on_windows:
             if dataloader.num_workers > 0 and using_spawn:
-                rank_zero_warn(
-                    'Dataloader(num_workers>0) and ddp_spawn do not mix well!'
-                    ' Your performance might suffer dramatically.'
-                    ' Please consider setting accelerator=ddp to use num_workers > 0'
-                    ' (this is a bottleneck of Python .spawn() and PyTorch'
-                )
+                if hasattr(dataloader, "persistent_workers"):
+                    if not dataloader.persistent_workers:
+                        rank_zero_warn(
+                            'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn may result in data loading bottlenecks.'
+                            ' Consider setting persistent_workers=True (this is a limitation of Python .spawn() and PyTorch)'
+                        )
+                else:
+                    rank_zero_warn(
+                        'num_workers>0 and accelerator=ddp_spawn do not mix well and may result in data loading bottlenecks.'
+                        ' Consider setting accelerator=ddp to use num_workers>0 (this is a limitation of Python .spawn() and PyTorch)'
+                    )
             elif dataloader.num_workers == 0 and using_spawn:
-                rank_zero_warn(
-                    'You are using `accelerator=ddp_spawn` with num_workers=0.'
-                    ' For much faster performance, switch to `accelerator=ddp` and set `num_workers>0`'
-                )
+                # checks for the attr persistent_workers not available on pytorch < 1.7
+                if hasattr(dataloader, "persistent_workers"):
+                    if not dataloader.persistent_workers:
+                        rank_zero_warn(
+                            'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
+                            ' Consider setting num_workers>0 and persistent_workers=True'
+                        )
+                else:
+                    rank_zero_warn(
+                        'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
+                        ' Consider setting accelerator=ddp and set num_workers>0'
+                    )
             elif dataloader.num_workers <= 2 and multiprocessing.cpu_count() > 2 and not using_spawn:
                 num_cpus = multiprocessing.cpu_count()

From 97411f2202104614336b71c397e706cad7328d44 Mon Sep 17 00:00:00 2001
From: Roger Shieh
Date: Wed, 31 Mar 2021 13:55:26 +0800
Subject: [PATCH 03/11] fmt

---
 pytorch_lightning/trainer/data_loading.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 98e0584601564..0480c8023c3f8 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -61,20 +61,25 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None:
         using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn"
         if is_dataloader and not on_windows:
             if dataloader.num_workers > 0 and using_spawn:
+                # checks for the attr persistent_workers available in pytorch >= 1.7
                 if hasattr(dataloader, "persistent_workers"):
                     if not dataloader.persistent_workers:
                         rank_zero_warn(
-                            'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn may result in data loading bottlenecks.'
-                            ' Consider setting persistent_workers=True (this is a limitation of Python .spawn() and PyTorch)'
+                            'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn'
+                            ' may result in data loading bottlenecks.'
+                            ' Consider setting persistent_workers=True'
+                            ' (this is a limitation of Python .spawn() and PyTorch)'
                         )
                 else:
                     rank_zero_warn(
-                        'num_workers>0 and accelerator=ddp_spawn do not mix well and may result in data loading bottlenecks.'
-                        ' Consider setting accelerator=ddp to use num_workers>0 (this is a limitation of Python .spawn() and PyTorch)'
+                        'num_workers>0 and accelerator=ddp_spawn do not mix well'
+                        ' and may result in data loading bottlenecks.'
+                        ' Consider setting accelerator=ddp to use num_workers>0'
+                        ' (this is a limitation of Python .spawn() and PyTorch)'
                     )
             elif dataloader.num_workers == 0 and using_spawn:
-                # checks for the attr persistent_workers not available on pytorch < 1.7
+                # checks for the attr persistent_workers available in pytorch >= 1.7
                 if hasattr(dataloader, "persistent_workers"):
                     if not dataloader.persistent_workers:
                         rank_zero_warn(

From e1b1702fdff3828d457a97dcf178ae831ee1f1ff Mon Sep 17 00:00:00 2001
From: s-rog
Date: Tue, 6 Apr 2021 09:38:09 +0800
Subject: [PATCH 04/11] pep8

---
 tests/trainer/test_data_loading.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index 3599723f954b2..d42e1bb62128e 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -103,6 +103,7 @@ def check_replace_distrubuted_sampler(tmpdir, save_preds_on_dl_idx, accelerator,
 def test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler(tmpdir, mode):
     check_replace_distrubuted_sampler(tmpdir, True, "ddp", 2, 2, mode)
 
+
 @RunIf(min_gpus=2, special=True)
 def test_dataloader_warnings():
     trainer = Trainer(accelerator="ddp_spawn")

From a52e500d40a32c549a2147668affb8a48cec584d Mon Sep 17 00:00:00 2001
From: Roger Shieh
Date: Tue, 6 Apr 2021 10:39:53 +0800
Subject: [PATCH 05/11] remove special

---
 tests/trainer/test_data_loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index d42e1bb62128e..6448ac3eea2b7 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -104,7 +104,7 @@ def test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler(tmpd
     check_replace_distrubuted_sampler(tmpdir, True, "ddp", 2, 2, mode)
 
 
-@RunIf(min_gpus=2, special=True)
+@RunIf(min_gpus=2)
 def test_dataloader_warnings():
     trainer = Trainer(accelerator="ddp_spawn")
     dl = DataLoader(RandomDataset(32, 64), num_workers=1)

From c710c9aca08a25656b90601b2e670d7df80740fc Mon Sep 17 00:00:00 2001
From: s-rog
Date: Tue, 6 Apr 2021 11:13:06 +0800
Subject: [PATCH 06/11] Update chlog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bab4034d80f6c..3f2dbaee9b754 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -102,6 +102,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Changed `PyTorchProfiler` to use `torch.autograd.profiler.record_function` to record functions ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349))
 
 
+- Changed warnings and recommendations for dataloaders in `ddp_spawn` ([#6762](https://github.com/PyTorchLightning/pytorch-lightning/pull/6762/))
+
+
 ### Deprecated
 
 - `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146))

From aa96f06cb49c3c59e7f17d44c3349ed10569cdfa Mon Sep 17 00:00:00 2001
From: Roger Shieh
Date: Thu, 8 Apr 2021 09:31:07 +0800
Subject: [PATCH 07/11] mock

---
 tests/trainer/test_data_loading.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index 6448ac3eea2b7..8e260077b1e13 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -15,6 +15,7 @@
 import pytest
 from torch.utils.data import DataLoader
 from torch.utils.data.sampler import BatchSampler, SequentialSampler
+from unittest import mock
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -104,7 +105,9 @@ def test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler(tmpd
     check_replace_distrubuted_sampler(tmpdir, True, "ddp", 2, 2, mode)
 
 
-@RunIf(min_gpus=2)
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
+@mock.patch('torch.cuda.device_count', return_value=2)
+@mock.patch('torch.cuda.is_available', return_value=True)
 def test_dataloader_warnings():
     trainer = Trainer(accelerator="ddp_spawn")
     dl = DataLoader(RandomDataset(32, 64), num_workers=1)
@@ -112,5 +115,6 @@ def test_dataloader_warnings():
         warn_str = "Consider setting persistent_workers=True"
     else:
         warn_str = "Consider setting accelerator=ddp"
-    with pytest.warns(UserWarning, match=warn_str):
-        trainer.test(BoringModel(), test_dataloaders=dl)
+    with pytest.raises(RuntimeError):
+        with pytest.warns(UserWarning, match=warn_str):
+            trainer.test(BoringModel(), test_dataloaders=dl)

From 80254dbd4d2ef53c5fcb8a25e460503f05ce82e8 Mon Sep 17 00:00:00 2001
From: Roger Shieh
Date: Thu, 8 Apr 2021 09:33:07 +0800
Subject: [PATCH 08/11] add missing import

---
 tests/trainer/test_data_loading.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index 8e260077b1e13..c44d208ef0417 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 import pytest
 from torch.utils.data import DataLoader
 from torch.utils.data.sampler import BatchSampler, SequentialSampler
 from unittest import mock

From 2aaef2f949cc041793d5eb4038fd6ca02726e487 Mon Sep 17 00:00:00 2001
From: Roger Shieh
Date: Thu, 8 Apr 2021 10:55:15 +0800
Subject: [PATCH 09/11] missing args

---
 tests/trainer/test_data_loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index c44d208ef0417..88a62e2aa5cf5 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -109,7 +109,7 @@ def test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler(tmpd
 @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
 @mock.patch('torch.cuda.device_count', return_value=2)
 @mock.patch('torch.cuda.is_available', return_value=True)
-def test_dataloader_warnings():
+def test_dataloader_warnings(cuda_available_mock, device_count_mock):
     trainer = Trainer(accelerator="ddp_spawn")
     dl = DataLoader(RandomDataset(32, 64), num_workers=1)
     if hasattr(dl, "persistent_workers"):

From 511f962fb85d2a28f07471cb073af1909f0b5078 Mon Sep 17 00:00:00 2001
From: Roger Shieh
Date: Thu, 8 Apr 2021 11:49:01 +0800
Subject: [PATCH 10/11] test

---
 tests/trainer/test_data_loading.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index 88a62e2aa5cf5..12e3fc6e2572f 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -116,6 +116,5 @@ def test_dataloader_warnings(cuda_available_mock, device_count_mock):
         warn_str = "Consider setting persistent_workers=True"
     else:
         warn_str = "Consider setting accelerator=ddp"
-    with pytest.raises(RuntimeError):
-        with pytest.warns(UserWarning, match=warn_str):
-            trainer.test(BoringModel(), test_dataloaders=dl)
+    with pytest.warns(UserWarning, match=warn_str):
+        trainer.test(BoringModel(), test_dataloaders=dl)

From 90f29fe591b0c128568f1d306c7ad1f935143109 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 9 Apr 2021 02:50:12 +0200
Subject: [PATCH 11/11] Do not require mock. Add num workers parametrize

---
 tests/trainer/test_data_loading.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index 12e3fc6e2572f..382311c107958 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 import pytest
 from torch.utils.data import DataLoader
 from torch.utils.data.sampler import BatchSampler, SequentialSampler
-from unittest import mock
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -106,15 +104,23 @@ def test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler(tmpd
     check_replace_distrubuted_sampler(tmpdir, True, "ddp", 2, 2, mode)
 
 
-@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
-@mock.patch('torch.cuda.device_count', return_value=2)
-@mock.patch('torch.cuda.is_available', return_value=True)
-def test_dataloader_warnings(cuda_available_mock, device_count_mock):
-    trainer = Trainer(accelerator="ddp_spawn")
-    dl = DataLoader(RandomDataset(32, 64), num_workers=1)
+@pytest.mark.parametrize("num_workers", [0, 1])
+def test_dataloader_warnings(num_workers):
+
+    class TestModel(BoringModel):
+
+        def on_train_start(self, *_) -> None:
+            raise SystemExit()
+
+    dl = DataLoader(RandomDataset(32, 64), num_workers=num_workers)
     if hasattr(dl, "persistent_workers"):
-        warn_str = "Consider setting persistent_workers=True"
+        if num_workers == 0:
+            warn_str = "Consider setting num_workers>0 and persistent_workers=True"
+        else:
+            warn_str = "Consider setting persistent_workers=True"
     else:
         warn_str = "Consider setting accelerator=ddp"
-    with pytest.warns(UserWarning, match=warn_str):
-        trainer.test(BoringModel(), test_dataloaders=dl)
+
+    trainer = Trainer(accelerator="ddp_spawn")
+    with pytest.warns(UserWarning, match=warn_str), pytest.raises(SystemExit):
+        trainer.fit(TestModel(), dl)
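
Usage note for reviewers: the configuration the reworked warning points users toward corresponds roughly to the sketch below. This is not taken from the patches above; `RandomDataset` and `ToyModel` are illustrative stand-ins for the test helpers, and `persistent_workers` assumes PyTorch >= 1.7.

import torch
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl


class RandomDataset(Dataset):
    # small stand-in dataset, mirroring the RandomDataset helper used in the tests above
    def __init__(self, size, length):
        self.data = torch.randn(length, size)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


class ToyModel(pl.LightningModule):
    # minimal LightningModule, similar in spirit to the BoringModel test helper
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


# Re-creating DataLoader workers at every epoch is what makes num_workers>0 slow under the
# spawn start method; persistent_workers=True (PyTorch >= 1.7) keeps the workers alive instead.
train_loader = DataLoader(RandomDataset(32, 64), batch_size=8, num_workers=4, persistent_workers=True)

trainer = pl.Trainer(gpus=2, accelerator="ddp_spawn", max_epochs=2)
trainer.fit(ToyModel(), train_loader)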