From 380133355663e4cfec9e0ec0eecadeef81363d85 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 15 Sep 2022 00:00:30 +0200 Subject: [PATCH 1/9] forking advice --- .../strategies/launchers/multiprocessing.py | 24 ++++++++++++++++++- .../strategies/launchers/multiprocessing.py | 4 ++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/lightning_lite/strategies/launchers/multiprocessing.py b/src/lightning_lite/strategies/launchers/multiprocessing.py index ca47efe030302..fe5213c64afbf 100644 --- a/src/lightning_lite/strategies/launchers/multiprocessing.py +++ b/src/lightning_lite/strategies/launchers/multiprocessing.py @@ -24,7 +24,7 @@ from lightning_lite.strategies.launchers.base import _Launcher from lightning_lite.strategies.strategy import Strategy from lightning_lite.utilities.apply_func import move_data_to_device -from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_11 +from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _IS_INTERACTIVE from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states @@ -86,6 +86,9 @@ def launch(self, function: Callable, *args: Any, **kwargs: Any) -> Any: *args: Optional positional arguments to be passed to the given function. **kwargs: Optional keyword arguments to be passed to the given function. """ + if self._start_method in ("fork", "forkserver"): + _check_bad_cuda_fork() + # The default cluster environment in Lightning chooses a random free port number # This needs to be done in the main process here before starting processes to ensure each rank will connect # through the same port @@ -176,3 +179,22 @@ def restore(self) -> None: def _is_forking_disabled() -> bool: """Returns whether forking is disabled through the environment variable ``PL_DISABLE_FORK``.""" return bool(int(os.environ.get("PL_DISABLE_FORK", "0"))) + + +def _check_bad_cuda_fork() -> None: + """Checks whether it is safe to fork and initialize CUDA in the new processes, and raises an exception if not. + + The error message replaces PyTorch's 'Cannot re-initialize CUDA in forked subprocess' with helpful advice for + Lightning users. + """ + if not torch.cuda.is_initialized(): + return + + message = ( + "Lightning can't create new processes if CUDA is already initialized. Did you manually call" + " `torch.cuda.*` functions, have moved the model to the device or allocated memory on the GPU any" + " other way? Please remove any such calls, or change the selected strategy." + ) + if _IS_INTERACTIVE: + message += " You will have to restart the Python session; in a notebook, that means restart the kernel." + raise RuntimeError(message) diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index be6a56b2e35dc..3429e1c9075d0 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -27,6 +27,7 @@ import pytorch_lightning as pl from lightning_lite.strategies.launchers.base import _Launcher +from lightning_lite.strategies.launchers.multiprocessing import _check_bad_cuda_fork from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states from lightning_lite.utilities.types import _PATH @@ -94,6 +95,9 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] **kwargs: Optional keyword arguments to be passed to the given function. """ self._check_torchdistx_support() + if self._start_method in ("fork", "forkserver"): + _check_bad_cuda_fork() + # The default cluster environment in Lightning chooses a random free port number # This needs to be done in the main process here before starting processes to ensure each rank will connect # through the same port From 8d7d33a6f85ca10c9af49d6812045634b09eeed7 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 15 Sep 2022 00:09:18 +0200 Subject: [PATCH 2/9] add test --- .../strategies/launchers/test_multiprocessing.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/tests_lite/strategies/launchers/test_multiprocessing.py b/tests/tests_lite/strategies/launchers/test_multiprocessing.py index 70b45763fe2df..a5b502f148c0b 100644 --- a/tests/tests_lite/strategies/launchers/test_multiprocessing.py +++ b/tests/tests_lite/strategies/launchers/test_multiprocessing.py @@ -93,3 +93,13 @@ def test_global_state_snapshot(): assert torch.are_deterministic_algorithms_enabled() assert not torch.backends.cudnn.benchmark assert torch.initial_seed() == 123 + + +@pytest.mark.parametrize("start_method", ["fork", "forkserver"]) +@mock.patch("torch.cuda.is_initialized", return_value=True) +@mock.patch("lightning_lite.strategies.launchers.multiprocessing.mp") +def test_multiprocessing_launcher_check_for_bad_cuda_fork(mp_mock, _, start_method): + mp_mock.get_all_start_methods.return_value = [start_method] + launcher = _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) + with pytest.raises(RuntimeError, match="Lightning can't create new processes if CUDA is already initialized"): + launcher.launch(function=Mock()) From e26290c95360a6ea34604439a9756eeaf829ac4f Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 15 Sep 2022 00:10:39 +0200 Subject: [PATCH 3/9] add changelog --- src/pytorch_lightning/CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 128a7b7ee0d6d..7340a7ef88a8b 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -40,6 +40,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `WandbLogger.download_artifact` and `WandbLogger.use_artifact` for managing artifacts with Weights and Biases ([#14551](https://github.com/Lightning-AI/lightning/issues/14551)) +- Added a friendlier error message when attempting to fork processes with pre-initialized CUDA context ([#XXXX](https://github.com/Lightning-AI/lightning/issues/XXXX)) + + + ### Changed - The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892)) From 59d2ae0ab1e3d6dd3bca339b55911d03c6965183 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Sep 2022 22:19:41 +0000 Subject: [PATCH 4/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_lite/strategies/launchers/multiprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_lite/strategies/launchers/multiprocessing.py b/src/lightning_lite/strategies/launchers/multiprocessing.py index fe5213c64afbf..8569cc5a16204 100644 --- a/src/lightning_lite/strategies/launchers/multiprocessing.py +++ b/src/lightning_lite/strategies/launchers/multiprocessing.py @@ -24,7 +24,7 @@ from lightning_lite.strategies.launchers.base import _Launcher from lightning_lite.strategies.strategy import Strategy from lightning_lite.utilities.apply_func import move_data_to_device -from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _IS_INTERACTIVE +from lightning_lite.utilities.imports import _IS_INTERACTIVE, _TORCH_GREATER_EQUAL_1_11 from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states From 94ded0ee3528a3a38741d87fe417d2a1159b0a2a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 15 Sep 2022 00:21:24 +0200 Subject: [PATCH 5/9] chlog --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 7340a7ef88a8b..9fdfe6b5a362e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -40,7 +40,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `WandbLogger.download_artifact` and `WandbLogger.use_artifact` for managing artifacts with Weights and Biases ([#14551](https://github.com/Lightning-AI/lightning/issues/14551)) -- Added a friendlier error message when attempting to fork processes with pre-initialized CUDA context ([#XXXX](https://github.com/Lightning-AI/lightning/issues/XXXX)) +- Added a friendlier error message when attempting to fork processes with pre-initialized CUDA context ([#14709](https://github.com/Lightning-AI/lightning/issues/14709)) From a91b87ec6386de2269d25c6cfa56820c749aa2fe Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 26 Sep 2022 21:14:33 +0200 Subject: [PATCH 6/9] format --- src/pytorch_lightning/strategies/launchers/multiprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index e013918a5b854..de41b8ff2b455 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -27,8 +27,8 @@ import pytorch_lightning as pl from lightning_lite.strategies.launchers.base import _Launcher -from lightning_lite.utilities import move_data_to_device from lightning_lite.strategies.launchers.multiprocessing import _check_bad_cuda_fork +from lightning_lite.utilities import move_data_to_device from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states from lightning_lite.utilities.types import _PATH from pytorch_lightning.trainer.states import TrainerFn, TrainerState From 15b4f4051db814f24ef3548f9e60faa26c524c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 26 Sep 2022 15:14:52 -0400 Subject: [PATCH 7/9] Update src/pytorch_lightning/CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 7bd29a46f5444..a246f3f4eb242 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -47,7 +47,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -- Added a friendlier error message when attempting to fork processes with pre-initialized CUDA context ([#14709](https://github.com/Lightning-AI/lightning/issues/14709)) +- Added a more descriptive error message when attempting to fork processes with pre-initialized CUDA context ([#14709](https://github.com/Lightning-AI/lightning/issues/14709)) From c09931e0b2fed7a25225241b0c5432405b5ff164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 26 Sep 2022 15:15:06 -0400 Subject: [PATCH 8/9] Update src/lightning_lite/strategies/launchers/multiprocessing.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/lightning_lite/strategies/launchers/multiprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_lite/strategies/launchers/multiprocessing.py b/src/lightning_lite/strategies/launchers/multiprocessing.py index f80466cc12af8..fc0e42a811192 100644 --- a/src/lightning_lite/strategies/launchers/multiprocessing.py +++ b/src/lightning_lite/strategies/launchers/multiprocessing.py @@ -182,7 +182,7 @@ def _check_bad_cuda_fork() -> None: message = ( "Lightning can't create new processes if CUDA is already initialized. Did you manually call" - " `torch.cuda.*` functions, have moved the model to the device or allocated memory on the GPU any" + " `torch.cuda.*` functions, have moved the model to the device, or allocated memory on the GPU any" " other way? Please remove any such calls, or change the selected strategy." ) if _IS_INTERACTIVE: From 10fdd618a286ae26152fdb92e704809360985b01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 26 Sep 2022 15:16:17 -0400 Subject: [PATCH 9/9] Update src/lightning_lite/strategies/launchers/multiprocessing.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/lightning_lite/strategies/launchers/multiprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_lite/strategies/launchers/multiprocessing.py b/src/lightning_lite/strategies/launchers/multiprocessing.py index fc0e42a811192..7fb161b2ed259 100644 --- a/src/lightning_lite/strategies/launchers/multiprocessing.py +++ b/src/lightning_lite/strategies/launchers/multiprocessing.py @@ -186,5 +186,5 @@ def _check_bad_cuda_fork() -> None: " other way? Please remove any such calls, or change the selected strategy." ) if _IS_INTERACTIVE: - message += " You will have to restart the Python session; in a notebook, that means restart the kernel." + message += " You will have to restart the Python kernel." raise RuntimeError(message)