3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -101,6 +101,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `_Stateful` support for `LightningDataModule` ([#11637](https://github.com/PyTorchLightning/pytorch-lightning/pull/11637))


- Added `Accelerator.is_available` to check device availability ([#11797](https://github.com/PyTorchLightning/pytorch-lightning/pull/11797))


### Changed

- Implemented a new native and rich format in `_print_results` method of the `EvaluationLoop` ([#11332](https://github.com/PyTorchLightning/pytorch-lightning/pull/11332))
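For context, a minimal usage sketch of the API this changelog entry describes. The import path matches the files changed below; the selection logic itself is only an illustration, not part of the PR:

```python
# Minimal sketch: query device availability without instantiating an accelerator.
from pytorch_lightning.accelerators import CPUAccelerator, GPUAccelerator

# `is_available` is a @staticmethod, so it can be called on the class itself,
# e.g. to decide which device to request before building a Trainer.
accelerator = "gpu" if GPUAccelerator.is_available() else "cpu"
assert CPUAccelerator.is_available()  # the CPU accelerator always reports True
print(f"Selected accelerator: {accelerator}")
```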
5 changes: 5 additions & 0 deletions pytorch_lightning/accelerators/accelerator.py
@@ -59,3 +59,8 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
@abstractmethod
def auto_device_count() -> int:
"""Get the device count when set to auto."""

@staticmethod
@abstractmethod
def is_available() -> bool:
"""Detect if the hardware is available."""
6 changes: 6 additions & 0 deletions pytorch_lightning/accelerators/cpu.py
@@ -31,6 +31,7 @@ def setup_environment(self, root_device: torch.device) -> None:
MisconfigurationException:
If the selected device is not CPU.
"""
super().setup_environment(root_device)
if root_device.type != "cpu":
raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.")

@@ -42,3 +43,8 @@ def get_device_stats(self, device: _DEVICE) -> dict[str, Any]:
def auto_device_count() -> int:
"""Get the devices when set to auto."""
return 1

@staticmethod
def is_available() -> bool:
"""CPU is always available for execution."""
return True
5 changes: 5 additions & 0 deletions pytorch_lightning/accelerators/gpu.py
@@ -39,6 +39,7 @@ def setup_environment(self, root_device: torch.device) -> None:
MisconfigurationException:
If the selected device is not GPU.
"""
super().setup_environment(root_device)
if root_device.type != "cuda":
raise MisconfigurationException(f"Device should be GPU, got {root_device} instead")
torch.cuda.set_device(root_device)
@@ -79,6 +80,10 @@ def auto_device_count() -> int:
"""Get the devices when set to auto."""
return torch.cuda.device_count()

@staticmethod
def is_available() -> bool:
return torch.cuda.device_count() > 0


def get_nvidia_gpu_stats(device: _DEVICE) -> dict[str, float]:
"""Get GPU stats including memory, fan speed, and temperature from nvidia-smi.
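One design detail worth noting here: `GPUAccelerator.is_available()` checks `torch.cuda.device_count() > 0` rather than `torch.cuda.is_available()`, which is why the test diffs below patch both functions when simulating a GPU machine. A hedged illustration of the distinction:

```python
import torch

from pytorch_lightning.accelerators import GPUAccelerator

# On a CPU-only box all three report "no GPU". In the unit tests below,
# patching torch.cuda.device_count (plus torch.cuda.is_available for the rest
# of the Trainer machinery) is what makes the accelerator look available.
print(torch.cuda.is_available())      # driver/runtime-level check
print(torch.cuda.device_count())      # number of visible CUDA devices
print(GPUAccelerator.is_available())  # device_count() > 0, per this PR
```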
5 changes: 5 additions & 0 deletions pytorch_lightning/accelerators/ipu.py
@@ -16,6 +16,7 @@
import torch

from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.utilities import _IPU_AVAILABLE


class IPUAccelerator(Accelerator):
@@ -31,3 +32,7 @@ def auto_device_count() -> int:
# TODO (@kaushikb11): 4 is the minimal unit they are shipped in.
# Update this when api is exposed by the Graphcore team.
return 4

@staticmethod
def is_available() -> bool:
return _IPU_AVAILABLE
6 changes: 5 additions & 1 deletion pytorch_lightning/accelerators/tpu.py
@@ -16,7 +16,7 @@
import torch

from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.utilities import _XLA_AVAILABLE
from pytorch_lightning.utilities.imports import _TPU_AVAILABLE, _XLA_AVAILABLE

if _XLA_AVAILABLE:
import torch_xla.core.xla_model as xm
@@ -47,3 +47,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
def auto_device_count() -> int:
"""Get the devices when set to auto."""
return 8

@staticmethod
def is_available() -> bool:
return _TPU_AVAILABLE
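Both the IPU and TPU accelerators delegate to existing availability flags (`_IPU_AVAILABLE`, `_TPU_AVAILABLE`) instead of probing hardware directly. A hedged sketch of the resulting behaviour, under the assumption that these flags are module-level booleans evaluated when `pytorch_lightning.utilities.imports` is imported:

```python
# Sketch only: the flags reflect whether the supporting libraries / devices
# were detected at import time, so is_available() is a cheap lookup.
from pytorch_lightning.accelerators import IPUAccelerator, TPUAccelerator

print(IPUAccelerator.is_available())  # mirrors _IPU_AVAILABLE
print(TPUAccelerator.is_available())  # mirrors _TPU_AVAILABLE
```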
17 changes: 16 additions & 1 deletion tests/accelerators/test_accelerator_connector.py
@@ -98,6 +98,7 @@ def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock):
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp_slurm(*_):
with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2)
@@ -123,6 +124,7 @@ def test_accelerator_choice_ddp_slurm(*_):
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp2_slurm(*_):
with pytest.deprecated_call(match=r"accelerator='ddp2'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2)
@@ -148,6 +150,7 @@ def test_accelerator_choice_ddp2_slurm(*_):
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp_te(*_):
with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2)
@@ -172,6 +175,7 @@ def test_accelerator_choice_ddp_te(*_):
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp2_te(*_):
with pytest.deprecated_call(match=r"accelerator='ddp2'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2)
@@ -210,6 +214,7 @@ def test_accelerator_choice_ddp_cpu_te(*_):
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp_kubeflow(*_):
with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1)
@@ -340,6 +345,10 @@ class Accel(Accelerator):
def auto_device_count() -> int:
return 1

@staticmethod
def is_available() -> bool:
return True

class Prec(PrecisionPlugin):
pass

@@ -735,8 +744,11 @@ def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy):
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
@pytest.mark.parametrize("strategy", ["ddp2", DDP2Strategy()])
def test_strategy_choice_ddp2_slurm(set_device_mock, device_count_mock, setup_distributed_mock, strategy):
def test_strategy_choice_ddp2_slurm(
set_device_mock, device_count_mock, setup_distributed_mock, is_available_mock, strategy
):
trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2)
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, GPUAccelerator)
@@ -760,6 +772,7 @@ def test_strategy_choice_ddp2_slurm(set_device_mock, device_count_mock, setup_di
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_strategy_choice_ddp_te(*_):
trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2)
assert isinstance(trainer.accelerator, GPUAccelerator)
@@ -783,6 +796,7 @@ def test_strategy_choice_ddp_te(*_):
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_strategy_choice_ddp2_te(*_):
trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2)
assert isinstance(trainer.accelerator, GPUAccelerator)
@@ -820,6 +834,7 @@ def test_strategy_choice_ddp_cpu_te(*_):
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_strategy_choice_ddp_kubeflow(*_):
trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1)
assert isinstance(trainer.accelerator, GPUAccelerator)
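The recurring change in this test module is an extra `torch.cuda.is_available` patch alongside the existing `device_count` patch. A condensed, hedged sketch of the pattern (the test name and patched values are illustrative only):

```python
from unittest import mock

from pytorch_lightning.accelerators import GPUAccelerator


# mock.patch decorators are applied bottom-up, so the bottom-most patch maps to
# the first positional argument of the test function.
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_gpu_looks_available_on_cpu_ci(mock_is_available, mock_device_count):
    assert GPUAccelerator.is_available()  # True: device_count() is patched to 2
```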
4 changes: 4 additions & 0 deletions tests/accelerators/test_cpu.py
@@ -22,6 +22,10 @@ def test_restore_checkpoint_after_pre_setup_default():
assert not plugin.restore_checkpoint_after_setup


def test_availability():
assert CPUAccelerator.is_available()


@pytest.mark.parametrize("restore_after_pre_setup", [True, False])
def test_restore_checkpoint_after_pre_setup(tmpdir, restore_after_pre_setup):
"""Test to ensure that if restore_checkpoint_after_setup is True, then we only load the state after pre-
10 changes: 7 additions & 3 deletions tests/accelerators/test_ddp.py
@@ -79,6 +79,7 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module):

@RunIf(skip_windows=True)
@pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't require GPU machine")
@mock.patch("torch.cuda.is_available", return_value=True)
def test_torch_distributed_backend_env_variables(tmpdir):
"""This test set `undefined` as torch backend and should raise an `Backend.UNDEFINED` ValueError."""
_environ = {"PL_TORCH_DISTRIBUTED_BACKEND": "undefined", "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"}
@@ -90,11 +91,14 @@ def test_torch_distributed_backend_env_variables(tmpdir):


@RunIf(skip_windows=True)
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("pytorch_lightning.accelerators.gpu.GPUAccelerator.is_available", return_value=True)
@mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": "gloo"}, clear=True)
def test_ddp_torch_dist_is_available_in_setup(mock_set_device, mock_is_available, mock_device_count, tmpdir):
def test_ddp_torch_dist_is_available_in_setup(
mock_gpu_is_available, mock_device_count, mock_cuda_available, mock_set_device, tmpdir
):
"""Test to ensure torch distributed is available within the setup hook using ddp."""

class TestModel(BoringModel):
7 changes: 5 additions & 2 deletions tests/accelerators/test_dp.py
@@ -11,6 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import mock

import pytest
import torch
import torch.nn.functional as F
@@ -154,9 +156,10 @@ def _assert_extra_outputs(self, outputs):
assert out.dtype is torch.float


def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir, monkeypatch):
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_dp_raise_exception_with_batch_transfer_hooks(mock_is_available, mock_device_count, tmpdir):
"""Test that an exception is raised when overriding batch_transfer_hooks in DP model."""
monkeypatch.setattr("torch.cuda.device_count", lambda: 2)

class CustomModel(BoringModel):
def transfer_batch_to_device(self, batch, device, dataloader_idx):
5 changes: 5 additions & 0 deletions tests/accelerators/test_gpu.py
@@ -60,3 +60,8 @@ def test_set_cuda_device(set_device_mock, tmpdir):
)
trainer.fit(model)
set_device_mock.assert_called_once()


@RunIf(min_gpus=1)
def test_gpu_availability():
assert GPUAccelerator.is_available()
1 change: 1 addition & 0 deletions tests/accelerators/test_ipu.py
@@ -106,6 +106,7 @@ def test_fail_if_no_ipus(tmpdir):

@RunIf(ipu=True)
def test_accelerator_selected(tmpdir):
assert IPUAccelerator.is_available()
trainer = Trainer(default_root_dir=tmpdir, ipus=1)
assert isinstance(trainer.accelerator, IPUAccelerator)
trainer = Trainer(default_root_dir=tmpdir, ipus=1, accelerator="ipu")
1 change: 1 addition & 0 deletions tests/accelerators/test_tpu.py
@@ -83,6 +83,7 @@ def test_if_test_works_after_train(tmpdir):

@RunIf(tpu=True)
def test_accelerator_tpu():
assert TPUAccelerator.is_available()

trainer = Trainer(accelerator="tpu", tpu_cores=8)

3 changes: 2 additions & 1 deletion tests/models/test_gpu.py
@@ -235,8 +235,9 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun
},
)
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("torch.cuda.is_available", return_value=True)
@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0"])
def test_torchelastic_gpu_parsing(mocked_device_count, gpus):
def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus):
"""Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit
sanitizing the gpus as only one of the GPUs is visible."""
trainer = Trainer(gpus=gpus)
3 changes: 2 additions & 1 deletion tests/plugins/test_amp_plugins.py
@@ -45,6 +45,7 @@ class MyApexPlugin(ApexMixedPrecisionPlugin):
"SLURM_LOCALID": "0",
},
)
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.device_count", return_value=2)
@pytest.mark.parametrize("strategy,gpus", [("ddp", 2), ("ddp2", 2), ("ddp_spawn", 2)])
@pytest.mark.parametrize(
Expand All @@ -56,7 +57,7 @@ class MyApexPlugin(ApexMixedPrecisionPlugin):
pytest.param("apex", True, MyApexPlugin, marks=RunIf(amp_apex=True)),
],
)
def test_amp_apex_ddp(mocked_device_count, strategy, gpus, amp, custom_plugin, plugin_cls):
def test_amp_apex_ddp(mocked_device_count, mocked_is_available, strategy, gpus, amp, custom_plugin, plugin_cls):
plugin = None
if custom_plugin:
plugin = plugin_cls(16, "cpu") if amp == "native" else plugin_cls()
1 change: 1 addition & 0 deletions tests/utilities/test_cli.py
@@ -184,6 +184,7 @@ def test_parse_args_parsing_complex_types(cli_args, expected, instantiate):
def test_parse_args_parsing_gpus(monkeypatch, cli_args, expected_gpu):
"""Test parsing of gpus and instantiation of Trainer."""
monkeypatch.setattr("torch.cuda.device_count", lambda: 2)
monkeypatch.setattr("torch.cuda.is_available", lambda: True)
cli_args = cli_args.split(" ") if cli_args else []
with mock.patch("sys.argv", ["any.py"] + cli_args):
parser = LightningArgumentParser(add_help=False, parse_as_dict=False)
Expand Down