
Commit 274c9aa

is_available

1 parent 19bc8fa · commit 274c9aa


11 files changed: +56 −15 lines changed


CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added a `_Stateful` support for `LightningDataModule` ([#11637](https://github.com/PyTorchLightning/pytorch-lightning/pull/11637))


-- Added checks to `GPUAccelerator` to assert CUDA availability at initialization ([#11797](https://github.com/PyTorchLightning/pytorch-lightning/pull/11797))
+- Added `Accelerator.is_available` to assert device availability ([#11797](https://github.com/PyTorchLightning/pytorch-lightning/pull/11797))


 ### Changed

pytorch_lightning/accelerators/accelerator.py

Lines changed: 11 additions & 0 deletions
@@ -35,7 +35,13 @@ def setup_environment(self, root_device: torch.device) -> None:

         This is called before the LightningModule/DataModule setup hook which allows the user to access the accelerator
         environment before setup is complete.
+
+        Raises:
+            RuntimeError:
+                If corresponding hardware is not found.
         """
+        if not self.is_available():
+            raise RuntimeError(f"{self.__class__.__qualname__} is not configured to run on this hardware.")

     def setup(self, trainer: "pl.Trainer") -> None:
         """Setup plugins for the trainer fit and creates optimizers.

@@ -59,3 +65,8 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
     @abstractmethod
     def auto_device_count() -> int:
         """Get the device count when set to auto."""
+
+    @staticmethod
+    @abstractmethod
+    def is_available() -> bool:
+        """Detect if the hardware is available."""

pytorch_lightning/accelerators/cpu.py

Lines changed: 6 additions & 0 deletions
@@ -31,6 +31,7 @@ def setup_environment(self, root_device: torch.device) -> None:
         MisconfigurationException:
             If the selected device is not CPU.
         """
+        super().setup_environment(root_device)
         if root_device.type != "cpu":
             raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.")


@@ -42,3 +43,8 @@ def get_device_stats(self, device: _DEVICE) -> dict[str, Any]:
     def auto_device_count() -> int:
         """Get the devices when set to auto."""
         return 1
+
+    @staticmethod
+    def is_available() -> bool:
+        """CPU is always available for execution."""
+        return True

pytorch_lightning/accelerators/gpu.py

Lines changed: 5 additions & 12 deletions
@@ -33,24 +33,13 @@
 class GPUAccelerator(Accelerator):
     """Accelerator for GPU devices."""

-    def __init__(self) -> None:
-        """
-        Raises:
-            MisconfigurationException:
-                If torch.cuda isn't available.
-                If no CUDA devices are found.
-        """
-        if not torch.cuda.is_available():
-            raise MisconfigurationException("GPU Accelerator used, but CUDA isn't available.")
-        if torch.cuda.device_count() == 0:
-            raise MisconfigurationException("GPU Accelerator used, but found no CUDA devices available.")
-
     def setup_environment(self, root_device: torch.device) -> None:
         """
         Raises:
             MisconfigurationException:
                 If the selected device is not GPU.
         """
+        super().setup_environment(root_device)
         if root_device.type != "cuda":
             raise MisconfigurationException(f"Device should be GPU, got {root_device} instead")
         torch.cuda.set_device(root_device)

@@ -91,6 +80,10 @@ def auto_device_count() -> int:
         """Get the devices when set to auto."""
         return torch.cuda.device_count()

+    @staticmethod
+    def is_available() -> bool:
+        return torch.cuda.is_available() and torch.cuda.device_count() > 0
+

 def get_nvidia_gpu_stats(device: _DEVICE) -> dict[str, float]:
     """Get GPU stats including memory, fan speed, and temperature from nvidia-smi.

pytorch_lightning/accelerators/ipu.py

Lines changed: 5 additions & 0 deletions
@@ -16,6 +16,7 @@
 import torch

 from pytorch_lightning.accelerators.accelerator import Accelerator
+from pytorch_lightning.utilities import _IPU_AVAILABLE


 class IPUAccelerator(Accelerator):

@@ -31,3 +32,7 @@ def auto_device_count() -> int:
         # TODO (@kaushikb11): 4 is the minimal unit they are shipped in.
         # Update this when api is exposed by the Graphcore team.
         return 4
+
+    @staticmethod
+    def is_available() -> bool:
+        return _IPU_AVAILABLE

pytorch_lightning/accelerators/tpu.py

Lines changed: 5 additions & 1 deletion
@@ -16,7 +16,7 @@
 import torch

 from pytorch_lightning.accelerators.accelerator import Accelerator
-from pytorch_lightning.utilities import _XLA_AVAILABLE
+from pytorch_lightning.utilities.imports import _TPU_AVAILABLE, _XLA_AVAILABLE

 if _XLA_AVAILABLE:
     import torch_xla.core.xla_model as xm

@@ -47,3 +47,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
     def auto_device_count() -> int:
         """Get the devices when set to auto."""
         return 8
+
+    @staticmethod
+    def is_available() -> bool:
+        return _TPU_AVAILABLE

tests/accelerators/test_cpu.py

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,10 @@ def test_restore_checkpoint_after_pre_setup_default():
     assert not plugin.restore_checkpoint_after_setup


+def test_availability():
+    assert CPUAccelerator.is_available()
+
+
 @pytest.mark.parametrize("restore_after_pre_setup", [True, False])
 def test_restore_checkpoint_after_pre_setup(tmpdir, restore_after_pre_setup):
     """Test to ensure that if restore_checkpoint_after_setup is True, then we only load the state after pre-

tests/accelerators/test_dp.py

Lines changed: 0 additions & 1 deletion
@@ -160,7 +160,6 @@ def _assert_extra_outputs(self, outputs):
 @mock.patch("torch.cuda.device_count", return_value=2)
 def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir, mock_is_available, mock_device_count):
     """Test that an exception is raised when overriding batch_transfer_hooks in DP model."""
-    # monkeypatch.setattr("torch.cuda.device_count", lambda: 2)

     class CustomModel(BoringModel):
         def transfer_batch_to_device(self, batch, device, dataloader_idx):

tests/accelerators/test_gpu.py

Lines changed: 17 additions & 0 deletions
@@ -60,3 +60,20 @@ def test_set_cuda_device(set_device_mock, tmpdir):
     )
     trainer.fit(model)
     set_device_mock.assert_called_once()
+
+
+@RunIf(min_gpus=1)
+def test_gpu_availability():
+    assert GPUAccelerator.is_available()
+
+
+@mock.patch("torch.cuda.is_available", return_value=True)
+@mock.patch("torch.cuda.device_count", return_value=2)
+def test_mocked_gpu_available(*_):
+    assert GPUAccelerator.is_available()
+
+
+@mock.patch("torch.cuda.is_available", return_value=False)
+@mock.patch("torch.cuda.device_count", return_value=0)
+def test_mocked_gpu_availability(*_):
+    assert not GPUAccelerator.is_available()

tests/accelerators/test_ipu.py

Lines changed: 1 addition & 0 deletions
@@ -106,6 +106,7 @@ def test_fail_if_no_ipus(tmpdir):

 @RunIf(ipu=True)
 def test_accelerator_selected(tmpdir):
+    assert IPUAccelerator.is_available()
     trainer = Trainer(default_root_dir=tmpdir, ipus=1)
     assert isinstance(trainer.accelerator, IPUAccelerator)
     trainer = Trainer(default_root_dir=tmpdir, ipus=1, accelerator="ipu")
