Commit 2d5db37

Allard Hendriksen committed
Add automatic GPU choice to trainer
This commit adds the `gpu_choice` parameter to `Trainer`. By default, this parameter is set to `'manual'`, which causes no observable difference in behavior. When `gpu_choice` is set to `'auto'` and `gpus` is an int, the trainer automatically allocates the first available GPUs. This is especially useful when GPUs are configured to be in "exclusive mode", which means that only one process at a time can use them.
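In use, this looks as follows (a minimal sketch based on the diff below; `gpus=2` is an arbitrary example value):

```python
from pytorch_lightning import Trainer

# Default: `gpus` is interpreted exactly as before.
trainer = Trainer(gpus=2, gpu_choice='manual')

# New: probe the visible devices and train on the first two that
# accept an allocation, e.g. on a node running in exclusive mode.
trainer = Trainer(gpus=2, gpu_choice='auto')
```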
1 parent b5c6d0e commit 2d5db37

4 files changed: +85 −4 lines

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
+## [unreleased] - YYYY-MM-DD
+
+### Added
+
+- Added `gpu_choice` to trainer which can enable automatically picking the first available GPU on exclusive mode systems.
+
 ## [0.7.2] - 2020-04-07
 
 ### Added

pytorch_lightning/trainer/distrib_parts.py

Lines changed: 43 additions & 1 deletion
@@ -339,7 +339,8 @@
 
 import os
 from abc import ABC, abstractmethod
-
+import time
+import random
 import torch
 
 from pytorch_lightning import _logger as log
@@ -646,3 +647,44 @@ def determine_root_gpu_device(gpus):
     root_gpu = gpus[0]
 
     return root_gpu
+
+
+def retry_jittered_backoff(f, num_retries=5):
+    # Retry f on RuntimeError, sleeping with exponential backoff
+    # and jitter between attempts. Based on:
+    # https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
+    cap = 1.0    # max sleep time is 1s
+    base = 0.01  # initial sleep time is 10ms
+    sleep = base  # current sleep time
+
+    for i in range(num_retries):
+        try:
+            return f()
+        except RuntimeError as e:
+            if i == num_retries - 1:
+                raise e
+            time.sleep(sleep)
+            sleep = min(cap, random.uniform(base, sleep * 3))
+
+
+def pick_single_gpu(exclude_gpus=[]):
+    for i in range(torch.cuda.device_count()):
+        if i in exclude_gpus:
+            continue
+        # Try to allocate on device:
+        device = torch.device(f"cuda:{i}")
+        try:
+            torch.ones(1).to(device)
+        except RuntimeError:
+            continue
+        return i
+    raise RuntimeError("No GPUs available.")
+
+
+def pick_multiple_gpus(n):
+    picked = []
+    for _ in range(n):
+        picked.append(pick_single_gpu(exclude_gpus=picked))
+
+    return picked
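Note that `retry_jittered_backoff` is defined here but not yet called from `pick_single_gpu`. A minimal sketch of how the two new helpers could be combined on a busy exclusive-mode node (this wiring is an assumption, not part of the diff):

```python
import torch

from pytorch_lightning.trainer.distrib_parts import (
    pick_single_gpu,
    retry_jittered_backoff,
)

# pick_single_gpu raises RuntimeError when every device rejects the
# allocation probe; retry_jittered_backoff retries on exactly that
# exception, sleeping between attempts to avoid a thundering herd.
gpu_index = retry_jittered_backoff(pick_single_gpu, num_retries=5)
device = torch.device(f"cuda:{gpu_index}")
```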

pytorch_lightning/trainer/trainer.py

Lines changed: 21 additions & 3 deletions
@@ -23,7 +23,12 @@
 from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
 from pytorch_lightning.trainer.deprecated_api import TrainerDeprecatedAPITillVer0_8, TrainerDeprecatedAPITillVer0_9
 from pytorch_lightning.trainer.distrib_data_parallel import TrainerDDPMixin
-from pytorch_lightning.trainer.distrib_parts import TrainerDPMixin, parse_gpu_ids, determine_root_gpu_device
+from pytorch_lightning.trainer.distrib_parts import (
+    TrainerDPMixin,
+    parse_gpu_ids,
+    determine_root_gpu_device,
+    pick_multiple_gpus,
+)
 from pytorch_lightning.trainer.evaluation_loop import TrainerEvaluationLoopMixin
 from pytorch_lightning.trainer.logging import TrainerLoggingMixin
 from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin
@@ -85,6 +90,7 @@ def __init__(
         process_position: int = 0,
         num_nodes: int = 1,
         gpus: Optional[Union[List[int], str, int]] = None,
+        gpu_choice: str = 'manual',
         num_tpu_cores: Optional[int] = None,
         log_gpu_memory: Optional[str] = None,
         progress_bar_refresh_rate: int = 1,
@@ -158,6 +164,14 @@ def __init__(
 
         gpus: Which GPUs to train on.
 
+        gpu_choice: 'manual' (default) or 'auto'.
+
+            If 'auto' and `gpus` is an integer, pick the first
+            available GPUs automatically. This is especially
+            useful when GPUs are configured to be in "exclusive
+            mode", which means that only one process at a time can
+            use them.
+
         num_tpu_cores: How many TPU cores to train on (1 or 8).
 
         log_gpu_memory: None, 'min_max', 'all'. Might slow performance
@@ -385,8 +399,12 @@ def __init__(
         self.accumulate_grad_batches = accumulate_grad_batches
         self.configure_accumulated_gradients(accumulate_grad_batches)
 
-        # allow int, string and gpu list
-        self.gpus = gpus
+        # for gpus allow int, string and gpu list
+        if gpu_choice == "auto" and isinstance(gpus, int):
+            self.gpus = pick_multiple_gpus(gpus)
+        else:
+            self.gpus = gpus
+
         self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
         self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids)
         self.root_device = torch.device("cpu")
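Concretely, the new branch resolves an int `gpus` into an explicit list of free device indices before `parse_gpu_ids` runs. A sketch of the equivalence (illustrative only; which indices get picked depends on which devices accept the allocation probe):

```python
from pytorch_lightning import Trainer
from pytorch_lightning.trainer.distrib_parts import pick_multiple_gpus

# What the new branch does internally: resolve the int to a concrete
# list of free device indices, then hand that list to parse_gpu_ids.
trainer = Trainer(gpus=pick_multiple_gpus(2))

# Equivalent, with the resolution done inside Trainer.__init__:
trainer = Trainer(gpus=2, gpu_choice="auto")
```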

tests/trainer/test_trainer.py

Lines changed: 15 additions & 0 deletions
@@ -658,3 +658,18 @@ def on_batch_start(self, trainer, pl_module):
     assert not trainer.interrupted
     trainer.fit(model)
     assert trainer.interrupted
+
+
+def test_gpu_choice(tmpdir):
+    trainer_options = dict(
+        default_save_path=tmpdir,
+    )
+    # Only run if CUDA is available
+    if not torch.cuda.is_available():
+        return
+
+    num_gpus = torch.cuda.device_count()
+    Trainer(**trainer_options, gpus=num_gpus, gpu_choice="auto")
+
+    with pytest.raises(RuntimeError, match=r'.*No GPUs available.*'):
+        Trainer(**trainer_options, gpus=num_gpus + 1, gpu_choice="auto")
