343343import time
344344import random
345345import torch
346- from typing import Union
346+ from typing import Union , Callable
347347
348348from pytorch_lightning import _logger as log
349349from pytorch_lightning .loggers import LightningLoggerBase
@@ -748,26 +748,33 @@ def determine_root_gpu_device(gpus):
748748 return root_gpu
749749
750750
def retry_jittered_backoff(func: Callable, num_retries: int = 5, cap_delay: float = 1.0, base_delay: float = 0.01):
    """Retry ``func`` with exponential backoff and full jitter.

    Based on:
    https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/

    Args:
        func: function to call; it is retried whenever it raises ``RuntimeError``
        num_retries: maximum number of attempts
        cap_delay: maximum sleep time between attempts (seconds)
        base_delay: initial sleep time between attempts (seconds)

    Returns:
        whatever ``func()`` returns on the first successful attempt

    Raises:
        RuntimeError: the last error raised by ``func`` once all retries are used up
    """
    sleep_delay = base_delay  # initial sleep time (10ms with the default base_delay)

    for i in range(num_retries):
        try:
            return func()
        except RuntimeError:
            # On the final attempt, propagate the error to the caller.
            if i == num_retries - 1:
                raise
            # BUG FIX: the original code executed `continue` here, which jumped
            # straight to the next attempt and skipped the sleep/jitter lines
            # below (they sat after the try/except and were unreachable), so no
            # backoff ever happened. Sleep before retrying instead.
            time.sleep(sleep_delay)
            sleep_delay = min(cap_delay, random.uniform(base_delay, sleep_delay * 3))
768775
769776
770- def pick_single_gpu (exclude_gpus = [] ):
777+ def pick_single_gpu (exclude_gpus : list ):
771778 for i in range (torch .cuda .device_count ()):
772779 if i in exclude_gpus :
773780 continue
@@ -781,9 +788,9 @@ def pick_single_gpu(exclude_gpus=[]):
781788 raise RuntimeError ("No GPUs available." )
782789
783790
def pick_multiple_gpus(nb):
    """Pick ``nb`` available GPUs, one at a time.

    Each pick is delegated to ``pick_single_gpu``, excluding the indices
    already chosen so the same GPU is never selected twice.

    Args:
        nb: number of GPUs to pick

    Returns:
        list of the chosen GPU indices
    """
    chosen = []
    while len(chosen) < nb:
        chosen.append(pick_single_gpu(exclude_gpus=chosen))

    return chosen
0 commit comments