Skip to content

Commit 5bd3cd5

Browse files
ejohba and awaelchli authored
Bugfix/cuda oom detection and handling (#6934)
Co-authored-by: Adrian Wälchli <[email protected]>
1 parent 895bea1 commit 5bd3cd5

File tree

1 file changed

+9
-2
lines changed

1 file changed

+9
-2
lines changed

pytorch_lightning/utilities/memory.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,8 @@ def is_oom_error(exception):
5353
def is_cuda_out_of_memory(exception):
5454
return isinstance(exception, RuntimeError) \
5555
and len(exception.args) == 1 \
56-
and "CUDA out of memory." in exception.args[0]
56+
and "CUDA" in exception.args[0] \
57+
and "out of memory" in exception.args[0]
5758

5859

5960
# based on https://github.com/BlackHC/toma/blob/master/toma/torch_cuda_memory.py
@@ -76,4 +77,10 @@ def garbage_collection_cuda():
7677
"""Garbage collection Torch (CUDA) memory."""
7778
gc.collect()
7879
if torch.cuda.is_available():
79-
torch.cuda.empty_cache()
80+
try:
81+
# This is the last thing that should cause an OOM error, but seemingly it can.
82+
torch.cuda.empty_cache()
83+
except RuntimeError as exception:
84+
if not is_oom_error(exception):
85+
# Only handle OOM errors
86+
raise

0 commit comments

Comments (0)