Lightning-AI · tchaton · Apr 13, 2021 · Apr 10, 2021 · Apr 12, 2021 · Apr 12, 2021
@@ -243,6 +243,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `AttributeError for `require_backward_grad_sync` when running manual optimization with sharded plugin ([#6915](https://github.com/PyTorchLightning/pytorch-lightning/pull/6915))
 
 
+- Fixed multi-gpu join for Horovod ([#6954](https://github.com/PyTorchLightning/pytorch-lightning/pull/6954))
+
+
 - Fixed a bug where `LightningModule.training_epoch_end` was called after the `on_train_end_epoch` hook ([#6969](https://github.com/PyTorchLightning/pytorch-lightning/pull/6969))
 
 

@@ -99,26 +99,26 @@ def start_training(self, trainer):
             self._results = trainer.run_stage()
 
         # Make sure all workers have finished training before returning to the user
-        hvd.join()
+        self.join()
 
     def start_evaluating(self, trainer):
         with ExitStack():
             self._results = trainer.run_stage()
 
         # Make sure all workers have finished training before returning to the user
-        hvd.join()
+        self.join()
 
     def start_predicting(self, trainer):
         with ExitStack():
             # set up training routine
             self._results = trainer.run_stage()
 
         # Make sure all workers have finished training before returning to the user
-        hvd.join()
+        self.join()
 
     def barrier(self, *args, **kwargs):
         if torch_distrib.is_initialized():
-            hvd.join()
+            self.join()
 
     def broadcast(self, obj: object, src: int = 0) -> object:
         obj = hvd.broadcast_object(obj, src)
@@ -129,6 +129,12 @@ def model_to_device(self):
             torch.cuda.set_device(self.root_device)
         self.model.to(self.root_device)
 
+    def join(self):
+        if self.on_gpu:
+            hvd.join(self.local_rank)
+        else:
+            hvd.join()
+
     def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"):
         """
         Reduces a tensor from several distributed processes to one aggregated tensor.
@@ -156,7 +162,7 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[
             raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")
 
         # sync all processes before reduction
-        hvd.join()
+        self.join()
         return hvd.allreduce(tensor, op=reduce_op)
 
     def all_gather(
@@ -176,7 +182,7 @@ def all_gather(
             result = result.reshape(1)
 
         # sync and gather all
-        hvd.join()
+        self.join()
         gathered = hvd.allgather(result)
         gathered_result = list(gathered.split(1, dim=0))
         return gathered_result

@@ -377,7 +377,7 @@ def _compute_batch():
 
 
 # todo: need to be fixed :]
-@pytest.mark.skip(reason="TODO Breaking CI: Aborted (core dumped)")
+@pytest.mark.skip('TODO: flaky test - Fatal Python error: Aborted')
 @RunIf(skip_windows=True, horovod=True)
 def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir):
Original file line number	Diff line number	Diff line change
Expand Up		@@ -243,6 +243,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
		- Fixed `AttributeError for `require_backward_grad_sync` when running manual optimization with sharded plugin ([#6915](https://github.com/PyTorchLightning/pytorch-lightning/pull/6915))


		- Fixed multi-gpu join for Horovod ([#6954](https://github.com/PyTorchLightning/pytorch-lightning/pull/6954))


		- Fixed a bug where `LightningModule.training_epoch_end` was called after the `on_train_end_epoch` hook ([#6969](https://github.com/PyTorchLightning/pytorch-lightning/pull/6969))


Expand Down