Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `AttributeError for `require_backward_grad_sync` when running manual optimization with sharded plugin ([#6915](https://github.com/PyTorchLightning/pytorch-lightning/pull/6915))


- Fixed multi-gpu join for Horovod ([#6954](https://github.com/PyTorchLightning/pytorch-lightning/pull/6954))


- Fixed a bug where `LightningModule.training_epoch_end` was called after the `on_train_end_epoch` hook ([#6969](https://github.com/PyTorchLightning/pytorch-lightning/pull/6969))


Expand Down
18 changes: 12 additions & 6 deletions pytorch_lightning/plugins/training_type/horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,26 +99,26 @@ def start_training(self, trainer):
self._results = trainer.run_stage()

# Make sure all workers have finished training before returning to the user
hvd.join()
self.join()

def start_evaluating(self, trainer):
with ExitStack():
self._results = trainer.run_stage()

# Make sure all workers have finished training before returning to the user
hvd.join()
self.join()

def start_predicting(self, trainer):
with ExitStack():
# set up training routine
self._results = trainer.run_stage()

# Make sure all workers have finished training before returning to the user
hvd.join()
self.join()

def barrier(self, *args, **kwargs):
if torch_distrib.is_initialized():
hvd.join()
self.join()

def broadcast(self, obj: object, src: int = 0) -> object:
obj = hvd.broadcast_object(obj, src)
Expand All @@ -129,6 +129,12 @@ def model_to_device(self):
torch.cuda.set_device(self.root_device)
self.model.to(self.root_device)

def join(self):
if self.on_gpu:
hvd.join(self.local_rank)
else:
hvd.join()

def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"):
"""
Reduces a tensor from several distributed processes to one aggregated tensor.
Expand Down Expand Up @@ -156,7 +162,7 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[
raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")

# sync all processes before reduction
hvd.join()
self.join()
return hvd.allreduce(tensor, op=reduce_op)

def all_gather(
Expand All @@ -176,7 +182,7 @@ def all_gather(
result = result.reshape(1)

# sync and gather all
hvd.join()
self.join()
gathered = hvd.allgather(result)
gathered_result = list(gathered.split(1, dim=0))
return gathered_result
Expand Down
2 changes: 1 addition & 1 deletion tests/models/test_horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ def _compute_batch():


# todo: need to be fixed :]
@pytest.mark.skip(reason="TODO Breaking CI: Aborted (core dumped)")
@pytest.mark.skip('TODO: flaky test - Fatal Python error: Aborted')
@RunIf(skip_windows=True, horovod=True)
def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir):

Expand Down