Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
89f284d
Fix some test errors
Mar 23, 2021
80cfbff
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 23, 2021
536c132
checkpoint consolidation
Mar 24, 2021
f172101
Update ddp_spawn.py
shuyingsunshine21 Mar 24, 2021
bf70e43
Update test_metric_result_integration.py
shuyingsunshine21 Mar 24, 2021
ea74906
Update test_results.py
shuyingsunshine21 Mar 24, 2021
a9aae99
Update utils.py
shuyingsunshine21 Mar 24, 2021
70fe5da
Update utils.py
shuyingsunshine21 Mar 24, 2021
0d23d75
Update test_all_gather_grad.py
shuyingsunshine21 Mar 24, 2021
ca6f98b
Update test_all_gather_grad.py
shuyingsunshine21 Mar 24, 2021
c5053da
Merge pull request #1 from shuyingsunshine21/shuyingsunshine21-checkp…
shuyingsunshine21 Mar 24, 2021
9d4a2b8
Update test_results.py
shuyingsunshine21 Mar 24, 2021
7635b4f
Revert "Update test_results.py"
shuyingsunshine21 Mar 24, 2021
d64f90c
Revert "Merge pull request #1 from shuyingsunshine21/shuyingsunshine2…
shuyingsunshine21 Mar 24, 2021
dcdcd29
Revert "Update test_all_gather_grad.py"
shuyingsunshine21 Mar 24, 2021
8651d54
Revert "Update utils.py"
shuyingsunshine21 Mar 24, 2021
15f4b9e
Revert "Update utils.py"
shuyingsunshine21 Mar 24, 2021
250d0aa
Revert "Update test_results.py"
shuyingsunshine21 Mar 24, 2021
6c095b2
Revert "Update test_metric_result_integration.py"
shuyingsunshine21 Mar 24, 2021
8222dc9
Revert "Update ddp_spawn.py"
shuyingsunshine21 Mar 24, 2021
3a9fde9
Revert "checkpoint consolidation"
shuyingsunshine21 Mar 24, 2021
7a369f4
Revert "Revert "checkpoint consolidation""
shuyingsunshine21 Mar 24, 2021
b4a0b9e
Revert "Revert "Revert "checkpoint consolidation"""
shuyingsunshine21 Mar 24, 2021
5cf1db1
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 24, 2021
0ce7e05
Revert "Revert "Update ddp_spawn.py""
shuyingsunshine21 Mar 24, 2021
fe9736d
Revert "Revert "Update test_metric_result_integration.py""
shuyingsunshine21 Mar 24, 2021
c314ef6
Revert "Revert "Update test_results.py""
shuyingsunshine21 Mar 24, 2021
c3feda0
Revert "Revert "Update utils.py""
shuyingsunshine21 Mar 24, 2021
c759477
Revert "Revert "Update test_all_gather_grad.py""
shuyingsunshine21 Mar 24, 2021
7a8e540
Merge branch 'master' of https://github.com/shuyingsunshine21/pytorch…
Mar 24, 2021
ab8b849
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 24, 2021
4e67db2
modify distributed environment to make test pass
Mar 24, 2021
67b6188
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 25, 2021
56864d9
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 5, 2021
cb39c74
remove fragile error handling
Apr 6, 2021
1f12018
draft fix v1
Apr 7, 2021
c3119ac
pull upstream master
Apr 7, 2021
834aa53
remove test related env flags
Apr 7, 2021
46be491
comments
Apr 8, 2021
0aff9a3
add changelog
Apr 8, 2021
7ddb196
fix
Apr 8, 2021
ba50f5d
formatting issue
Apr 8, 2021
2644953
modify trainer doc
Apr 9, 2021
b8f877a
Update CHANGELOG
carmocca Apr 13, 2021
6403717
Merge branch 'master' into train_end_try_catch
carmocca Apr 13, 2021
d9f45e3
Merge branch 'master' into train_end_try_catch
carmocca Apr 13, 2021
b11bd93
Fix test
carmocca Apr 13, 2021
8c1153f
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 14, 2021
2bb6c99
fix test for test_training_loop
Apr 14, 2021
0986f37
Merge branch 'train_end_try_catch' of https://github.com/shuyingsunsh…
Apr 14, 2021
c8c8b08
formatting
Apr 14, 2021
d8ca310
Fix broken test
ethanwharris Apr 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Changed profilers to save separate report files per state and rank ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621))


- The trainer no longer tries to save a checkpoint on exception or run callback's `on_train_end` functions ([#6864](https://github.com/PyTorchLightning/pytorch-lightning/pull/6864))


- Changed `PyTorchProfiler` to use `torch.autograd.profiler.record_function` to record functions ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349))


Expand Down Expand Up @@ -258,6 +261,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `sync_dist` for tpus ([#6950](https://github.com/PyTorchLightning/pytorch-lightning/pull/6950))


- Fixed bug for trainer error handling which would cause hang for distributed training ([#6864](https://github.com/PyTorchLightning/pytorch-lightning/pull/6864))


- Fixed `self.device` not returning the correct device in replicas of data-parallel ([#6414](https://github.com/PyTorchLightning/pytorch-lightning/pull/6414))


Expand Down
4 changes: 2 additions & 2 deletions docs/source/common/trainer.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ So you can run it like so:
.. note::
If you want to stop a training run early, you can press "Ctrl + C" on your keyboard.
The trainer will catch the ``KeyboardInterrupt`` and attempt a graceful shutdown, including
running callbacks such as ``on_train_end``. The trainer object will also set an attribute
``interrupted`` to ``True`` in such cases. If you have a callback which shuts down compute
running accelerator callback ``on_train_end`` to clean up memory. The trainer object will also set
an attribute ``interrupted`` to ``True`` in such cases. If you have a callback which shuts down compute
resources, for example, you can conditionally run the shutdown logic for only uninterrupted runs.

------------
Expand Down
22 changes: 12 additions & 10 deletions pytorch_lightning/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import warnings
from itertools import count
from pathlib import Path
from traceback import print_exc
from typing import Any, Dict, Iterable, List, Optional, Union

import torch
Expand Down Expand Up @@ -420,7 +419,7 @@ def fit(
# we reuse fit for other functions. When already set, it shouldn't be modified.
if not self.state.running:
self.state = TrainerState.FITTING
if self._running_stage is None:
if self._running_stage is None or self.tuning:
self.training = True

# set local properties on the model
Expand Down Expand Up @@ -607,6 +606,7 @@ def run_train(self) -> None:
self.train_loop.run_training_epoch()

if self.max_steps and self.max_steps <= self.global_step:
self.train_loop.on_train_end()
return

# early stopping
Expand All @@ -615,6 +615,7 @@ def run_train(self) -> None:

if self.should_stop:
if met_min_epochs and met_min_steps:
self.train_loop.on_train_end()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shall we also remove this from train_loop.on_train_end ? https://github.com/PyTorchLightning/pytorch-lightning/blob/b7a22ba046ba57072a71b12d16caff000e66f798/pytorch_lightning/trainer/training_loop.py#L114-L118

as this is one of the possible sources of bugs with the attempt at graceful shutdown

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I have a PR in progress for this. Need to finish it. This is part of what causes #6470

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ananthsub , for this case, train does not have error (just early stopping), should we run train_loop.on_train_end to perform checkpointing?

@camruta , for why here I use train_loop.on_train_end, but for the case exception, I use accelerator.on_train_end, is because trying to avoid checkpointing when there is error for some ranks but not all (hence hang in broadcast).

To avoid divergence (normal end vs. error end), do you think we actually should remove the problematic checkpointing logic in training_loop.on_train_end. But there is also risk of adding logics in training_loop.on_train_end which could potentially need all ranks to be involved. How do we enforce that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

open to discussion

return
else:
log.info(
Expand All @@ -633,14 +634,15 @@ def run_train(self) -> None:
if not self.interrupted:
self.state = TrainerState.INTERRUPTED
self.on_keyboard_interrupt()
except (RuntimeError, AssertionError):
# if an exception is raised, the finally block is executed and can hide the actual exception
# that was initially raised if `on_train_end` also raises an exception. we want to avoid that
# for assertions and other runtime errors so we aren't misled while debugging
print_exc()
finally:
# hook
self.train_loop.on_train_end()
# same treatment as below
self.accelerator.on_train_end()
self._running_stage = None
except BaseException:
# give accelerators a chance to finish
self.accelerator.on_train_end()
# reset bookkeeping
self._running_stage = None
raise

def run_evaluation(self, on_epoch=False):
if not (self.evaluating or self.sanity_checking):
Expand Down
3 changes: 1 addition & 2 deletions tests/callbacks/test_callback_hook_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, datal
assert 'x' in outputs

def on_train_epoch_end(self, trainer, pl_module, outputs):
d = outputs[0]
assert len(d) == trainer.num_training_batches
assert len(outputs) == trainer.num_training_batches

class TestModel(BoringModel):

Expand Down
4 changes: 3 additions & 1 deletion tests/trainer/logging_/test_logger_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,10 @@ def train_dataloader(self):
sampler=None,
)

def training_step_end(self, *_):
def training_step_end(self, training_step_output):
self.train_results = deepcopy(self.trainer.logger_connector.cached_results)
# must return
return training_step_output

model = TestModel()
model.training_epoch_end = None
Expand Down
41 changes: 13 additions & 28 deletions tests/trainer/optimization/test_manual_optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@


@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
def test_multiple_optimizers_manual(tmpdir):
def test_multiple_optimizers_manual_no_return(tmpdir):
"""
Tests that only training_step can be used
"""
Expand Down Expand Up @@ -68,8 +68,9 @@ def training_step(self, batch, batch_idx):
assert torch.all(self.layer.weight.grad == 0)

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2
# outputs is empty as training_step does not return
# and it is not automatic optimization
assert len(outputs) == 0

def configure_optimizers(self):
optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
Expand Down Expand Up @@ -279,8 +280,9 @@ def training_step(self, batch, batch_idx):
assert torch.all(self.layer.weight.grad == 0)

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2
# outputs is empty as training_step does not return
# and it is not automatic optimization
assert len(outputs) == 0

def configure_optimizers(self):
optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
Expand Down Expand Up @@ -310,7 +312,7 @@ def configure_optimizers(self):

@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
@RunIf(min_gpus=1, amp_apex=True)
def test_multiple_optimizers_manual_apex(tmpdir):
def test_multiple_optimizers_manual_apex_no_return(tmpdir):
"""
Tests that only training_step can be used
"""
Expand Down Expand Up @@ -353,8 +355,9 @@ def training_step(self, batch, batch_idx):
assert torch.all(self.layer.weight.grad == 0)

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2
# outputs is empty as training_step does not return
# and it is not automatic optimization
assert len(outputs) == 0

def configure_optimizers(self):
optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
Expand Down Expand Up @@ -638,6 +641,8 @@ def training_step(self, batch, batch_idx):
opt_b.step()
opt_b.zero_grad()

return {'loss1': loss_1, 'loss2': loss_2}

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2
Expand Down Expand Up @@ -724,10 +729,6 @@ def optimizer_closure():
weight_after = self.layer.weight.clone()
assert not torch.equal(weight_before, weight_after)

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2

def configure_optimizers(self):
optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
return optimizer
Expand Down Expand Up @@ -788,10 +789,6 @@ def optimizer_closure():
else:
assert self.layer.weight.grad is not None

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2

def configure_optimizers(self):
optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
return optimizer
Expand Down Expand Up @@ -845,10 +842,6 @@ def optimizer_closure():
opt.step(closure=optimizer_closure)
opt.zero_grad()

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2

def configure_optimizers(self):
optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
return optimizer
Expand Down Expand Up @@ -923,10 +916,6 @@ def dis_closure():
opt_dis.step(closure=dis_closure)
opt_dis.zero_grad()

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2

def configure_optimizers(self):
optimizer_gen = torch.optim.SGD(self.layer.parameters(), lr=0.1)
optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001)
Expand Down Expand Up @@ -1031,10 +1020,6 @@ def dis_closure():
if make_dis_optimizer_step:
opt_dis.step(closure=dis_closure)

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2

def configure_optimizers(self):
optimizer_gen = torch.optim.SGD(self.layer.parameters(), lr=0.1)
optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001)
Expand Down
5 changes: 3 additions & 2 deletions tests/trainer/optimization/test_multiple_optimizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ def training_step(self, batch, batch_idx):
opt_b.zero_grad()

def training_epoch_end(self, outputs) -> None:
# outputs should be an array with an entry per optimizer
assert len(outputs) == 2
# outputs is empty as training_step does not return
# and it is not automatic optimization
assert len(outputs) == 0

model = TestModel()
model.val_dataloader = None
Expand Down
62 changes: 36 additions & 26 deletions tests/trainer/test_training_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,17 @@ def on_after_backward(self):
super().on_after_backward()

def optimizer_step(
self,
self,
epoch,
batch_idx,
optimizer,
optimizer_idx,
optimizer_closure,
on_tpu,
using_native_amp,
using_lbfgs,
):
super().optimizer_step(
epoch,
batch_idx,
optimizer,
Expand All @@ -67,11 +77,10 @@ def optimizer_step(
on_tpu,
using_native_amp,
using_lbfgs,
):
super().optimizer_step(
epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs
)
self.called.append("optimizer_step") # append after as closure calls other methods
self.called.append(
"optimizer_step"
) # append after as closure calls other methods

def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx):
self.called.append("on_train_batch_end")
Expand Down Expand Up @@ -106,23 +115,23 @@ def on_epoch_end(self):

trainer.fit(model)
expected = [
'on_epoch_start', # validation
'on_epoch_end',
'on_epoch_start', # training
'on_train_epoch_start',
'on_train_batch_start',
'training_step',
'on_before_zero_grad',
'optimizer_zero_grad',
'backward',
'on_after_backward',
'optimizer_step',
'on_train_batch_end',
'training_epoch_end',
'on_train_epoch_end',
'on_epoch_end',
'on_epoch_start', # validation
'on_epoch_end'
"on_epoch_start", # validation
"on_epoch_end",
"on_epoch_start", # training
"on_train_epoch_start",
"on_train_batch_start",
"training_step",
"on_before_zero_grad",
"optimizer_zero_grad",
"backward",
"on_after_backward",
"optimizer_step",
"on_train_batch_end",
"training_epoch_end",
"on_train_epoch_end",
"on_epoch_end",
"on_epoch_start", # validation
"on_epoch_end",
]
assert model.called == expected

Expand All @@ -132,15 +141,16 @@ def test_outputs_format(tmpdir):

class HookedModel(BoringModel):
def training_step(self, batch, batch_idx):
self.log("foo", "bar")
return super().training_step(batch, batch_idx)
output = super().training_step(batch, batch_idx)
self.log("foo", 123)
output["foo"] = 123
return output

@staticmethod
def _check_output(output):
assert "loss" in output

assert "foo" in output
assert output["foo"] == "bar"
assert output["foo"] == 123

def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx):
HookedModel._check_output(outputs)
Expand Down