3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -228,6 +228,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
   [#8627](https://github.com/PyTorchLightning/pytorch-lightning/pull/8627))


+- Fixed `EarlyStopping` running on train epoch end when `check_val_every_n_epoch>1` is set ([#9156](https://github.com/PyTorchLightning/pytorch-lightning/pull/9156))
+
+
 - Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8333](https://github.com/PyTorchLightning/pytorch-lightning/issues/8333))

8 changes: 4 additions & 4 deletions pytorch_lightning/callbacks/early_stopping.py
@@ -131,11 +131,11 @@ def __init__(
     def state_key(self) -> str:
         return self._generate_state_key(monitor=self.monitor, mode=self.mode)

-    def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+    def on_init_end(self, trainer: "pl.Trainer") -> None:
         if self._check_on_train_epoch_end is None:
-            # if the user runs validation multiple times per training epoch, we try to check after
-            # validation instead of on train epoch end
-            self._check_on_train_epoch_end = trainer.val_check_interval == 1.0
+            # if the user runs validation multiple times per training epoch or multiple training epochs without
+            # validation, then we run after validation instead of on train epoch end
+            self._check_on_train_epoch_end = trainer.val_check_interval == 1.0 and trainer.check_val_every_n_epoch == 1

     def _validate_condition_metric(self, logs):
         monitor_val = logs.get(self.monitor)
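The key change is the extra `trainer.check_val_every_n_epoch == 1` condition: the callback keeps checking on train epoch end only when validation runs exactly once per training epoch. A minimal standalone sketch of that decision (not part of the diff; the helper name is made up for illustration):

# Illustrative only: mirrors the condition above with plain arguments.
def check_on_train_epoch_end(val_check_interval: float, check_val_every_n_epoch: int) -> bool:
    # True only when validation runs exactly once, at the end of every training epoch
    return val_check_interval == 1.0 and check_val_every_n_epoch == 1

assert check_on_train_epoch_end(1.0, 1)          # default setup: check on train epoch end
assert not check_on_train_epoch_end(0.3, 1)      # several val runs per epoch: check after validation
assert not check_on_train_epoch_end(1.0, 2)      # val only every other epoch: check after validation (this fix)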
6 changes: 3 additions & 3 deletions pytorch_lightning/callbacks/model_checkpoint.py
@@ -267,9 +267,9 @@ def state_key(self) -> str:

     def on_init_end(self, trainer: "pl.Trainer") -> None:
         if self._save_on_train_epoch_end is None:
-            # if the user runs validation multiple times per training epoch, we try to save checkpoint after
-            # validation instead of on train epoch end
-            self._save_on_train_epoch_end = trainer.val_check_interval == 1.0
+            # if the user runs validation multiple times per training epoch or multiple training epochs without
+            # validation, then we run after validation instead of on train epoch end
+            self._save_on_train_epoch_end = trainer.val_check_interval == 1.0 and trainer.check_val_every_n_epoch == 1

     def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """When pretrain routine starts we build the ckpt dir on the fly."""
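The same condition now also gates `ModelCheckpoint`. A hedged usage sketch (not from the PR, assuming the `pytorch_lightning` API of this release line) showing a configuration where both callbacks defer their checks to the end of validation:

# Illustrative only: with validation every other epoch, EarlyStopping and
# ModelCheckpoint now run after the validation loop rather than on every
# train epoch end.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

trainer = Trainer(
    max_epochs=10,
    check_val_every_n_epoch=2,
    callbacks=[EarlyStopping(monitor="val_loss"), ModelCheckpoint(monitor="val_loss")],
)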
20 changes: 14 additions & 6 deletions tests/callbacks/test_early_stopping.py
@@ -424,22 +424,27 @@ def test_multiple_early_stopping_callbacks(
     trainer.fit(model)


-def test_check_on_train_epoch_end_with_val_check_interval(tmpdir):
+@pytest.mark.parametrize(
+    "case",
+    {
+        "val_check_interval": {"val_check_interval": 0.3, "limit_train_batches": 10, "max_epochs": 10},
+        "check_val_every_n_epoch": {"check_val_every_n_epoch": 2, "max_epochs": 5},
+    }.items(),
+)
+def test_check_on_train_epoch_end_smart_handling(tmpdir, case):
     class TestModel(BoringModel):
         def validation_step(self, batch, batch_idx):
             self.log("foo", 1)
             return super().validation_step(batch, batch_idx)

+    case, kwargs = case
     model = TestModel()
-    val_check_interval, limit_train_batches = 0.3, 10
     trainer = Trainer(
         default_root_dir=tmpdir,
-        val_check_interval=val_check_interval,
-        max_epochs=1,
-        limit_train_batches=limit_train_batches,
         limit_val_batches=1,
         callbacks=EarlyStopping(monitor="foo"),
         progress_bar_refresh_rate=0,
+        **kwargs,
     )

     side_effect = [(False, "A"), (True, "B")]
@@ -449,4 +454,7 @@ def validation_step(self, batch, batch_idx):
         trainer.fit(model)

     assert es_mock.call_count == len(side_effect)
-    assert trainer.global_step == len(side_effect) * int(limit_train_batches * val_check_interval)
+    if case == "val_check_interval":
+        assert trainer.global_step == len(side_effect) * int(trainer.limit_train_batches * trainer.val_check_interval)
+    else:
+        assert trainer.current_epoch == len(side_effect) * trainer.check_val_every_n_epoch - 1
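For reference, the expected numbers in the two parametrized cases work out as follows (a sketch, not part of the test):

# val_check_interval=0.3 with limit_train_batches=10: validation every int(10 * 0.3) = 3 batches,
# so two early-stopping checks correspond to 2 * 3 = 6 global steps.
assert 2 * int(10 * 0.3) == 6

# check_val_every_n_epoch=2: validation runs after zero-indexed epochs 1 and 3,
# so two checks leave the trainer at current_epoch == 2 * 2 - 1 == 3.
assert 2 * 2 - 1 == 3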
18 changes: 18 additions & 0 deletions tests/checkpointing/test_model_checkpoint.py
@@ -1248,3 +1248,21 @@ def test_trainer_checkpoint_callback_bool(tmpdir):
     mc = ModelCheckpoint(dirpath=tmpdir)
     with pytest.raises(MisconfigurationException, match="Invalid type provided for checkpoint_callback"):
         Trainer(checkpoint_callback=mc)
+
+
+def test_check_val_every_n_epochs_top_k_integration(tmpdir):
+    model = BoringModel()
+    mc = ModelCheckpoint(dirpath=tmpdir, monitor="epoch", save_top_k=-1, filename="{epoch}")
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        num_sanity_val_steps=0,
+        max_epochs=5,
+        check_val_every_n_epoch=2,
+        callbacks=mc,
+        weights_summary=None,
+        logger=False,
+    )
+    trainer.fit(model)
+    assert set(os.listdir(tmpdir)) == {"epoch=1.ckpt", "epoch=3.ckpt"}
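The expected filenames follow from the schedule: with `max_epochs=5` and `check_val_every_n_epoch=2`, validation, and therefore checkpointing on the `epoch` monitor, runs only after zero-indexed epochs 1 and 3. A small sketch (not part of the test) reproducing that set:

# Illustrative only: derive the expected checkpoint names from the validation schedule.
expected = {f"epoch={e}.ckpt" for e in range(5) if (e + 1) % 2 == 0}
assert expected == {"epoch=1.ckpt", "epoch=3.ckpt"}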