Skip to content

Commit f357417

Browse files
authored
Change trainer.should_stop to not stop in between an epoch and run until min_steps/min_epochs only (#13890)
1 parent 0e30e4a commit f357417

File tree

9 files changed

+176
-27
lines changed

9 files changed

+176
-27
lines changed

docs/source-pytorch/common/trainer.rst

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,3 +1745,63 @@ execution within that function, and the status of the Trainer.
17451745
trainer.state.status
17461746
# stage in ("train", "sanity_check", "validate", "test", "predict", "tune")
17471747
trainer.state.stage
1748+
1749+
should_stop
1750+
***********
1751+
1752+
If you want to terminate the training during ``.fit``, you can set ``trainer.should_stop=True`` to terminate the training
1753+
as soon as possible. Note that it will respect the arguments ``min_steps`` and ``min_epochs`` to check whether to stop. If these
1754+
arguments are set and the ``current_epoch`` or ``global_step`` don't meet these minimum conditions, training will continue until
1755+
both conditions are met. If any of these arguments is not set, it won't be considered for the final decision.
1756+
1757+
1758+
.. code-block:: python
1759+
1760+
# setting `trainer.should_stop` at any point of training will terminate it
1761+
class LitModel(LightningModule):
1762+
def training_step(self, *args, **kwargs):
1763+
self.trainer.should_stop = True
1764+
1765+
1766+
trainer = Trainer()
1767+
model = LitModel()
1768+
trainer.fit(model)
1769+
1770+
.. code-block:: python
1771+
1772+
# setting `trainer.should_stop` will stop training only after at least 5 epochs have run
1773+
class LitModel(LightningModule):
1774+
def training_step(self, *args, **kwargs):
1775+
if self.current_epoch == 2:
1776+
self.trainer.should_stop = True
1777+
1778+
1779+
trainer = Trainer(min_epochs=5, max_epochs=100)
1780+
model = LitModel()
1781+
trainer.fit(model)
1782+
1783+
.. code-block:: python
1784+
1785+
# setting `trainer.should_stop` will stop training only after at least 5 steps have run
1786+
class LitModel(LightningModule):
1787+
def training_step(self, *args, **kwargs):
1788+
if self.global_step == 2:
1789+
self.trainer.should_stop = True
1790+
1791+
1792+
trainer = Trainer(min_steps=5, max_epochs=100)
1793+
model = LitModel()
1794+
trainer.fit(model)
1795+
1796+
.. code-block:: python
1797+
1798+
# setting `trainer.should_stop` will stop the training, but only once both min_steps and min_epochs are satisfied
1799+
class LitModel(LightningModule):
1800+
def training_step(self, *args, **kwargs):
1801+
if self.global_step == 7:
1802+
self.trainer.should_stop = True
1803+
1804+
1805+
trainer = Trainer(min_steps=5, min_epochs=5, max_epochs=100)
1806+
model = LitModel()
1807+
trainer.fit(model)

src/pytorch_lightning/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
4545
- Included `torch.cuda` rng state to the aggregate `_collect_rng_states()` and `_set_rng_states()` ([#14384](https://github.com/Lightning-AI/lightning/pull/14384))
4646

4747

48+
- Changed `trainer.should_stop` to not stop in between an epoch and run until `min_steps/min_epochs` only ([#13890](https://github.com/Lightning-AI/lightning/pull/13890))
49+
50+
4851
- When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([14325](https://github.com/Lightning-AI/lightning/pull/14325))
4952

5053

src/pytorch_lightning/loops/epoch/training_epoch_loop.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,21 @@ def _is_validation_done(self) -> bool:
102102
@property
103103
def done(self) -> bool:
104104
"""Evaluates when to leave the loop."""
105-
return (self._is_training_done and self._is_validation_done) or self.trainer.should_stop
105+
if self._is_training_done and self._is_validation_done:
106+
return True
107+
108+
if self.trainer.should_stop:
109+
# early stopping
110+
min_epochs = self.trainer.fit_loop.min_epochs
111+
should_stop_early = self.trainer.fit_loop._should_stop_early
112+
if not should_stop_early:
113+
self._warning_cache.info(
114+
f"Trainer was signaled to stop but the required `min_epochs={min_epochs!r}` or"
115+
f" `min_steps={self.min_steps!r}` has not been met. Training will continue..."
116+
)
117+
return should_stop_early
118+
119+
return False
106120

107121
def connect( # type: ignore[override]
108122
self,

src/pytorch_lightning/loops/fit_loop.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,12 @@ def _results(self) -> _ResultCollection:
146146
return self.epoch_loop.val_loop._results
147147
raise RuntimeError("`FitLoop._results` property isn't defined. Accessed outside of scope")
148148

149+
@property
150+
def _should_stop_early(self) -> bool:
151+
met_min_epochs = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True
152+
met_min_steps = self.epoch_loop.global_step >= self.min_steps if self.min_steps else True
153+
return met_min_epochs and met_min_steps
154+
149155
@property
150156
def done(self) -> bool:
151157
"""Evaluates when to leave the loop."""
@@ -169,20 +175,10 @@ def done(self) -> bool:
169175
rank_zero_info(f"`Trainer.fit` stopped: `max_epochs={self.max_epochs!r}` reached.")
170176
return True
171177

172-
if self.trainer.should_stop:
173-
# early stopping
174-
met_min_epochs = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True
175-
met_min_steps = self.epoch_loop.global_step >= self.min_steps if self.min_steps else True
176-
if met_min_epochs and met_min_steps:
177-
self.trainer.should_stop = True
178-
rank_zero_debug("`Trainer.fit` stopped: `trainer.should_stop` was set.")
179-
return True
180-
else:
181-
rank_zero_info(
182-
f"Trainer was signaled to stop but the required `min_epochs={self.min_epochs!r}` or"
183-
f" `min_steps={self.min_steps!r}` has not been met. Training will continue..."
184-
)
185-
self.trainer.should_stop = False
178+
if self.trainer.should_stop and self._should_stop_early:
179+
rank_zero_debug("`Trainer.fit` stopped: `trainer.should_stop` was set.")
180+
return True
181+
186182
return False
187183

188184
@property

src/pytorch_lightning/utilities/warnings.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from pytorch_lightning.utilities.rank_zero import LightningDeprecationWarning as NewLightningDeprecationWarning
2020
from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation as new_rank_zero_deprecation
21+
from pytorch_lightning.utilities.rank_zero import rank_zero_info as new_rank_zero_info
2122
from pytorch_lightning.utilities.rank_zero import rank_zero_warn as new_rank_zero_warn
2223

2324
# enable our warnings
@@ -39,6 +40,11 @@ def deprecation(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None:
3940
self.add(message)
4041
new_rank_zero_deprecation(message, stacklevel=stacklevel, **kwargs)
4142

43+
def info(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None:
44+
if message not in self:
45+
self.add(message)
46+
new_rank_zero_info(message, stacklevel=stacklevel, **kwargs)
47+
4248

4349
def rank_zero_warn(*args: Any, **kwargs: Any) -> Any:
4450
new_rank_zero_deprecation(

tests/tests_pytorch/callbacks/test_early_stopping.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -265,25 +265,28 @@ def validation_epoch_end(self, outputs):
265265
assert early_stopping.stopped_epoch == expected_stop_epoch
266266

267267

268-
@pytest.mark.parametrize("limit_train_batches", (3, 5))
269268
@pytest.mark.parametrize(
270-
["min_epochs", "min_steps"],
269+
"limit_train_batches,min_epochs,min_steps,stop_step",
271270
[
272271
# IF `min_steps` was set to a higher value than the `trainer.global_step` when `early_stopping` is being
273272
# triggered, THEN the trainer should continue until reaching `trainer.global_step == min_steps` and stop
274-
(0, 10),
273+
(3, 0, 10, 10),
274+
(5, 0, 10, 10),
275275
# IF `min_epochs` resulted in a higher number of steps than the `trainer.global_step` when `early_stopping` is
276276
# being triggered, THEN the trainer should continue until reaching
277277
# `trainer.global_step` == `min_epochs * len(train_dataloader)`
278-
(2, 0),
278+
(3, 2, 0, 6),
279+
(5, 2, 0, 10),
279280
# IF both `min_epochs` and `min_steps` are provided and higher than the `trainer.global_step` when
280281
# `early_stopping` is being triggered, THEN the highest between `min_epochs * len(train_dataloader)` and
281282
# `min_steps` would be reached
282-
(1, 10),
283-
(3, 10),
283+
(3, 1, 10, 10),
284+
(5, 1, 10, 10),
285+
(3, 3, 10, 10),
286+
(5, 3, 10, 15),
284287
],
285288
)
286-
def test_min_epochs_min_steps_global_step(tmpdir, limit_train_batches, min_epochs, min_steps):
289+
def test_min_epochs_min_steps_global_step(tmpdir, limit_train_batches, min_epochs, min_steps, stop_step):
287290
if min_steps:
288291
assert limit_train_batches < min_steps
289292

@@ -317,8 +320,7 @@ def training_step(self, batch, batch_idx):
317320
# epochs continue until min steps are reached
318321
assert trainer.current_epoch == expected_epochs
319322
# steps continue until min steps are reached AND the epoch is exhausted
320-
# stopping mid-epoch is not supported
321-
assert trainer.global_step == limit_train_batches * expected_epochs
323+
assert trainer.global_step == stop_step
322324

323325

324326
def test_early_stopping_mode_options():

tests/tests_pytorch/loops/epoch/test_training_epoch_loop.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import logging
1415
from unittest.mock import patch
1516

1617
import pytest
@@ -184,3 +185,36 @@ def test_no_val_on_train_epoch_loop_restart(tmpdir):
184185
) as advance_mocked:
185186
trainer.fit(model, ckpt_path=ckpt_path)
186187
assert advance_mocked.call_count == 1
188+
189+
190+
@pytest.mark.parametrize(
191+
"min_epochs, min_steps, current_epoch, global_step, early_stop, epoch_loop_done, raise_info_msg",
192+
[
193+
(None, None, 1, 4, True, True, False),
194+
(None, None, 1, 10, True, True, False),
195+
(4, None, 1, 4, False, False, True),
196+
(4, 2, 1, 4, False, False, True),
197+
(4, None, 1, 10, False, True, False),
198+
(4, 3, 1, 3, False, False, True),
199+
(4, 10, 1, 10, False, True, False),
200+
(None, 4, 1, 4, True, True, False),
201+
],
202+
)
203+
def test_should_stop_early_stopping_conditions_not_met(
204+
caplog, min_epochs, min_steps, current_epoch, global_step, early_stop, epoch_loop_done, raise_info_msg
205+
):
206+
"""Test that checks that info message is logged when users sets `should_stop` but min conditions are not
207+
met."""
208+
trainer = Trainer(min_epochs=min_epochs, min_steps=min_steps, limit_val_batches=0)
209+
trainer.num_training_batches = 10
210+
trainer.should_stop = True
211+
trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.optimizer.step.total.completed = global_step
212+
trainer.fit_loop.epoch_loop.batch_progress.current.ready = global_step
213+
trainer.fit_loop.epoch_progress.current.completed = current_epoch - 1
214+
215+
message = f"min_epochs={min_epochs}` or `min_steps={min_steps}` has not been met. Training will continue"
216+
with caplog.at_level(logging.INFO, logger="pytorch_lightning.loops"):
217+
assert trainer.fit_loop.epoch_loop.done is epoch_loop_done
218+
219+
assert (message in caplog.text) is raise_info_msg
220+
assert trainer.fit_loop._should_stop_early is early_stop

tests/tests_pytorch/loops/test_training_loop.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,6 @@ def test_fit_loop_done_log_messages(caplog):
180180

181181
fit_loop.epoch_loop.min_steps = 100
182182
assert not fit_loop.done
183-
assert "was signaled to stop but" in caplog.text
184183

185184

186185
def test_warning_valid_train_step_end(tmpdir):
@@ -198,3 +197,35 @@ def training_step_end(self, outputs):
198197
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1)
199198

200199
trainer.fit(model)
200+
201+
202+
@pytest.mark.parametrize(
203+
"min_epochs, min_steps, current_epoch, early_stop, fit_loop_done, raise_debug_msg",
204+
[
205+
(4, None, 100, True, True, False),
206+
(4, None, 3, False, False, False),
207+
(4, 10, 3, False, False, False),
208+
(None, 10, 4, True, True, True),
209+
(4, None, 4, True, True, True),
210+
(4, 10, 4, True, True, True),
211+
],
212+
)
213+
def test_should_stop_early_stopping_conditions_met(
214+
caplog, min_epochs, min_steps, current_epoch, early_stop, fit_loop_done, raise_debug_msg
215+
):
216+
"""Test that checks that debug message is logged when users sets `should_stop` and min conditions are met."""
217+
trainer = Trainer(min_epochs=min_epochs, min_steps=min_steps, limit_val_batches=0, max_epochs=100)
218+
trainer.num_training_batches = 10
219+
trainer.should_stop = True
220+
trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.optimizer.step.total.completed = (
221+
current_epoch * trainer.num_training_batches
222+
)
223+
trainer.fit_loop.epoch_loop.batch_progress.current.ready = 10
224+
trainer.fit_loop.epoch_progress.current.processed = current_epoch
225+
226+
message = "`Trainer.fit` stopped: `trainer.should_stop` was set."
227+
with caplog.at_level(level=logging.DEBUG, logger="pytorch_lightning.utilities.rank_zero"):
228+
assert trainer.fit_loop.done is fit_loop_done
229+
230+
assert (message in caplog.text) is raise_debug_msg
231+
assert trainer.fit_loop._should_stop_early is early_stop

tests/tests_pytorch/trainer/test_trainer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,7 +622,10 @@ def training_step(self, batch, batch_idx):
622622
output["loss"] = output["loss"] * 0.0 # force minimal loss to trigger early stopping
623623
self.log("loss", output["loss"])
624624
self.training_step_invoked += 1
625-
assert not self.trainer.should_stop
625+
if self.current_epoch < 2:
626+
assert not self.trainer.should_stop
627+
else:
628+
assert self.trainer.should_stop
626629
return output
627630

628631
model = TestModel()
@@ -641,7 +644,7 @@ def training_step(self, batch, batch_idx):
641644

642645
message = f"min_epochs={min_epochs}` or `min_steps=None` has not been met. Training will continue"
643646
num_messages = sum(1 for record in caplog.records if message in record.message)
644-
assert num_messages == min_epochs - 2
647+
assert num_messages == 1
645648
assert model.training_step_invoked == min_epochs * 2
646649

647650

0 commit comments

Comments
 (0)