Issues with checkpointing

gianscarpe · gianscarpe · commit 0cd7796978d4 · 2020-11-23T17:21:54.000+01:00
diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py
@@ -106,31 +106,6 @@ def training_step(self, batch, batch_idx):
         loss = self.loss(batch, output)
         return {"loss": loss}
 
-    def training_step_end(self, training_step_outputs):
-        return training_step_outputs
-
-    def training_epoch_end(self, outputs) -> None:
-        train_loss = torch.stack([x["loss"] for x in outputs]).mean()
-        self.log('train_loss', train_loss)
-
-    def validation_step(self, batch, batch_idx):
-        output = self.layer(batch)
-        loss = self.loss(batch, output)
-        return {"x": loss}
-
-    def validation_epoch_end(self, outputs) -> None:
-        val_loss = torch.stack([x["x"] for x in outputs]).mean()
-        self.log('val_loss', val_loss)
-
-    def test_step(self, batch, batch_idx):
-        output = self.layer(batch)
-        loss = self.loss(batch, output)
-        return {"y": loss}
-
-    def test_epoch_end(self, outputs) -> None:
-        test_loss = torch.stack([x["y"] for x in outputs]).mean()
-        self.log('test_loss', test_loss)
-
     def configure_optimizers(self):
         optimizer = getattr(torch.optim, self.optimizer_name)(self.layer.parameters(), lr=self.learning_rate)
         lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py
@@ -61,6 +61,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None,
     logger = get_default_logger(save_dir, version=version)
     trainer_options.update(logger=logger)
 
+    # TODO: `checkpoint_callback` is a DEPRECATED trainer option
     if "checkpoint_callback" not in trainer_options:
         trainer_options.update(checkpoint_callback=True)
 
@@ -71,7 +72,8 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None,
 
     assert result == 1, "trainer failed"
     # Check that the model is actually changed post-training
-    assert torch.norm(initial_values - post_train_values) > 0.1
+    change_ratio = torch.norm(initial_values - post_train_values)
+    assert change_ratio > 0.1, f"the model is changed of {change_ratio}"
 
     # test model loading
     pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model))
diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
@@ -115,20 +115,34 @@ def test_all_features_cpu_model(tmpdir):
 
 
 def test_early_stopping_cpu_model(tmpdir):
-    """Test each of the trainer options."""
+    """Test each of the trainer options. This only tests the trainer and model
+    in combination; callback functionality tests are in /tests/callbacks."""
+    class ModelTrainVal(BoringModel):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            return {"x": loss}
+
+        def validation_epoch_end(self, outputs) -> None:
+            val_loss = torch.stack([x["x"] for x in outputs]).mean()
+            self.log('val_loss', val_loss)
+
     stopping = EarlyStopping(monitor="val_loss", min_delta=0.1)
     trainer_options = dict(
         default_root_dir=tmpdir,
         callbacks=[stopping],
         max_epochs=2,
-        gradient_clip_val=1.0,
-        overfit_batches=0.20,
+        gradient_clip_val=1,
         track_grad_norm=2,
-        limit_train_batches=0.1,
+        limit_train_batches=0.2,
         limit_val_batches=0.1,
     )
 
-    model = BoringModel()
+    model = ModelTrainVal()
+
     tpipes.run_model_test(trainer_options, model, on_gpu=False)
 
     # test freeze on cpu
@@ -199,7 +213,29 @@ def test_default_logger_callbacks_cpu_model(tmpdir):
 
 def test_running_test_after_fitting(tmpdir):
     """Verify test() on fitted model."""
-    model = BoringModel()
+    class ModelTrainValTest(BoringModel):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            return {"x": loss}
+
+        def validation_epoch_end(self, outputs) -> None:
+            val_loss = torch.stack([x["x"] for x in outputs]).mean()
+            self.log('val_loss', val_loss)
+
+        def test_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            return {"y": loss}
+
+        def test_epoch_end(self, outputs) -> None:
+            test_loss = torch.stack([x["y"] for x in outputs]).mean()
+            self.log('test_loss', test_loss)
+
+    model = ModelTrainValTest()
 
     # logger file to get meta
     logger = tutils.get_default_logger(tmpdir)
@@ -230,7 +266,20 @@ def test_running_test_after_fitting(tmpdir):
 
 def test_running_test_no_val(tmpdir):
     """Verify `test()` works on a model with no `val_loader`."""
-    model = BoringModel()
+    class ModelTrainTest(BoringModel):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+
+        def test_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            return {"y": loss}
+
+        def test_epoch_end(self, outputs) -> None:
+            test_loss = torch.stack([x["y"] for x in outputs]).mean()
+            self.log('test_loss', test_loss)
+
+    model = ModelTrainTest()
 
     # logger file to get meta
     logger = tutils.get_default_logger(tmpdir)
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
@@ -46,7 +46,7 @@ def test_multi_gpu_none_backend(tmpdir):
     )
 
     model = BoringModel()
-    tpipes.run_model_test(trainer_options, model)
+    tpipes.run_model_test(trainer_options, model, min_acc=0.20)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")

Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@ def test_multi_gpu_none_backend(tmpdir):`
`46`	`46`	`)`
`47`	`47`
`48`	`48`	`model = BoringModel()`
`49`		`- tpipes.run_model_test(trainer_options, model)`
	`49`	`+ tpipes.run_model_test(trainer_options, model, min_acc=0.20)`
`50`	`50`
`51`	`51`
`52`	`52`	`@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")`