From bb37edcfe7b222dd6eb3aac05dfa0d95b7f3a931 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 23 Nov 2020 14:53:37 +0100 Subject: [PATCH 01/47] test_cpu refactoring - BoringModel and checkpoints; test_gpu refactoring - BoringModelboring_model refactoring - validation, testing; Fix - run_prediction as dispatcher for testing BoringModel --- tests/base/boring_model.py | 50 +++++-- tests/base/develop_pipelines.py | 55 +++++--- .../data/horovod/train_default_model.py | 5 +- tests/models/test_cpu.py | 122 +++++++----------- tests/models/test_gpu.py | 9 +- tests/models/test_horovod.py | 1 + tests/trainer/test_dataloaders.py | 4 +- 7 files changed, 135 insertions(+), 111 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 6ceffe8562372..98874377dc211 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import torch -from pytorch_lightning import LightningModule from torch.utils.data import Dataset +from pytorch_lightning import LightningModule + class RandomDictDataset(Dataset): def __init__(self, size, length): @@ -24,7 +25,7 @@ def __init__(self, size, length): def __getitem__(self, index): a = self.data[index] b = a + 2 - return {'a': a, 'b': b} + return {"a": a, "b": b} def __len__(self): return self.len @@ -55,8 +56,14 @@ def __len__(self): class BoringModel(LightningModule): - - def __init__(self): + def __init__( + self, + batch_size: int = 1, + in_features: int = 32, + learning_rate: float = 0.1, + optimizer_name: str = "SGD", + out_features: int = 2, + ): """ Testing PL Module @@ -75,7 +82,12 @@ def training_step(...): """ super().__init__() - self.layer = torch.nn.Linear(32, 2) + self.layer = torch.nn.Linear(in_features, out_features) + self.batch_size = batch_size + self.in_features = in_features + self.learning_rate = learning_rate + self.optimizer_name = optimizer_name + self.out_features = out_features def forward(self, x): return self.layer(x) @@ -98,7 +110,8 @@ def training_step_end(self, training_step_outputs): return training_step_outputs def training_epoch_end(self, outputs) -> None: - torch.stack([x["loss"] for x in outputs]).mean() + train_loss = torch.stack([x["loss"] for x in outputs]).mean() + self.log('train_loss', train_loss) def validation_step(self, batch, batch_idx): output = self.layer(batch) @@ -106,7 +119,8 @@ def validation_step(self, batch, batch_idx): return {"x": loss} def validation_epoch_end(self, outputs) -> None: - torch.stack([x['x'] for x in outputs]).mean() + val_loss = torch.stack([x["x"] for x in outputs]).mean() + self.log('val_loss', val_loss) def test_step(self, batch, batch_idx): output = self.layer(batch) @@ -114,18 +128,30 @@ def test_step(self, batch, batch_idx): return {"y": loss} def test_epoch_end(self, outputs) -> None: - torch.stack([x["y"] for x in outputs]).mean() + test_loss = torch.stack([x["y"] for x in outputs]).mean() + self.log('test_loss', test_loss) def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer = getattr(torch.optim, self.optimizer_name)(self.layer.parameters(), lr=self.learning_rate) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) return [optimizer], [lr_scheduler] def train_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) + return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size) def val_dataloader(self): - 
return torch.utils.data.DataLoader(RandomDataset(32, 64))
+        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
 
     def test_dataloader(self):
-        return torch.utils.data.DataLoader(RandomDataset(32, 64))
+        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
+
+    def get_default_hparams(continue_training: bool = False, hpc_exp_number: int = 0) -> dict:
+        args = dict(
+            batch_size=1,
+            in_features=32,
+            learning_rate=0.1,
+            optimizer_name="SGD",
+            out_features=2,
+        )
+
+        return args
diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py
index 24535dc67da8e..6002369ec4010 100644
--- a/tests/base/develop_pipelines.py
+++ b/tests/base/develop_pipelines.py
@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from functools import singledispatch
+
 import torch
+from torch.utils.data import DataLoader
 
-from pytorch_lightning import Trainer
-from tests.base.develop_utils import load_model_from_checkpoint, get_default_logger, \
-    reset_seed
+from pytorch_lightning import LightningModule, Trainer
+from tests.base import BoringModel, EvalModelTemplate
+from tests.base.develop_utils import get_default_logger, load_model_from_checkpoint, reset_seed
 
 
 def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50):
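The singledispatch import above is the mechanism behind the run_prediction change in this patch: functools.singledispatch routes a call on the runtime type of the first positional argument, which is why the call order is flipped to run_prediction(model, dataloader) throughout. A minimal self-contained sketch of that pattern (the names below are illustrative, not part of the PR):

    from functools import singledispatch

    @singledispatch
    def run_check(model, data):
        # generic fallback, used for any unregistered model type
        return "generic"

    class SpecialModel:
        pass

    @run_check.register(SpecialModel)
    def _(model, data):
        # selected whenever the first argument is a SpecialModel instance
        return "special"

    assert run_check(object(), None) == "generic"
    assert run_check(SpecialModel(), None) == "special"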
@@ -26,11 +29,12 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50
     result = trainer.fit(model)
 
     # correct result and ok accuracy
-    assert result == 1, 'amp + ddp model failed to complete'
+    assert result == 1, "amp + ddp model failed to complete"
 
     pretrained_model = load_model_from_checkpoint(
         trainer.logger,
         trainer.checkpoint_callback.best_model_path,
+        type(model)
     )
 
     # test new model accuracy
@@ -39,7 +43,7 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50
         test_loaders = [test_loaders]
 
     for dataloader in test_loaders:
-        run_prediction(dataloader, pretrained_model, min_acc=min_acc)
+        run_prediction(pretrained_model, dataloader, min_acc=min_acc)
 
     if trainer.use_ddp:
         # on hpc this would work fine... but need to hack it for the purpose of the test
         trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers()
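run_model_test below asserts that training actually moved the weights by comparing a coarse per-parameter summary before and after fit(). The same check in isolation, assuming nothing beyond plain torch (layer sizes and learning rate are arbitrary):

    import torch

    layer = torch.nn.Linear(4, 2)
    initial_values = torch.tensor([p.detach().abs().sum().item() for p in layer.parameters()])

    # one optimization step stands in for trainer.fit(model)
    optimizer = torch.optim.SGD(layer.parameters(), lr=0.5)
    loss = layer(torch.randn(8, 4)).pow(2).mean()
    loss.backward()
    optimizer.step()

    post_train_values = torch.tensor([p.detach().abs().sum().item() for p in layer.parameters()])
    assert torch.norm(initial_values - post_train_values) > 0  # the model changed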
-def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True):
+def run_model_test(trainer_options, model, on_gpu: bool = True, version=None,
+                   with_hpc: bool = True, min_acc: float = 0.25):
     reset_seed()
 
-    save_dir = trainer_options['default_root_dir']
+    save_dir = trainer_options["default_root_dir"]
 
     # logger file to get meta
     logger = get_default_logger(save_dir, version=version)
     trainer_options.update(logger=logger)
 
-    if 'checkpoint_callback' not in trainer_options:
+    if "checkpoint_callback" not in trainer_options:
         trainer_options.update(checkpoint_callback=True)
 
     trainer = Trainer(**trainer_options)
     result = trainer.fit(model)
     post_train_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
 
-    assert result == 1, 'trainer failed'
+    assert result == 1, "trainer failed"
     # Check that the model is actually changed post-training
     assert torch.norm(initial_values - post_train_values) > 0.1
 
     # test model loading
-    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path)
+    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model))
 
     # test new model accuracy
     test_loaders = model.test_dataloader()
     if not isinstance(test_loaders, list):
         test_loaders = [test_loaders]
 
     for dataloader in test_loaders:
-        run_prediction(dataloader, pretrained_model)
+        run_prediction(pretrained_model, dataloader, min_acc=min_acc)
 
     if with_hpc:
         if trainer.use_ddp or trainer.use_ddp2:
             # on hpc this would work fine... but need to hack it for the purpose of the test
             trainer.model = pretrained_model
-            trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \
-                trainer.init_optimizers(pretrained_model)
+            trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers(
+                pretrained_model
+            )
 
         # test HPC saving
         trainer.checkpoint_connector.hpc_save(save_dir, logger)
@@ -93,7 +99,8 @@
     trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu)
 
 
-def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50):
+@singledispatch
+def run_prediction(trained_model, dataloader, dp=False, min_acc=0.50):
     # run prediction on 1 batch
     batch = next(iter(dataloader))
     x, y = batch
@@ -102,7 +109,7 @@ def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50):
     if dp:
         with torch.no_grad():
             output = trained_model(batch, 0)
-            acc = output['val_acc']
+            acc = output["val_acc"]
 
             acc = torch.mean(acc).item()
 
     else:
@@ -119,3 +126,21 @@ def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50):
         acc = acc.item()
 
     assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})"
+
+
+@run_prediction.register(BoringModel)
+def _(trained_model, dataloader, dp=False, min_acc=0.25):
+    # run prediction on 1 batch
+    batch = next(iter(dataloader))
+
+    if dp:
+        with torch.no_grad():
+            output = trained_model(batch)
+            acc = trained_model.loss(batch, output)
+
+    else:
+        with torch.no_grad():
+            output = trained_model(batch)
+            output = output.cpu()
+            acc = trained_model.loss(batch, output)
+    assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})"
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index 62f874902b094..d757d1d03f4f7 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -35,8 +35,7 @@
 from tests.base import EvalModelTemplate  # noqa: E402
 from tests.base.develop_pipelines import run_prediction  # noqa: E402
-from tests.base.develop_utils import set_random_master_port, reset_seed  # noqa: E402
-
+from tests.base.develop_utils import reset_seed, set_random_master_port  # noqa: E402
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--trainer-options', required=True)
@@ -72,7 +71,7 @@ def run_test_from_config(trainer_options):
         test_loaders = [test_loaders]
 
     for dataloader in test_loaders:
-        run_prediction(dataloader, pretrained_model)
+        run_prediction(pretrained_model, dataloader)
 
     # test HPC saving
     trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index 892077ccdb1be..3cd27c9219fb6 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -21,15 +21,15 @@
 import tests.base.develop_pipelines as tpipes
 import tests.base.develop_utils as tutils
 from pytorch_lightning import Trainer
-from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
-from tests.base import EvalModelTemplate
+from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Callback
+from tests.base import BoringModel
 
 
 @pytest.mark.parametrize("enable_pl_optimizer", [False, True])
 def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir):
     """Verify model save/load/checkpoint on CPU."""
-    hparams = EvalModelTemplate.get_default_hparams()
-    model = EvalModelTemplate(**hparams)
+    hparams =
BoringModel.get_default_hparams() + model = BoringModel(**hparams) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -49,7 +49,7 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): real_global_step = trainer.global_step # traning complete - assert result == 1, 'cpu model failed to complete' + assert result == 1, "cpu model failed to complete" # predict with trained model before saving # make a prediction @@ -61,11 +61,8 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): for batch in dataloader: break - x, y = batch - x = x.view(x.size(0), -1) - model.eval() - pred_before_saving = model(x) + pred_before_saving = model(batch) # test HPC saving # simulate snapshot on slurm @@ -75,26 +72,27 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): # new logger file to get meta logger = tutils.get_default_logger(tmpdir, version=version) + model = BoringModel(**hparams) + + class _StartCallback(Callback): + def on_init_start(self, trainer): + print("Starting to init trainer!") + + # set the epoch start hook so we can predict before the model does the full training + def on_epoch_start(self, trainer, model): + assert trainer.global_step == real_global_step and trainer.global_step > 0 + # predict with loaded model to make sure answers are the same + model.eval() + new_pred = model(batch) + assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, logger=logger, - callbacks=[ModelCheckpoint(dirpath=tmpdir)], enable_pl_optimizer=enable_pl_optimizer, + callbacks=[_StartCallback(), ModelCheckpoint(dirpath=tmpdir)], ) - model = EvalModelTemplate(**hparams) - - # set the epoch start hook so we can predict before the model does the full training - def assert_pred_same(): - assert trainer.global_step == real_global_step and trainer.global_step > 0 - - # predict with loaded model to make sure answers are the same - trainer.model.eval() - new_pred = trainer.model(x) - assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 - - model.on_epoch_start = assert_pred_same - # by calling fit again, we trigger training, loading weights from the cluster # and our hook to predict using current model before any more weight updates trainer.fit(model) @@ -103,7 +101,7 @@ def assert_pred_same(): @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): """Test each of the trainer options.""" - stopping = EarlyStopping(monitor='early_stop_on', min_delta=0.1) + stopping = EarlyStopping(monitor="val_loss", min_delta=0.1) trainer_options = dict( default_root_dir=tmpdir, callbacks=[stopping], @@ -116,7 +114,7 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False) # test freeze on cpu @@ -146,26 +144,24 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False) def test_lbfgs_cpu_model(tmpdir): - """Test each of the trainer options.""" + """Test each of the trainer options. 
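In plain PyTorch the LBFGS optimizer is the odd one out: optimizer.step() must be given
    a closure that re-evaluates the loss. A minimal sketch outside Lightning, with arbitrary
    sizes (Lightning builds and passes this closure internally, which is why the test only
    has to swap the optimizer name):

        import torch

        layer = torch.nn.Linear(8, 1)
        optimizer = torch.optim.LBFGS(layer.parameters(), lr=0.004)
        x, y = torch.randn(4, 8), torch.randn(4, 1)

        def closure():
            # LBFGS may evaluate the function several times per step
            optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(layer(x), y)
            loss.backward()
            return loss

        optimizer.step(closure)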
Testing LBFGS optimizer""" trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, progress_bar_refresh_rate=0, - weights_summary='top', + weights_summary="top", limit_train_batches=0.2, limit_val_batches=0.2, ) - hparams = EvalModelTemplate.get_default_hparams() - hparams.update(optimizer_name='lbfgs', - learning_rate=0.004) - model = EvalModelTemplate(**hparams) - model.configure_optimizers = model.configure_optimizers__lbfgs + hparams = BoringModel.get_default_hparams() + hparams.update(optimizer_name="LBFGS", learning_rate=0.004) + model = BoringModel(**hparams) tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.25) @@ -181,8 +177,8 @@ def test_default_logger_callbacks_cpu_model(tmpdir): limit_val_batches=0.01, ) - model = EvalModelTemplate() - tpipes.run_model_test_without_loggers(trainer_options, model) + model = BoringModel() + tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.01) # test freeze on cpu model.freeze() @@ -191,7 +187,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir): def test_running_test_after_fitting(tmpdir): """Verify test() on fitted model.""" - model = EvalModelTemplate() + model = BoringModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -212,17 +208,17 @@ def test_running_test_after_fitting(tmpdir): ) result = trainer.fit(model) - assert result == 1, 'training failed to complete' + assert result == 1, "training failed to complete" trainer.test() # test we have good test accuracy - tutils.assert_ok_model_acc(trainer, thr=0.5) + tutils.assert_ok_model_acc(trainer, key='test_loss', thr=0.5) def test_running_test_no_val(tmpdir): """Verify `test()` works on a model with no `val_loader`.""" - model = EvalModelTemplate() + model = BoringModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -243,17 +239,17 @@ def test_running_test_no_val(tmpdir): ) result = trainer.fit(model) - assert result == 1, 'training failed to complete' + assert result == 1, "training failed to complete" trainer.test() # test we have good test accuracy - tutils.assert_ok_model_acc(trainer) + tutils.assert_ok_model_acc(trainer, key='test_loss') def test_simple_cpu(tmpdir): """Verify continue training session on CPU.""" - model = EvalModelTemplate() + model = BoringModel() # fit model trainer = Trainer( @@ -265,7 +261,7 @@ def test_simple_cpu(tmpdir): result = trainer.fit(model) # traning complete - assert result == 1, 'amp + ddp model failed to complete' + assert result == 1, "amp + ddp model failed to complete" def test_cpu_model(tmpdir): @@ -275,32 +271,12 @@ def test_cpu_model(tmpdir): progress_bar_refresh_rate=0, max_epochs=1, limit_train_batches=0.4, - limit_val_batches=0.4 - ) - - model = EvalModelTemplate() - - tpipes.run_model_test(trainer_options, model, on_gpu=False) - - -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_all_features_cpu_model(enable_pl_optimizer, tmpdir): - """Test each of the trainer options.""" - trainer_options = dict( - default_root_dir=tmpdir, - gradient_clip_val=1.0, - overfit_batches=0.20, - track_grad_norm=2, - progress_bar_refresh_rate=0, - accumulate_grad_batches=2, - max_epochs=1, - limit_train_batches=0.4, limit_val_batches=0.4, - enable_pl_optimizer=enable_pl_optimizer, ) - model = EvalModelTemplate() - tpipes.run_model_test(trainer_options, model, on_gpu=False) + model = BoringModel() + + tpipes.run_model_test(trainer_options, model, on_gpu=False, min_acc=0.01) def test_tbptt_cpu_model(tmpdir): @@ -319,7 +295,7 @@ def 
__getitem__(self, i): def __len__(self): return 1 - class BpttTestModel(EvalModelTemplate): + class BpttTestModel(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.test_hidden = None @@ -335,11 +311,10 @@ def training_step(self, batch, batch_idx, hiddens): assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed" pred = self(x_tensor.view(batch_size, truncated_bptt_steps)) - loss_val = torch.nn.functional.mse_loss( - pred, y_tensor.view(batch_size, truncated_bptt_steps)) + loss_val = torch.nn.functional.mse_loss(pred, y_tensor.view(batch_size, truncated_bptt_steps)) return { - 'loss': loss_val, - 'hiddens': self.test_hidden, + "loss": loss_val, + "hiddens": self.test_hidden, } def training_epoch_end(self, training_step_outputs): @@ -356,12 +331,11 @@ def train_dataloader(self): sampler=None, ) - hparams = EvalModelTemplate.get_default_hparams() + hparams = BoringModel.get_default_hparams() hparams.update( batch_size=batch_size, in_features=truncated_bptt_steps, - hidden_dim=truncated_bptt_steps, - out_features=truncated_bptt_steps + out_features=truncated_bptt_steps, ) model = BpttTestModel(**hparams) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index cd61da7c008bc..08c2cd15e0434 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -21,11 +21,10 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator - +from tests.base import BoringModel PRETEND_N_OF_GPUS = 16 @@ -43,7 +42,7 @@ def test_multi_gpu_none_backend(tmpdir): gpus=2, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model) @@ -60,7 +59,7 @@ def test_single_gpu_model(tmpdir, gpus): gpus=gpus ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 3a2ae8750443f..cc1ed9da9fd8f 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -63,6 +63,7 @@ def _run_horovod(trainer_options, on_gpu=False): ] if on_gpu: cmdline += ['--on-gpu'] + exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy()) assert exit_code == 0 diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 9b42aa98c9dd0..614b2a8e66ab8 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -128,7 +128,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(dataloader, trainer.model) + tpipes.run_prediction(trainer.model, dataloader) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) @@ -164,7 +164,7 @@ def test_step(self, batch, batch_idx, *args, **kwargs): # make sure predictions are good for each test set for dataloader in trainer.test_dataloaders: - tpipes.run_prediction(dataloader, trainer.model) + tpipes.run_prediction(trainer.model, dataloader) # run the test method trainer.test(ckpt_path=ckpt_path) From 186747d829a8bdb7cb13d66398092950cb5d5de7 Mon Sep 17 00:00:00 2001 From: 
gianscarpe Date: Mon, 23 Nov 2020 15:06:52 +0100 Subject: [PATCH 02/47] Removed EvalModelTemplate import from test_cpu and test_gpu --- tests/models/test_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 3cd27c9219fb6..bab2bd92b87e2 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -21,7 +21,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Callback +from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint from tests.base import BoringModel From dddd8e1061fd2779748baea820aceaf8ef398841 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 23 Nov 2020 15:11:05 +0100 Subject: [PATCH 03/47] Reverting unintended changes --- setup.py | 2 +- tests/utilities/distributed.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6b68d5524167d..c548d508ab434 100755 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ # the goal of the project is simplicity for researchers, don't want to add too much # engineer specific practices setup( - name="pytorch-lightning-nightly", + name="pytorch-lightning", version=pytorch_lightning.__version__, description=pytorch_lightning.__docs__, author=pytorch_lightning.__author__, diff --git a/tests/utilities/distributed.py b/tests/utilities/distributed.py index 80c0246ce6c57..c569f51143f25 100644 --- a/tests/utilities/distributed.py +++ b/tests/utilities/distributed.py @@ -41,4 +41,5 @@ def call_training_script(module_file, cli_args, method, tmpdir, timeout=60): except TimeoutExpired: p.kill() std, err = p.communicate() + return std, err From 43482879d61c788770543abbcef8e5c77c255f1d Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 23 Nov 2020 17:21:54 +0100 Subject: [PATCH 04/47] Issues with checkpointing --- tests/base/boring_model.py | 25 ------------- tests/base/develop_pipelines.py | 4 ++- tests/models/test_cpu.py | 63 +++++++++++++++++++++++++++++---- tests/models/test_gpu.py | 2 +- 4 files changed, 60 insertions(+), 34 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 98874377dc211..31a5d4c73de1a 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -106,31 +106,6 @@ def training_step(self, batch, batch_idx): loss = self.loss(batch, output) return {"loss": loss} - def training_step_end(self, training_step_outputs): - return training_step_outputs - - def training_epoch_end(self, outputs) -> None: - train_loss = torch.stack([x["loss"] for x in outputs]).mean() - self.log('train_loss', train_loss) - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"x": loss} - - def validation_epoch_end(self, outputs) -> None: - val_loss = torch.stack([x["x"] for x in outputs]).mean() - self.log('val_loss', val_loss) - - def test_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"y": loss} - - def test_epoch_end(self, outputs) -> None: - test_loss = torch.stack([x["y"] for x in outputs]).mean() - self.log('test_loss', test_loss) - def configure_optimizers(self): optimizer = getattr(torch.optim, self.optimizer_name)(self.layer.parameters(), lr=self.learning_rate) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) diff --git a/tests/base/develop_pipelines.py 
b/tests/base/develop_pipelines.py index 6002369ec4010..e336d6266b215 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -61,6 +61,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, logger = get_default_logger(save_dir, version=version) trainer_options.update(logger=logger) + # TODO: DEPRECATED option if "checkpoint_callback" not in trainer_options: trainer_options.update(checkpoint_callback=True) @@ -71,7 +72,8 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, assert result == 1, "trainer failed" # Check that the model is actually changed post-training - assert torch.norm(initial_values - post_train_values) > 0.1 + change_ratio = torch.norm(initial_values - post_train_values) + assert change_ratio > 0.1, f"the model is changed of {change_ratio}" # test model loading pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model)) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index bab2bd92b87e2..2e408dfbe7bd9 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -100,21 +100,35 @@ def on_epoch_start(self, trainer, model): @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): - """Test each of the trainer options.""" + """Test each of the trainer options. Simply test the combo trainer and + model; callbacks functionality tests are in /tests/callbacks""" + class ModelTrainVal(BoringModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"x": loss} + + def validation_epoch_end(self, outputs) -> None: + val_loss = torch.stack([x["x"] for x in outputs]).mean() + self.log('val_loss', val_loss) + stopping = EarlyStopping(monitor="val_loss", min_delta=0.1) trainer_options = dict( default_root_dir=tmpdir, callbacks=[stopping], max_epochs=2, - gradient_clip_val=1.0, - overfit_batches=0.20, + gradient_clip_val=1, track_grad_norm=2, - limit_train_batches=0.1, + limit_train_batches=0.2, limit_val_batches=0.1, enable_pl_optimizer=enable_pl_optimizer, ) - model = BoringModel() + model = ModelTrainVal() + tpipes.run_model_test(trainer_options, model, on_gpu=False) # test freeze on cpu @@ -187,7 +201,29 @@ def test_default_logger_callbacks_cpu_model(tmpdir): def test_running_test_after_fitting(tmpdir): """Verify test() on fitted model.""" - model = BoringModel() + class ModelTrainValTest(BoringModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"x": loss} + + def validation_epoch_end(self, outputs) -> None: + val_loss = torch.stack([x["x"] for x in outputs]).mean() + self.log('val_loss', val_loss) + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"y": loss} + + def test_epoch_end(self, outputs) -> None: + test_loss = torch.stack([x["y"] for x in outputs]).mean() + self.log('test_loss', test_loss) + + model = ModelTrainValTest() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -218,7 +254,20 @@ def test_running_test_after_fitting(tmpdir): def test_running_test_no_val(tmpdir): """Verify `test()` works on a model with no `val_loader`.""" - model = BoringModel() + class 
ModelTrainTest(BoringModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"y": loss} + + def test_epoch_end(self, outputs) -> None: + test_loss = torch.stack([x["y"] for x in outputs]).mean() + self.log('test_loss', test_loss) + + model = ModelTrainTest() # logger file to get meta logger = tutils.get_default_logger(tmpdir) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 08c2cd15e0434..b7e669dcafd4b 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -43,7 +43,7 @@ def test_multi_gpu_none_backend(tmpdir): ) model = BoringModel() - tpipes.run_model_test(trainer_options, model) + tpipes.run_model_test(trainer_options, model, min_acc=0.20) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") From 3861630aa72229ef248690f610ac3ed77383850c Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 23 Nov 2020 17:43:48 +0100 Subject: [PATCH 05/47] Fixed tests for logging and checkpointing --- tests/base/boring_model.py | 19 +++++++++++++++++++ tests/models/test_cpu.py | 8 +++----- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 31a5d4c73de1a..0283b258dccff 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -106,6 +106,25 @@ def training_step(self, batch, batch_idx): loss = self.loss(batch, output) return {"loss": loss} + def training_epoch_end(self, outputs) -> None: + torch.stack([x["loss"] for x in outputs]).mean() + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"x": loss} + + def validation_epoch_end(self, outputs) -> None: + torch.stack([x['x'] for x in outputs]).mean() + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"y": loss} + + def test_epoch_end(self, outputs) -> None: + torch.stack([x["y"] for x in outputs]).mean() + def configure_optimizers(self): optimizer = getattr(torch.optim, self.optimizer_name)(self.layer.parameters(), lr=self.learning_rate) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 2e408dfbe7bd9..939478aeccafb 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -106,11 +106,6 @@ class ModelTrainVal(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"x": loss} - def validation_epoch_end(self, outputs) -> None: val_loss = torch.stack([x["x"] for x in outputs]).mean() self.log('val_loss', val_loss) @@ -258,6 +253,9 @@ class ModelTrainTest(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + def val_loader(self): + pass + def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) From d974232c4dc608734ce095fdada405783e5af66c Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 23 Nov 2020 18:26:02 +0100 Subject: [PATCH 06/47] Fix for dispatcher --- tests/models/test_restore.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 476d1db99ce60..47fbf7048d547 100644 --- a/tests/models/test_restore.py +++ 
b/tests/models/test_restore.py
@@ -142,6 +142,7 @@ def test_callbacks_references_resume_from_checkpoint(enable_pl_optimizer, tmpdir
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_running_test_pretrained_model_distrib_dp(tmpdir):
     """Verify `test()` on pretrained model."""
+    tutils.set_random_master_port()
 
     model = EvalModelTemplate()
@@ -186,7 +187,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
         dataloaders = [dataloaders]
 
     for dataloader in dataloaders:
-        tpipes.run_prediction(dataloader, pretrained_model)
+        tpipes.run_prediction(pretrained_model, dataloader)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -237,7 +238,7 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
         dataloaders = [dataloaders]
 
     for dataloader in dataloaders:
-        tpipes.run_prediction(dataloader, pretrained_model)
+        tpipes.run_prediction(pretrained_model, dataloader)
 
 
 def test_running_test_pretrained_model_cpu(tmpdir):
@@ -379,7 +380,7 @@ def assert_good_acc():
         dp_model.eval()
 
         dataloader = trainer.train_dataloader
-        tpipes.run_prediction(dataloader, dp_model, dp=True)
+        tpipes.run_prediction(dp_model, dataloader, dp=True)
 
     # new model
     model = EvalModelTemplate(**hparams)

From d0782405d8639da9ffe07bec0a36991325d7e013 Mon Sep 17 00:00:00 2001
From: gianscarpe
Date: Tue, 24 Nov 2020 08:52:42 +0100
Subject: [PATCH 07/47] Fixed acc check for stochasticity of seeds

---
 tests/models/test_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index 939478aeccafb..a931077df35bd 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -154,7 +154,7 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir):
     )
 
     model = BoringModel()
-    tpipes.run_model_test(trainer_options, model, on_gpu=False)
+    tpipes.run_model_test(trainer_options, model, on_gpu=False, min_acc=0.20)

From 41dd1b054b572f9544bec3e9cb5e8324917521d8 Mon Sep 17 00:00:00 2001
From: gianscarpe
Date: Mon, 23 Nov 2020 14:53:37 +0100
Subject: [PATCH 08/47] test_cpu refactoring - BoringModel and checkpoints;
 test_gpu refactoring - BoringModel
 boring_model refactoring - validation, testing; Fix - run_prediction as
 dispatcher for testing BoringModel

---
 tests/base/boring_model.py      | 9 ++++++---
 tests/base/develop_pipelines.py | 6 ++----
 tests/models/test_cpu.py        | 3 +--
 tests/utilities/distributed.py  | 1 -
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py
index 0283b258dccff..5c31903d1dd6a 100644
--- a/tests/base/boring_model.py
+++ b/tests/base/boring_model.py
@@ -107,7 +107,8 @@ def training_step(self, batch, batch_idx):
         return {"loss": loss}
 
     def training_epoch_end(self, outputs) -> None:
-        torch.stack([x["loss"] for x in outputs]).mean()
+        train_loss = torch.stack([x["loss"] for x in outputs]).mean()
+        self.log('train_loss', train_loss)
 
     def validation_step(self, batch, batch_idx):
         output = self.layer(batch)
@@ -115,7 +116,8 @@ def validation_step(self, batch, batch_idx):
         return {"x": loss}
 
     def validation_epoch_end(self, outputs) -> None:
-        torch.stack([x['x'] for x in outputs]).mean()
+        val_loss = torch.stack([x["x"] for x in outputs]).mean()
+        self.log('val_loss', val_loss)
 
     def test_step(self, batch, batch_idx):
         output = self.layer(batch)
@@ -123,7 +125,8 @@ def test_step(self, batch, batch_idx):
         return {"y": loss}
 
     def test_epoch_end(self, outputs) -> None:
-
torch.stack([x["y"] for x in outputs]).mean() + test_loss = torch.stack([x["y"] for x in outputs]).mean() + self.log('test_loss', test_loss) def configure_optimizers(self): optimizer = getattr(torch.optim, self.optimizer_name)(self.layer.parameters(), lr=self.learning_rate) diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index e336d6266b215..67db586e98a80 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -14,10 +14,8 @@ from functools import singledispatch import torch -from torch.utils.data import DataLoader - -from pytorch_lightning import LightningModule, Trainer -from tests.base import BoringModel, EvalModelTemplate +from pytorch_lightning import Trainer +from tests.base import BoringModel from tests.base.develop_utils import get_default_logger, load_model_from_checkpoint, reset_seed diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index a931077df35bd..c6991de265360 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -123,7 +123,6 @@ def validation_epoch_end(self, outputs) -> None: ) model = ModelTrainVal() - tpipes.run_model_test(trainer_options, model, on_gpu=False) # test freeze on cpu @@ -398,4 +397,4 @@ def train_dataloader(self): ) result = trainer.fit(model) - assert result == 1, 'training failed to complete' + assert result == 1, "training failed to complete" diff --git a/tests/utilities/distributed.py b/tests/utilities/distributed.py index c569f51143f25..80c0246ce6c57 100644 --- a/tests/utilities/distributed.py +++ b/tests/utilities/distributed.py @@ -41,5 +41,4 @@ def call_training_script(module_file, cli_args, method, tmpdir, timeout=60): except TimeoutExpired: p.kill() std, err = p.communicate() - return std, err From 67c3bcb36f1a21bfcbb8200bdb2855ae0fa554be Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 23 Nov 2020 15:11:05 +0100 Subject: [PATCH 09/47] Reverting unintended changes --- tests/utilities/distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/utilities/distributed.py b/tests/utilities/distributed.py index 80c0246ce6c57..c569f51143f25 100644 --- a/tests/utilities/distributed.py +++ b/tests/utilities/distributed.py @@ -41,4 +41,5 @@ def call_training_script(module_file, cli_args, method, tmpdir, timeout=60): except TimeoutExpired: p.kill() std, err = p.communicate() + return std, err From dba0ab83374aefa28713d617b7053477787f7a8b Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 23 Nov 2020 17:21:54 +0100 Subject: [PATCH 10/47] Issues with checkpointing --- tests/base/boring_model.py | 22 ---------------------- tests/models/test_cpu.py | 3 --- 2 files changed, 25 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 5c31903d1dd6a..31a5d4c73de1a 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -106,28 +106,6 @@ def training_step(self, batch, batch_idx): loss = self.loss(batch, output) return {"loss": loss} - def training_epoch_end(self, outputs) -> None: - train_loss = torch.stack([x["loss"] for x in outputs]).mean() - self.log('train_loss', train_loss) - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"x": loss} - - def validation_epoch_end(self, outputs) -> None: - val_loss = torch.stack([x["x"] for x in outputs]).mean() - self.log('val_loss', val_loss) - - def test_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"y": loss} - - def test_epoch_end(self, 
outputs) -> None: - test_loss = torch.stack([x["y"] for x in outputs]).mean() - self.log('test_loss', test_loss) - def configure_optimizers(self): optimizer = getattr(torch.optim, self.optimizer_name)(self.layer.parameters(), lr=self.learning_rate) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index c6991de265360..8aff92102751c 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -252,9 +252,6 @@ class ModelTrainTest(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def val_loader(self): - pass - def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) From 5955112bd51b5d351cd2d61edcd49269e39f00d4 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 23 Nov 2020 17:43:48 +0100 Subject: [PATCH 11/47] Fixed tests for logging and checkpointing --- tests/base/boring_model.py | 19 +++++++++++++++++++ tests/models/test_cpu.py | 8 +++----- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 31a5d4c73de1a..0283b258dccff 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -106,6 +106,25 @@ def training_step(self, batch, batch_idx): loss = self.loss(batch, output) return {"loss": loss} + def training_epoch_end(self, outputs) -> None: + torch.stack([x["loss"] for x in outputs]).mean() + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"x": loss} + + def validation_epoch_end(self, outputs) -> None: + torch.stack([x['x'] for x in outputs]).mean() + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"y": loss} + + def test_epoch_end(self, outputs) -> None: + torch.stack([x["y"] for x in outputs]).mean() + def configure_optimizers(self): optimizer = getattr(torch.optim, self.optimizer_name)(self.layer.parameters(), lr=self.learning_rate) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 8aff92102751c..14e028d28742a 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -199,11 +199,6 @@ class ModelTrainValTest(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"x": loss} - def validation_epoch_end(self, outputs) -> None: val_loss = torch.stack([x["x"] for x in outputs]).mean() self.log('val_loss', val_loss) @@ -252,6 +247,9 @@ class ModelTrainTest(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + def val_loader(self): + pass + def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) From a5c4920e0b11999c336af730db8d71e8d0822b06 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Tue, 24 Nov 2020 16:21:31 +0100 Subject: [PATCH 12/47] Fixed according to @borda suggestions --- tests/base/boring_model.py | 16 ++++------------ tests/base/develop_pipelines.py | 13 +++---------- tests/models/test_cpu.py | 19 +++++-------------- 3 files changed, 12 insertions(+), 36 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 0283b258dccff..8b783db002372 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -88,6 +88,8 @@ 
def training_step(...):
         self.learning_rate = learning_rate
         self.optimizer_name = optimizer_name
         self.out_features = out_features
+        self.save_hyperparameters('batch_size', 'in_features', 'out_features',
+                                  'optimizer_name', 'learning_rate')
 
     def forward(self, x):
         return self.layer(x)
@@ -126,7 +128,8 @@ def test_epoch_end(self, outputs) -> None:
         torch.stack([x["y"] for x in outputs]).mean()
 
     def configure_optimizers(self):
-        optimizer = getattr(torch.optim, self.optimizer_name)(self.layer.parameters(), lr=self.learning_rate)
+        optimizer_class = getattr(torch.optim, self.optimizer_name)
+        optimizer = optimizer_class(self.layer.parameters(), lr=self.learning_rate)
         lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
         return [optimizer], [lr_scheduler]
@@ -138,14 +141,3 @@ def val_dataloader(self):
 
     def test_dataloader(self):
         return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
-
-    def get_default_hparams(continue_training: bool = False, hpc_exp_number: int = 0) -> dict:
-        args = dict(
-            batch_size=1,
-            in_features=32,
-            learning_rate=0.1,
-            optimizer_name="SGD",
-            out_features=2,
-        )
-
-        return args
diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py
index 67db586e98a80..a6f8e55471f7c 100644
--- a/tests/base/develop_pipelines.py
+++ b/tests/base/develop_pipelines.py
@@ -132,15 +132,8 @@ def run_prediction(trained_model, dataloader, dp=False, min_acc=0.50):
 def _(trained_model, dataloader, dp=False, min_acc=0.25):
     # run prediction on 1 batch
     batch = next(iter(dataloader))
+    with torch.no_grad():
+        output = trained_model(batch)
+        acc = trained_model.loss(batch, output)
 
-    if dp:
-        with torch.no_grad():
-            output = trained_model(batch)
-            acc = trained_model.loss(batch, output)
-
-    else:
-        with torch.no_grad():
-            output = trained_model(batch)
-            output = output.cpu()
-            acc = trained_model.loss(batch, output)
     assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})"
diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index c26d7c8509c54..ce64c99fb8b98 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -28,8 +28,7 @@
 @pytest.mark.parametrize("enable_pl_optimizer", [False, True])
 def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir):
     """Verify model save/load/checkpoint on CPU."""
-    hparams = BoringModel.get_default_hparams()
-    model = BoringModel(**hparams)
+    model = BoringModel()
 
     # logger file to get meta
     logger = tutils.get_default_logger(tmpdir)
@@ -72,7 +71,7 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir):
     # new logger file to get meta
     logger = tutils.get_default_logger(tmpdir, version=version)
 
-    model = BoringModel(**hparams)
+    model = BoringModel()
@@ -167,9 +166,7 @@ def test_lbfgs_cpu_model(tmpdir):
         limit_val_batches=0.2,
     )
 
-    hparams = BoringModel.get_default_hparams()
-    hparams.update(optimizer_name="LBFGS", learning_rate=0.004)
-    model = BoringModel(**hparams)
+    model = BoringModel(optimizer_name="LBFGS", learning_rate=0.004)
 
     tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.25)
@@ -372,14 +369,8 @@ def train_dataloader(self):
             sampler=None,
         )
 
-    hparams = BoringModel.get_default_hparams()
-    hparams.update(
-        batch_size=batch_size,
-        in_features=truncated_bptt_steps,
-        out_features=truncated_bptt_steps,
-    )
-
-    model = BpttTestModel(**hparams)
+    model = BpttTestModel(batch_size=batch_size,
+                          in_features=truncated_bptt_steps,
+                          out_features=truncated_bptt_steps)
     model.example_input_array = torch.randn(5, truncated_bptt_steps)
 
     # fit model
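The save_hyperparameters call introduced above (and simplified to a bare save_hyperparameters() in the next commit) snapshots __init__ arguments into self.hparams and into the checkpoint. A minimal sketch, assuming pytorch_lightning is importable; the HparamsDemo name and sizes are illustrative only:

    import torch
    from pytorch_lightning import LightningModule

    class HparamsDemo(LightningModule):
        def __init__(self, in_features: int = 32, out_features: int = 2):
            super().__init__()
            # the bare call captures every __init__ argument; passing names,
            # as the commit above does, captures only the listed ones
            self.save_hyperparameters()
            self.layer = torch.nn.Linear(self.hparams.in_features, self.hparams.out_features)

    model = HparamsDemo(in_features=16)
    assert model.hparams.in_features == 16
    assert model.hparams.out_features == 2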
From 8c7b8028d461ea3be2b7eb7e77607faf724ccac3 Mon Sep 17 00:00:00 2001
From: gianscarpe
Date: Tue, 24 Nov 2020 18:39:22 +0100
Subject: [PATCH 13/47] Hparams for boring_model

---
 tests/base/boring_model.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py
index 8b783db002372..87420726d267e 100644
--- a/tests/base/boring_model.py
+++ b/tests/base/boring_model.py
@@ -83,13 +83,7 @@ def training_step(...):
         """
         super().__init__()
         self.layer = torch.nn.Linear(in_features, out_features)
-        self.batch_size = batch_size
-        self.in_features = in_features
-        self.learning_rate = learning_rate
-        self.optimizer_name = optimizer_name
-        self.out_features = out_features
-        self.save_hyperparameters('batch_size', 'in_features', 'out_features',
-                                  'optimizer_name', 'learning_rate')
+        self.save_hyperparameters()
 
     def forward(self, x):
         return self.layer(x)
@@ -128,16 +122,16 @@ def test_epoch_end(self, outputs) -> None:
         torch.stack([x["y"] for x in outputs]).mean()
 
     def configure_optimizers(self):
-        optimizer_class = getattr(torch.optim, self.optimizer_name)
-        optimizer = optimizer_class(self.layer.parameters(), lr=self.learning_rate)
+        optimizer_class = getattr(torch.optim, self.hparams.optimizer_name)
+        optimizer = optimizer_class(self.layer.parameters(), lr=self.hparams.learning_rate)
         lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
         return [optimizer], [lr_scheduler]
 
     def train_dataloader(self):
-        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
+        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size)
 
     def val_dataloader(self):
-        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
+        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size)
 
     def test_dataloader(self):
-        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
+        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size)

From a6af6a3d76b8771a9c1dc0e6cdf4b4803dcdb9ce Mon Sep 17 00:00:00 2001
From: gianscarpe
Date: Tue, 24 Nov 2020 20:19:42 +0100
Subject: [PATCH 14/47] Deprecated RuntimeParamChangeModelAssign (functionality
 is tested in RuntimeParamChangeModelSaving)

---
 tests/models/test_hparams.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py
index 5e5fab7d0a0b4..270283f285769 100644
--- a/tests/models/test_hparams.py
+++ b/tests/models/test_hparams.py
@@ -20,14 +20,14 @@
 import pytest
 import torch
 from fsspec.implementations.local import LocalFileSystem
-from omegaconf import OmegaConf, Container
+from omegaconf import Container, OmegaConf
 from torch.nn import functional as F
 from torch.utils.data import DataLoader
 
-from pytorch_lightning import Trainer, LightningModule
-from pytorch_lightning.core.saving import save_hparams_to_yaml, load_hparams_from_yaml
+from pytorch_lightning import LightningModule, Trainer
+from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml
 from pytorch_lightning.utilities import AttributeDict, is_picklable
-from tests.base import EvalModelTemplate, TrialMNIST, BoringModel
+from tests.base import BoringModel, EvalModelTemplate, TrialMNIST
 
 
 class SaveHparamsModel(BoringModel):
@@ -595,16 +595,11 @@ def __init__(self, **kwargs):
         self.save_hyperparameters()
 
 
-class RuntimeParamChangeModelAssign(BoringModel):
-    def __init__(self, **kwargs):
-        super().__init__()
-        self.hparams = kwargs
-
-
-@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving, RuntimeParamChangeModelAssign])
+@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving])
 def test_init_arg_with_runtime_change(tmpdir, cls):
     """Test that we save/export only the initial hparams, no other runtime change allowed"""
     model = cls(running_arg=123)
+
     assert model.hparams.running_arg == 123
     model.hparams.running_arg = -1
     assert model.hparams.running_arg == -1
@@ -619,7 +614,6 @@ def test_init_arg_with_runtime_change(tmpdir, cls):
         max_epochs=1,
     )
     trainer.fit(model)
-
     path_yaml = os.path.join(trainer.logger.log_dir, trainer.logger.NAME_HPARAMS_FILE)
     hparams = load_hparams_from_yaml(path_yaml)
     assert hparams.get('running_arg') == 123
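What the surviving test pins down is that the exported yaml reflects the hparams captured at __init__, not later runtime mutations. For reference, the round-trip helpers imported in the hunk above can be exercised on their own; a minimal sketch (the temp-file path is arbitrary):

    import os
    import tempfile

    from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml

    path_yaml = os.path.join(tempfile.mkdtemp(), "hparams.yaml")
    save_hparams_to_yaml(path_yaml, {"running_arg": 123})

    hparams = load_hparams_from_yaml(path_yaml)
    assert hparams.get("running_arg") == 123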
From ed0cf404dfb15183a7a413b17069560730822f91 Mon Sep 17 00:00:00 2001
From: gianscarpe
Date: Tue, 24 Nov 2020 20:57:14 +0100
Subject: [PATCH 15/47] Reduced boring_model parameters to just in and out
 features, test_cpu models inherit BoringModel to specify additional
 parameters (e.g., optimizer)

---
 tests/base/boring_model.py   | 23 ++++++++++-------------
 tests/models/test_cpu.py     | 12 ++++++++++--
 tests/models/test_hparams.py |  1 -
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py
index 87420726d267e..8e2d665fec7c5 100644
--- a/tests/base/boring_model.py
+++ b/tests/base/boring_model.py
@@ -56,14 +56,7 @@ def __len__(self):
 
 
 class BoringModel(LightningModule):
-    def __init__(
-        self,
-        batch_size: int = 1,
-        in_features: int = 32,
-        learning_rate: float = 0.1,
-        optimizer_name: str = "SGD",
-        out_features: int = 2,
-    ):
+    def __init__(self, in_features: int = 32, out_features: int = 2):
         """
         Testing PL Module
 
@@ -82,6 +75,10 @@ def training_step(...):
 
         """
         super().__init__()
+
+        self.batch_size = 1
+        self.learning_rate = 0.1
+        self.optimizer_name = "SGD"
         self.layer = torch.nn.Linear(in_features, out_features)
         self.save_hyperparameters()
 
@@ -122,16 +119,16 @@ def test_epoch_end(self, outputs) -> None:
         torch.stack([x["y"] for x in outputs]).mean()
 
     def configure_optimizers(self):
-        optimizer_class = getattr(torch.optim, self.hparams.optimizer_name)
-        optimizer = optimizer_class(self.layer.parameters(), lr=self.hparams.learning_rate)
+        optimizer_class = getattr(torch.optim, self.optimizer_name)
+        optimizer = optimizer_class(self.layer.parameters(), lr=self.learning_rate)
         lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
         return [optimizer], [lr_scheduler]
 
     def train_dataloader(self):
-        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size)
+        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
 
     def val_dataloader(self):
-        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size)
+        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
 
     def test_dataloader(self):
-        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.hparams.batch_size)
+        return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size)
diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index ce64c99fb8b98..923fc36b8f98e 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -157,6
def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): def test_lbfgs_cpu_model(tmpdir): """Test each of the trainer options. Testing LBFGS optimizer""" + class ModelSpecifiedOptimizer(BoringModel): + def __init__(self, optimizer_name, learning_rate): + super().__init__() + self.optimizer_name = optimizer_name + self.learning_rate = learning_rate + self.save_hyperparameters() + trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, @@ -166,7 +173,7 @@ def test_lbfgs_cpu_model(tmpdir): limit_val_batches=0.2, ) - model = BoringModel(optimizer_name="LBFGS", learning_rate=0.004) + model = ModelSpecifiedOptimizer(optimizer_name="LBFGS", learning_rate=0.004) tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.25) @@ -334,9 +341,10 @@ def __len__(self): return 1 class BpttTestModel(BoringModel): - def __init__(self, *args, **kwargs): + def __init__(self, batch_size, *args, **kwargs): super().__init__(*args, **kwargs) self.test_hidden = None + self.batch_size = batch_size def training_step(self, batch, batch_idx, hiddens): assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps" diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 270283f285769..0ab116d411fa9 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -594,7 +594,6 @@ def __init__(self, **kwargs): super().__init__() self.save_hyperparameters() - @pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving]) def test_init_arg_with_runtime_change(tmpdir, cls): """Test that we save/export only the initial hparams, no other runtime change allowed""" From 00a7f88ad97de2050cf78ca79c97f5878d93b09c Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Wed, 25 Nov 2020 08:41:23 +0100 Subject: [PATCH 16/47] Fix PEP8 --- tests/base/develop_pipelines.py | 2 +- tests/models/test_hparams.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index a6f8e55471f7c..df615aba9c538 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -59,7 +59,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, logger = get_default_logger(save_dir, version=version) trainer_options.update(logger=logger) - # TODO: DEPRECATED option + # DEPRECATED option if "checkpoint_callback" not in trainer_options: trainer_options.update(checkpoint_callback=True) diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 0ab116d411fa9..270283f285769 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -594,6 +594,7 @@ def __init__(self, **kwargs): super().__init__() self.save_hyperparameters() + @pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving]) def test_init_arg_with_runtime_change(tmpdir, cls): """Test that we save/export only the initial hparams, no other runtime change allowed""" From b36e5c59500b76b4513c536b298673f54a92690f Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Thu, 26 Nov 2020 18:57:29 +0100 Subject: [PATCH 17/47] Merged test_early_stopping with all_features; added TODO for self.log --- tests/base/boring_model.py | 2 ++ tests/models/test_cpu.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 8e2d665fec7c5..c1a4a2d1c031a 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -109,6 +109,7 @@ def validation_step(self, batch, batch_idx): def 
validation_epoch_end(self, outputs) -> None: torch.stack([x['x'] for x in outputs]).mean() + # TODO: add self.log() and refactoring appropriate tests def test_step(self, batch, batch_idx): output = self.layer(batch) @@ -117,6 +118,7 @@ def test_step(self, batch, batch_idx): def test_epoch_end(self, outputs) -> None: torch.stack([x["y"] for x in outputs]).mean() + # TODO: add self.log() and refactoring appropriate tests def configure_optimizers(self): optimizer_class = getattr(torch.optim, self.optimizer_name) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index ce64c99fb8b98..923fc36b8f98e 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -99,8 +99,6 @@ def on_epoch_start(self, trainer, model): @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): - """Test each of the trainer options. Simply test the combo trainer and - model; callbacks functionality tests are in /tests/callbacks""" class ModelTrainVal(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -112,13 +110,16 @@ def validation_epoch_end(self, outputs) -> None: stopping = EarlyStopping(monitor="val_loss", min_delta=0.1) trainer_options = dict( default_root_dir=tmpdir, - callbacks=[stopping], - max_epochs=2, - gradient_clip_val=1, + gradient_clip_val=1.0, + overfit_batches=0.20, track_grad_norm=2, - limit_train_batches=0.2, - limit_val_batches=0.1, enable_pl_optimizer=enable_pl_optimizer, + progress_bar_refresh_rate=0, + accumulate_grad_batches=2, + max_epochs=1, + limit_train_batches=0.4, + limit_val_batches=0.4, + callbacks=[stopping], ) model = ModelTrainVal() From eaf51f770871ecbed62823ece7393eef83874a8a Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Wed, 25 Nov 2020 20:41:36 +0100 Subject: [PATCH 18/47] Update tests/base/develop_pipelines.py Co-authored-by: Rohit Gupta --- tests/base/develop_pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index df615aba9c538..a6f8e55471f7c 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -59,7 +59,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, logger = get_default_logger(save_dir, version=version) trainer_options.update(logger=logger) - # DEPRECATED option + # TODO: DEPRECATED option if "checkpoint_callback" not in trainer_options: trainer_options.update(checkpoint_callback=True) From e318eb1eea1922dbdfdcfc46b3ed7a696e344002 Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Wed, 25 Nov 2020 20:41:52 +0100 Subject: [PATCH 19/47] Update tests/base/boring_model.py Co-authored-by: Rohit Gupta --- tests/base/boring_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index c1a4a2d1c031a..13f521d8765ab 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -122,7 +122,7 @@ def test_epoch_end(self, outputs) -> None: def configure_optimizers(self): optimizer_class = getattr(torch.optim, self.optimizer_name) - optimizer = optimizer_class(self.layer.parameters(), lr=self.learning_rate) + optimizer = optimizer_class(self.parameters(), lr=self.learning_rate) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) return [optimizer], [lr_scheduler] From 2d2c0b84d6568fea185a59b085d7cd152b59fedf Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Wed, 25 Nov 
2020 20:42:16 +0100 Subject: [PATCH 20/47] Update tests/base/develop_pipelines.py Co-authored-by: Rohit Gupta --- tests/base/develop_pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index a6f8e55471f7c..70f4b9d11e904 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -136,4 +136,4 @@ def _(trained_model, dataloader, dp=False, min_acc=0.25): output = trained_model(batch) acc = trained_model.loss(batch, output) - assert acc >= min_acc, f"This model is expected to get , {min_acc} in test set (it got {acc})" + assert acc >= min_acc, f"This model is expected to get, {min_acc} in test set but got {acc}" From 15050c3c5a24ddd545be4c4d968e04bc9b042208 Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Wed, 25 Nov 2020 20:43:12 +0100 Subject: [PATCH 21/47] Update tests/models/test_cpu.py Co-authored-by: Rohit Gupta --- tests/models/test_cpu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 923fc36b8f98e..52b137715fed2 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -81,9 +81,11 @@ def on_init_start(self, trainer): def on_epoch_start(self, trainer, model): assert trainer.global_step == real_global_step and trainer.global_step > 0 # predict with loaded model to make sure answers are the same + mode = model.training model.eval() new_pred = model(batch) - assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 + assert torch.eq(pred_before_saving, new_pred).all() + model.train(mode) trainer = Trainer( default_root_dir=tmpdir, From b775c39632a2e6c00ca7761d372a6a6c713415dd Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Thu, 26 Nov 2020 11:31:15 +0100 Subject: [PATCH 22/47] Update tests/models/test_cpu.py Co-authored-by: Rohit Gupta --- tests/models/test_cpu.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 52b137715fed2..d85d70ac696d5 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -74,9 +74,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): model = BoringModel() class _StartCallback(Callback): - def on_init_start(self, trainer): - print("Starting to init trainer!") - # set the epoch start hook so we can predict before the model does the full training def on_epoch_start(self, trainer, model): assert trainer.global_step == real_global_step and trainer.global_step > 0 From aa9c1b57eca810b2ddb6695867674a9d532d3548 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Sat, 28 Nov 2020 08:51:51 +0100 Subject: [PATCH 23/47] Fixed test_all_features trainer options --- tests/models/test_cpu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index d85d70ac696d5..86c303ed5c34e 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -115,9 +115,9 @@ def validation_epoch_end(self, outputs) -> None: enable_pl_optimizer=enable_pl_optimizer, progress_bar_refresh_rate=0, accumulate_grad_batches=2, - max_epochs=1, - limit_train_batches=0.4, - limit_val_batches=0.4, + max_epochs=2, + limit_train_batches=0.1, + limit_val_batches=0.1, callbacks=[stopping], ) From ecca29791541fa0d6c3160dc6b3ae12cdd56fad4 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 30 Nov 2020 09:06:58 +0100 Subject: [PATCH 24/47] Ready for review! 
---
 tests/base/boring_model.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/base/boring_model.py
index 13f521d8765ab..673e4a9907dda 100644
--- a/tests/base/boring_model.py
+++ b/tests/base/boring_model.py
@@ -109,7 +109,6 @@ def validation_step(self, batch, batch_idx):
 
     def validation_epoch_end(self, outputs) -> None:
         torch.stack([x['x'] for x in outputs]).mean()
-        # TODO: add self.log() and refactoring appropriate tests
 
     def test_step(self, batch, batch_idx):
         output = self.layer(batch)
@@ -118,7 +117,6 @@ def test_step(self, batch, batch_idx):
 
     def test_epoch_end(self, outputs) -> None:
         torch.stack([x["y"] for x in outputs]).mean()
-        # TODO: add self.log() and refactoring appropriate tests
 
     def configure_optimizers(self):
         optimizer_class = getattr(torch.optim, self.optimizer_name)

From 827544436e49af79c72f865702d1aa2d14482382 Mon Sep 17 00:00:00 2001
From: Gianluca Scarpellini
Date: Tue, 1 Dec 2020 08:28:56 +0100
Subject: [PATCH 25/47] Update tests/models/test_cpu.py

Thank you! :)

Co-authored-by: Rohit Gupta
---
 tests/models/test_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_cpu.py
index 86c303ed5c34e..852c89017aa47 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -75,7 +75,7 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir):
 
     class _StartCallback(Callback):
         # set the epoch start hook so we can predict before the model does the full training
-        def on_epoch_start(self, trainer, model):
+        def on_train_epoch_start(self, trainer, model):
             assert trainer.global_step == real_global_step and trainer.global_step > 0
             # predict with loaded model to make sure answers are the same
             mode = model.training
             model.eval()
             new_pred = model(batch)
             assert torch.eq(pred_before_saving, new_pred).all()
             model.train(mode)

From 90bf71402858fd2c12dcf302c7900064117a4b75 Mon Sep 17 00:00:00 2001
From: gianscarpe
Date: Tue, 1 Dec 2020 11:09:08 +0100
Subject: [PATCH 26/47] added optimizer_name, lr, and batch_size as hparams
 for save_hyperparameters()

---
 tests/base/boring_model.py | 10 ++++++----
 tests/models/test_cpu.py   |  9 +--------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/tests/base/boring_model.py
index 673e4a9907dda..4674b921090dd 100644
--- a/tests/base/boring_model.py
+++ b/tests/base/boring_model.py
@@ -56,7 +56,9 @@ def __len__(self):
 
 
 class BoringModel(LightningModule):
-    def __init__(self, in_features: int = 32, out_features: int = 2):
+    def __init__(self, in_features: int = 32, out_features: int = 2,
+                 optimizer_name: str = 'SGD', learning_rate: float = 0.1,
+                 batch_size: int = 1):
         """
         Testing PL Module
@@ -76,9 +78,9 @@ def training_step(...):
 
         """
         super().__init__()
-        self.batch_size = 1
-        self.learning_rate = 0.1
-        self.optimizer_name = "SGD"
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.optimizer_name = optimizer_name
         self.layer = torch.nn.Linear(in_features, out_features)
         self.save_hyperparameters()

diff --git a/tests/models/test_cpu.py
index 852c89017aa47..03c93ed6f38f8 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -157,13 +157,6 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir):
 
 def test_lbfgs_cpu_model(tmpdir):
     """Test each of the trainer options.
Testing LBFGS optimizer""" - class ModelSpecifiedOptimizer(BoringModel): - def __init__(self, optimizer_name, learning_rate): - super().__init__() - self.optimizer_name = optimizer_name - self.learning_rate = learning_rate - self.save_hyperparameters() - trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, @@ -173,7 +166,7 @@ def __init__(self, optimizer_name, learning_rate): limit_val_batches=0.2, ) - model = ModelSpecifiedOptimizer(optimizer_name="LBFGS", learning_rate=0.004) + model = BoringModel(optimizer_name="LBFGS", learning_rate=0.004) tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.25) From 3c73a65ed384500bd7f78b7eb5371589058e2b81 Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Tue, 1 Dec 2020 08:38:22 +0100 Subject: [PATCH 27/47] Update tests/models/test_cpu.py Co-authored-by: Rohit Gupta --- tests/models/test_cpu.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 03c93ed6f38f8..bf41c280eb9b8 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -247,11 +247,6 @@ def __init__(self, *args, **kwargs): def val_loader(self): pass - def test_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"y": loss} - def test_epoch_end(self, outputs) -> None: test_loss = torch.stack([x["y"] for x in outputs]).mean() self.log('test_loss', test_loss) From 6fec1bea836cd844acaf8b372769b1ae7cebb9fe Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Tue, 1 Dec 2020 08:38:40 +0100 Subject: [PATCH 28/47] Update tests/models/test_cpu.py Co-authored-by: Rohit Gupta --- tests/models/test_cpu.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index bf41c280eb9b8..b8a4fe064415f 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -200,11 +200,6 @@ def validation_epoch_end(self, outputs) -> None: val_loss = torch.stack([x["x"] for x in outputs]).mean() self.log('val_loss', val_loss) - def test_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"y": loss} - def test_epoch_end(self, outputs) -> None: test_loss = torch.stack([x["y"] for x in outputs]).mean() self.log('test_loss', test_loss) From 50a2c9bf6332b0b3eb5ad6d0f9937b1d661e3931 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Tue, 1 Dec 2020 11:22:00 +0100 Subject: [PATCH 29/47] Fixes for reducing PR size --- setup.py | 2 +- tests/base/boring_model.py | 2 +- tests/base/develop_pipelines.py | 8 ++++---- tests/models/test_cpu.py | 10 +++++----- tests/models/test_horovod.py | 1 - tests/models/test_hparams.py | 10 ++++++++-- 6 files changed, 19 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index c548d508ab434..1adb4237b3954 100755 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ # the goal of the project is simplicity for researchers, don't want to add too much # engineer specific practices setup( - name="pytorch-lightning", + name='pytorch-lightning', version=pytorch_lightning.__version__, description=pytorch_lightning.__docs__, author=pytorch_lightning.__author__, diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 4674b921090dd..7971876aa2b0c 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -25,7 +25,7 @@ def __init__(self, size, length): def __getitem__(self, index): a = self.data[index] b = a + 2 - return {"a": a, "b": b} + return {'a': a, 'b': b} def __len__(self): return self.len 
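
# A minimal, hedged sketch (not taken from the diffs above) of the pattern the
# BoringModel commits in this series converge on: accept optimizer_name and
# learning_rate in __init__, record them with save_hyperparameters(), and
# resolve the optimizer class by name. The class name below is illustrative
# only; nothing beyond the public LightningModule and torch.optim APIs is assumed.
import torch
from pytorch_lightning import LightningModule

class NamedOptimizerModel(LightningModule):  # hypothetical example class
    def __init__(self, optimizer_name: str = 'SGD', learning_rate: float = 0.1):
        super().__init__()
        self.save_hyperparameters()  # captures optimizer_name and learning_rate into self.hparams
        self.layer = torch.nn.Linear(32, 2)

    def configure_optimizers(self):
        # getattr-based dispatch: any torch.optim class is reachable by name,
        # e.g. 'SGD', 'Adam', or the 'LBFGS' used by test_lbfgs_cpu_model
        optimizer_cls = getattr(torch.optim, self.hparams.optimizer_name)
        return optimizer_cls(self.parameters(), lr=self.hparams.learning_rate)
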
diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index 70f4b9d11e904..4b9f3cd2fcb01 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -27,7 +27,7 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 result = trainer.fit(model) # correct result and ok accuracy - assert result == 1, "amp + ddp model failed to complete" + assert result == 1, 'amp + ddp model failed to complete' pretrained_model = load_model_from_checkpoint( trainer.logger, @@ -53,7 +53,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True, min_acc: float = 0.25): reset_seed() - save_dir = trainer_options["default_root_dir"] + save_dir = trainer_options['default_root_dir'] # logger file to get meta logger = get_default_logger(save_dir, version=version) @@ -68,7 +68,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, result = trainer.fit(model) post_train_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()]) - assert result == 1, "trainer failed" + assert result == 1, 'trainer failed' # Check that the model is actually changed post-training change_ratio = torch.norm(initial_values - post_train_values) assert change_ratio > 0.1, f"the model is changed of {change_ratio}" @@ -109,7 +109,7 @@ def run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): if dp: with torch.no_grad(): output = trained_model(batch, 0) - acc = output["val_acc"] + acc = output['val_acc'] acc = torch.mean(acc).item() else: diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index b8a4fe064415f..0a3e3aea6bbf0 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -48,7 +48,7 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): real_global_step = trainer.global_step # traning complete - assert result == 1, "cpu model failed to complete" + assert result == 1, 'cpu model failed to complete' # predict with trained model before saving # make a prediction @@ -225,7 +225,7 @@ def test_epoch_end(self, outputs) -> None: ) result = trainer.fit(model) - assert result == 1, "training failed to complete" + assert result == 1, 'training failed to complete' trainer.test() @@ -267,7 +267,7 @@ def test_epoch_end(self, outputs) -> None: ) result = trainer.fit(model) - assert result == 1, "training failed to complete" + assert result == 1, 'training failed to complete' trainer.test() @@ -289,7 +289,7 @@ def test_simple_cpu(tmpdir): result = trainer.fit(model) # traning complete - assert result == 1, "amp + ddp model failed to complete" + assert result == 1, 'amp + ddp model failed to complete' def test_cpu_model(tmpdir): @@ -374,4 +374,4 @@ def train_dataloader(self): ) result = trainer.fit(model) - assert result == 1, "training failed to complete" + assert result == 1, 'training failed to complete' diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index cc1ed9da9fd8f..3a2ae8750443f 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -63,7 +63,6 @@ def _run_horovod(trainer_options, on_gpu=False): ] if on_gpu: cmdline += ['--on-gpu'] - exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy()) assert exit_code == 0 diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 270283f285769..6df9efa43bfb2 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -595,11 +595,16 @@ def __init__(self, **kwargs): 
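
# Hedged aside on the two hparam-recording styles the test below toggles
# between: save_hyperparameters() captures the __init__ arguments
# automatically, while direct assignment to self.hparams (the deprecated path
# exercised by RuntimeParamChangeModelAssign) stores an explicit dict. Sketch
# only, assuming the LightningModule API of this era; class names are illustrative.
from pytorch_lightning import LightningModule

class SavingStyle(LightningModule):
    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()  # kwargs end up in self.hparams

class AssignStyle(LightningModule):
    def __init__(self, **kwargs):
        super().__init__()
        self.hparams = kwargs  # deprecated direct assignment, same end result

assert SavingStyle(running_arg=123).hparams.running_arg == 123
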
self.save_hyperparameters() -@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving]) +class RuntimeParamChangeModelAssign(BoringModel): + def __init__(self, **kwargs): + super().__init__() + self.hparams = kwargs + + +@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving, RuntimeParamChangeModelAssign]) def test_init_arg_with_runtime_change(tmpdir, cls): """Test that we save/export only the initial hparams, no other runtime change allowed""" model = cls(running_arg=123) - assert model.hparams.running_arg == 123 model.hparams.running_arg = -1 assert model.hparams.running_arg == -1 @@ -614,6 +619,7 @@ def test_init_arg_with_runtime_change(tmpdir, cls): max_epochs=1, ) trainer.fit(model) + path_yaml = os.path.join(trainer.logger.log_dir, trainer.logger.NAME_HPARAMS_FILE) hparams = load_hparams_from_yaml(path_yaml) assert hparams.get('running_arg') == 123 From 5c9290b69768015c02a1fecb035bab4d730a9bc1 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Tue, 1 Dec 2020 12:03:26 +0100 Subject: [PATCH 30/47] Reverse test_hparams (removed DEPRECATED test for hparams direct assignment) --- tests/models/test_hparams.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 6df9efa43bfb2..7081d450ee256 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -595,13 +595,7 @@ def __init__(self, **kwargs): self.save_hyperparameters() -class RuntimeParamChangeModelAssign(BoringModel): - def __init__(self, **kwargs): - super().__init__() - self.hparams = kwargs - - -@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving, RuntimeParamChangeModelAssign]) +@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving]) def test_init_arg_with_runtime_change(tmpdir, cls): """Test that we save/export only the initial hparams, no other runtime change allowed""" model = cls(running_arg=123) From 7757c826d0e2ec5924a5cc31bf1d6a3c02e0ea11 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Thu, 3 Dec 2020 11:53:53 +0100 Subject: [PATCH 31/47] Changes for in_features --- tests/base/boring_model.py | 6 +++--- tests/base/develop_pipelines.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 7971876aa2b0c..029b7c73f6aca 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -127,10 +127,10 @@ def configure_optimizers(self): return [optimizer], [lr_scheduler] def train_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(self.hparams.in_features, 64), batch_size=self.batch_size) def val_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(self.hparams.in_features, 64), batch_size=self.batch_size) def test_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(self.hparams.in_features, 64), batch_size=self.batch_size) diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index 4b9f3cd2fcb01..56a86d30ad999 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -109,7 +109,7 @@ def run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): if dp: with torch.no_grad(): output = trained_model(batch, 0) - acc = output['val_acc'] + acc 
= output['val_acc'] acc = torch.mean(acc).item() else: From 95f5766bc118068850f9e7a85090bb587081a432 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Thu, 3 Dec 2020 14:06:21 +0100 Subject: [PATCH 32/47] Fixed hparams --- tests/base/boring_model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 029b7c73f6aca..8cffbd143c90d 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -82,6 +82,7 @@ def training_step(...): self.learning_rate = learning_rate self.optimizer_name = optimizer_name self.layer = torch.nn.Linear(in_features, out_features) + self.in_features = in_features self.save_hyperparameters() def forward(self, x): @@ -127,10 +128,10 @@ def configure_optimizers(self): return [optimizer], [lr_scheduler] def train_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(self.hparams.in_features, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(self.in_features, 64), batch_size=self.batch_size) def val_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(self.hparams.in_features, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(self.in_features, 64), batch_size=self.batch_size) def test_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(self.hparams.in_features, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(self.in_features, 64), batch_size=self.batch_size) From 9b729e34be3c664f1bc3952bd1aad63c15f2992a Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Sat, 5 Dec 2020 09:20:00 +0100 Subject: [PATCH 33/47] Fixed parameters for boring_model --- tests/base/boring_model.py | 28 +++++++++++----------------- tests/models/test_cpu.py | 12 ++++++++++-- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 8cffbd143c90d..6ceffe8562372 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
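
# Hedged sketch of the invariant PATCHES 31-33 keep adjusting: the feature
# width handed to RandomDataset must match the Linear layer's in_features,
# whether that value is read back from self.hparams or stored as a plain
# attribute. This is a toy check, not code from the test suite.
import torch
from torch.utils.data import DataLoader, Dataset

class RandomDataset(Dataset):  # same shape as the helper defined earlier in the series
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

in_features = 32
layer = torch.nn.Linear(in_features, 2)
batch = next(iter(DataLoader(RandomDataset(in_features, 64), batch_size=1)))
assert layer(batch).shape == (1, 2)  # widths agree, so the forward pass works
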
import torch -from torch.utils.data import Dataset - from pytorch_lightning import LightningModule +from torch.utils.data import Dataset class RandomDictDataset(Dataset): @@ -56,9 +55,8 @@ def __len__(self): class BoringModel(LightningModule): - def __init__(self, in_features: int = 32, out_features: int = 2, - optimizer_name: str = 'SGD', learning_rate: float = 0.1, - batch_size:int = 1): + + def __init__(self): """ Testing PL Module @@ -77,13 +75,7 @@ def training_step(...): """ super().__init__() - - self.batch_size = batch_size - self.learning_rate = learning_rate - self.optimizer_name = optimizer_name - self.layer = torch.nn.Linear(in_features, out_features) - self.in_features = in_features - self.save_hyperparameters() + self.layer = torch.nn.Linear(32, 2) def forward(self, x): return self.layer(x) @@ -102,6 +94,9 @@ def training_step(self, batch, batch_idx): loss = self.loss(batch, output) return {"loss": loss} + def training_step_end(self, training_step_outputs): + return training_step_outputs + def training_epoch_end(self, outputs) -> None: torch.stack([x["loss"] for x in outputs]).mean() @@ -122,16 +117,15 @@ def test_epoch_end(self, outputs) -> None: torch.stack([x["y"] for x in outputs]).mean() def configure_optimizers(self): - optimizer_class = getattr(torch.optim, self.optimizer_name) - optimizer = optimizer_class(self.parameters(), lr=self.learning_rate) + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) return [optimizer], [lr_scheduler] def train_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(self.in_features, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(32, 64)) def val_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(self.in_features, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(32, 64)) def test_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(self.in_features, 64), batch_size=self.batch_size) + return torch.utils.data.DataLoader(RandomDataset(32, 64)) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 0a3e3aea6bbf0..f5d022d6ca652 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -157,6 +157,13 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): def test_lbfgs_cpu_model(tmpdir): """Test each of the trainer options. 
Testing LBFGS optimizer""" + class ModelSpecifiedOptimizer(BoringModel): + def __init__(self, optimizer_name, learning_rate): + super().__init__() + self.optimizer_name = optimizer_name + self.learning_rate = learning_rate + self.save_hyperparameters() + trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, @@ -166,7 +173,7 @@ def test_lbfgs_cpu_model(tmpdir): limit_val_batches=0.2, ) - model = BoringModel(optimizer_name="LBFGS", learning_rate=0.004) + model = ModelSpecifiedOptimizer(optimizer_name="LBFGS", learning_rate=0.004) tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.25) @@ -324,10 +331,11 @@ def __len__(self): return 1 class BpttTestModel(BoringModel): - def __init__(self, batch_size, *args, **kwargs): + def __init__(self, batch_size, in_features, out_features, *args, **kwargs): super().__init__(*args, **kwargs) self.test_hidden = None self.batch_size = batch_size + self.layer = torch.nn.Linear(in_features, out_features) def training_step(self, batch, batch_idx, hiddens): assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps" From 37f757366a453514134ce4f87d7a80c28ef377a4 Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Tue, 15 Dec 2020 18:15:55 +0100 Subject: [PATCH 34/47] Update tests/models/test_cpu.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- tests/models/test_cpu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index f5d022d6ca652..24e92596b395a 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -243,8 +243,6 @@ def test_epoch_end(self, outputs) -> None: def test_running_test_no_val(tmpdir): """Verify `test()` works on a model with no `val_loader`.""" class ModelTrainTest(BoringModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) def val_loader(self): pass From bd7e8cfec2065e830746bcdf4e1189601b45fe00 Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Tue, 15 Dec 2020 18:16:11 +0100 Subject: [PATCH 35/47] Update tests/models/test_cpu.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- tests/models/test_cpu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 24e92596b395a..4fd2b986aae07 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -99,8 +99,6 @@ def on_train_epoch_start(self, trainer, model): @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): class ModelTrainVal(BoringModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) def validation_epoch_end(self, outputs) -> None: val_loss = torch.stack([x["x"] for x in outputs]).mean() From 6fecc0943368e16c1e9278d2b8da5b204dac2310 Mon Sep 17 00:00:00 2001 From: Gianluca Scarpellini Date: Tue, 15 Dec 2020 18:16:21 +0100 Subject: [PATCH 36/47] Update tests/models/test_cpu.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- tests/models/test_cpu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 4fd2b986aae07..76d634520cb9a 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -198,8 +198,6 @@ def test_default_logger_callbacks_cpu_model(tmpdir): def 
test_running_test_after_fitting(tmpdir):
     """Verify test() on fitted model."""
     class ModelTrainValTest(BoringModel):
-        def __init__(self, *args, **kwargs):
-            super().__init__(*args, **kwargs)
 
         def validation_epoch_end(self, outputs) -> None:
             val_loss = torch.stack([x["x"] for x in outputs]).mean()
             self.log('val_loss', val_loss)

From 6527e3f518ebe1d19b83ca17cf3bd0735b8353b9 Mon Sep 17 00:00:00 2001
From: Shachar Mirkin
Date: Mon, 14 Dec 2020 13:39:29 +0100
Subject: [PATCH 37/47] Add Google Colab badges (#5111)

* Add colab badges to notebook

Add colab badges to notebook to notebooks 4 & 5

* Add colab badges

Co-authored-by: chaton
---
 notebooks/04-transformers-text-classification.ipynb | 7 +++++++
 notebooks/05-trainer-flags-overview.ipynb           | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/notebooks/04-transformers-text-classification.ipynb
index 037b24e4ddd9d..d52af84a76d97 100644
--- a/notebooks/04-transformers-text-classification.ipynb
+++ b/notebooks/04-transformers-text-classification.ipynb
@@ -1,5 +1,12 @@
 {
   "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\"Open"
+      ]
+    },
     {
       "cell_type": "markdown",
       "metadata": {

diff --git a/notebooks/05-trainer-flags-overview.ipynb
index 6413e8239bb2e..da044a9c9b5c6 100644
--- a/notebooks/05-trainer-flags-overview.ipynb
+++ b/notebooks/05-trainer-flags-overview.ipynb
@@ -1,5 +1,12 @@
 {
   "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\"Open"
+      ]
+    },
     {
       "cell_type": "markdown",
       "metadata": {

From 0a7ba6375f27d65f80560c5bfc5ce39174808f7c Mon Sep 17 00:00:00 2001
From: gianscarpe
Date: Tue, 15 Dec 2020 18:51:30 +0100
Subject: [PATCH 38/47] fix for pep8

---
 tests/models/test_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_cpu.py
index 76d634520cb9a..cab5f1a0da7b1 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -329,7 +329,7 @@ def __init__(self, batch_size, in_features, out_features, *args, **kwargs):
             super().__init__(*args, **kwargs)
             self.test_hidden = None
             self.batch_size = batch_size
-            self.layer = torch.nn.Linear(in_features, out_features) 
+            self.layer = torch.nn.Linear(in_features, out_features)

From 717ca2f2acf5a2589bcbecb992b8e8df3aec8f72 Mon Sep 17 00:00:00 2001
From: gianscarpe
Date: Fri, 18 Dec 2020 17:31:16 +0100
Subject: [PATCH 39/47] Fixed run_prediction and TODO

---
 tests/base/develop_pipelines.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/tests/base/develop_pipelines.py
index 56a86d30ad999..72121ecf5c349 100644
--- a/tests/base/develop_pipelines.py
+++ b/tests/base/develop_pipelines.py
@@ -11,11 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
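
# Hedged sketch of why PATCH 39 swaps functools.singledispatch for the explicit
# isinstance dispatcher introduced below: singledispatch picks the
# implementation from the *first* positional argument's type, which is why the
# model had to be moved in front of the dataloader in run_prediction's
# signature. Toy types only, not the test suite's classes.
from functools import singledispatch

class TemplateModel: ...
class Boring(TemplateModel): ...

@singledispatch
def predict(model, dataloader):  # generic fallback, analogous to the EvalModelTemplate path
    return "template path"

@predict.register(Boring)
def _(model, dataloader):  # chosen whenever the first argument is a Boring instance
    return "boring path"

assert predict(TemplateModel(), []) == "template path"
assert predict(Boring(), []) == "boring path"
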
-from functools import singledispatch import torch + from pytorch_lightning import Trainer -from tests.base import BoringModel +from tests.base import BoringModel, EvalModelTemplate from tests.base.develop_utils import get_default_logger, load_model_from_checkpoint, reset_seed @@ -59,10 +59,6 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, logger = get_default_logger(save_dir, version=version) trainer_options.update(logger=logger) - # TODO: DEPRECATED option - if "checkpoint_callback" not in trainer_options: - trainer_options.update(checkpoint_callback=True) - trainer = Trainer(**trainer_options) initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()]) result = trainer.fit(model) @@ -99,8 +95,16 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu) -@singledispatch -def run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): +def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25): + if isinstance(trained_model, BoringModel): + return _boring_model_run_prediction(trained_model, dataloader, dp, min_acc) + elif isinstance(trained_model, EvalModelTemplate): + return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc) + else: + raise NotImplementedError(f"prediction is not supported for class {trained_model.__class__}") + + +def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): # run prediction on 1 batch batch = next(iter(dataloader)) x, y = batch @@ -128,8 +132,7 @@ def run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})" -@run_prediction.register(BoringModel) -def _(trained_model, dataloader, dp=False, min_acc=0.25): +def _boring_model_run_prediction(trained_model, dataloader, dp=False, min_acc=0.25): # run prediction on 1 batch batch = next(iter(dataloader)) with torch.no_grad(): From 8ee5c990d3e1436deffa3cbb974de761e59c7994 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Sat, 19 Dec 2020 08:47:06 +0100 Subject: [PATCH 40/47] fix min acc for darwin/windows without pl_opt --- tests/models/test_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index cab5f1a0da7b1..e1eaa3ba66262 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -150,7 +150,7 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): ) model = BoringModel() - tpipes.run_model_test(trainer_options, model, on_gpu=False, min_acc=0.20) + tpipes.run_model_test(trainer_options, model, on_gpu=False, min_acc=0.05) def test_lbfgs_cpu_model(tmpdir): From 8750fbdeaa1df59b210ceac4a183bbf032b59f67 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Sat, 19 Dec 2020 09:20:51 +0100 Subject: [PATCH 41/47] eval as DEFAULT run_prediction strategy --- tests/base/develop_pipelines.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index 72121ecf5c349..254208145f2ac 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -98,10 +98,8 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25): if isinstance(trained_model, BoringModel): return _boring_model_run_prediction(trained_model, dataloader, dp, 
min_acc) - elif isinstance(trained_model, EvalModelTemplate): - return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc) else: - raise NotImplementedError(f"prediction is not supported for class {trained_model.__class__}") + return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc) def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): From 879f11ab1e4eef01a4681ce3708af4acbf25ca35 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Mon, 21 Dec 2020 08:53:34 +0100 Subject: [PATCH 42/47] Updated val_dataloader for running_test_no_val --- tests/models/test_cpu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index e1eaa3ba66262..ec6de53438923 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -237,10 +237,11 @@ def test_epoch_end(self, outputs) -> None: def test_running_test_no_val(tmpdir): - """Verify `test()` works on a model with no `val_loader`.""" + """Verify `test()` works on a model with no `val_dataloader`. It performs + train and test only""" class ModelTrainTest(BoringModel): - def val_loader(self): + def val_dataloader(self): pass def test_epoch_end(self, outputs) -> None: From b899e9c7d4ed342df730a9e64f8aa0d8fe0b28e6 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Tue, 5 Jan 2021 09:56:36 +0100 Subject: [PATCH 43/47] rebased --- tests/base/develop_pipelines.py | 47 ++----- .../data/horovod/train_default_model.py | 5 +- tests/models/test_cpu.py | 125 ++++++++---------- tests/models/test_gpu.py | 11 +- tests/models/test_hparams.py | 16 ++- tests/models/test_restore.py | 7 +- tests/trainer/test_dataloaders.py | 4 +- 7 files changed, 95 insertions(+), 120 deletions(-) diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index 254208145f2ac..24535dc67da8e 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -11,12 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import torch from pytorch_lightning import Trainer -from tests.base import BoringModel, EvalModelTemplate -from tests.base.develop_utils import get_default_logger, load_model_from_checkpoint, reset_seed +from tests.base.develop_utils import load_model_from_checkpoint, get_default_logger, \ + reset_seed def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50): @@ -32,7 +31,6 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 pretrained_model = load_model_from_checkpoint( trainer.logger, trainer.checkpoint_callback.best_model_path, - type(model) ) # test new model accuracy @@ -41,7 +39,7 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 test_loaders = [test_loaders] for dataloader in test_loaders: - run_prediction(pretrained_model, dataloader, min_acc=min_acc) + run_prediction(dataloader, pretrained_model, min_acc=min_acc) if trainer.use_ddp: # on hpc this would work fine... 
but need to hack it for the purpose of the test @@ -49,8 +47,7 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers() -def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, - with_hpc: bool = True, min_acc: float = 0.25): +def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True): reset_seed() save_dir = trainer_options['default_root_dir'] @@ -59,6 +56,9 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, logger = get_default_logger(save_dir, version=version) trainer_options.update(logger=logger) + if 'checkpoint_callback' not in trainer_options: + trainer_options.update(checkpoint_callback=True) + trainer = Trainer(**trainer_options) initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()]) result = trainer.fit(model) @@ -66,11 +66,10 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, assert result == 1, 'trainer failed' # Check that the model is actually changed post-training - change_ratio = torch.norm(initial_values - post_train_values) - assert change_ratio > 0.1, f"the model is changed of {change_ratio}" + assert torch.norm(initial_values - post_train_values) > 0.1 # test model loading - pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model)) + pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path) # test new model accuracy test_loaders = model.test_dataloader() @@ -78,15 +77,14 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, test_loaders = [test_loaders] for dataloader in test_loaders: - run_prediction(pretrained_model, dataloader, min_acc=min_acc) + run_prediction(dataloader, pretrained_model) if with_hpc: if trainer.use_ddp or trainer.use_ddp2: # on hpc this would work fine... 
but need to hack it for the purpose of the test trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers( - pretrained_model - ) + trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \ + trainer.init_optimizers(pretrained_model) # test HPC saving trainer.checkpoint_connector.hpc_save(save_dir, logger) @@ -95,14 +93,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu) -def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25): - if isinstance(trained_model, BoringModel): - return _boring_model_run_prediction(trained_model, dataloader, dp, min_acc) - else: - return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc) - - -def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): +def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50): # run prediction on 1 batch batch = next(iter(dataloader)) x, y = batch @@ -111,7 +102,7 @@ def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min if dp: with torch.no_grad(): output = trained_model(batch, 0) - acc = output['val_acc'] + acc = output['val_acc'] acc = torch.mean(acc).item() else: @@ -128,13 +119,3 @@ def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min acc = acc.item() assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})" - - -def _boring_model_run_prediction(trained_model, dataloader, dp=False, min_acc=0.25): - # run prediction on 1 batch - batch = next(iter(dataloader)) - with torch.no_grad(): - output = trained_model(batch) - acc = trained_model.loss(batch, output) - - assert acc >= min_acc, f"This model is expected to get, {min_acc} in test set but got {acc}" diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index d757d1d03f4f7..62f874902b094 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -35,7 +35,8 @@ from tests.base import EvalModelTemplate # noqa: E402 from tests.base.develop_pipelines import run_prediction # noqa: E402 -from tests.base.develop_utils import reset_seed, set_random_master_port # noqa: E402 +from tests.base.develop_utils import set_random_master_port, reset_seed # noqa: E402 + parser = argparse.ArgumentParser() parser.add_argument('--trainer-options', required=True) @@ -71,7 +72,7 @@ def run_test_from_config(trainer_options): test_loaders = [test_loaders] for dataloader in test_loaders: - run_prediction(pretrained_model, dataloader) + run_prediction(dataloader, pretrained_model) # test HPC saving trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index ec6de53438923..f59e452080039 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -21,14 +21,15 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint -from tests.base import BoringModel +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from tests.base import EvalModelTemplate @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): 
"""Verify model save/load/checkpoint on CPU.""" - model = BoringModel() + hparams = EvalModelTemplate.get_default_hparams() + model = EvalModelTemplate(**hparams) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -60,8 +61,11 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): for batch in dataloader: break + x, y = batch + x = x.view(x.size(0), -1) + model.eval() - pred_before_saving = model(batch) + pred_before_saving = model(x) # test HPC saving # simulate snapshot on slurm @@ -71,19 +75,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): # new logger file to get meta logger = tutils.get_default_logger(tmpdir, version=version) - model = BoringModel() - - class _StartCallback(Callback): - # set the epoch start hook so we can predict before the model does the full training - def on_train_epoch_start(self, trainer, model): - assert trainer.global_step == real_global_step and trainer.global_step > 0 - # predict with loaded model to make sure answers are the same - mode = model.training - model.eval() - new_pred = model(batch) - assert torch.eq(pred_before_saving, new_pred).all() - model.train(mode) - trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, @@ -91,6 +82,19 @@ def on_train_epoch_start(self, trainer, model): enable_pl_optimizer=enable_pl_optimizer, callbacks=[_StartCallback(), ModelCheckpoint(dirpath=tmpdir)], ) + model = EvalModelTemplate(**hparams) + + # set the epoch start hook so we can predict before the model does the full training + def assert_pred_same(): + assert trainer.global_step == real_global_step and trainer.global_step > 0 + + # predict with loaded model to make sure answers are the same + trainer.model.eval() + new_pred = trainer.model(x) + assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 + + model.on_epoch_start = assert_pred_same + # by calling fit again, we trigger training, loading weights from the cluster # and our hook to predict using current model before any more weight updates trainer.fit(model) @@ -107,16 +111,16 @@ def validation_epoch_end(self, outputs) -> None: stopping = EarlyStopping(monitor="val_loss", min_delta=0.1) trainer_options = dict( default_root_dir=tmpdir, + callbacks=[stopping], + max_epochs=2, gradient_clip_val=1.0, overfit_batches=0.20, track_grad_norm=2, enable_pl_optimizer=enable_pl_optimizer, progress_bar_refresh_rate=0, accumulate_grad_batches=2, - max_epochs=2, limit_train_batches=0.1, limit_val_batches=0.1, - callbacks=[stopping], ) model = ModelTrainVal() @@ -149,29 +153,26 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) - model = BoringModel() - tpipes.run_model_test(trainer_options, model, on_gpu=False, min_acc=0.05) + model = EvalModelTemplate() + tpipes.run_model_test(trainer_options, model, on_gpu=False) def test_lbfgs_cpu_model(tmpdir): - """Test each of the trainer options. 
Testing LBFGS optimizer""" - class ModelSpecifiedOptimizer(BoringModel): - def __init__(self, optimizer_name, learning_rate): - super().__init__() - self.optimizer_name = optimizer_name - self.learning_rate = learning_rate - self.save_hyperparameters() - + """Test each of the trainer options.""" trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, progress_bar_refresh_rate=0, - weights_summary="top", + weights_summary='top', limit_train_batches=0.2, limit_val_batches=0.2, ) - model = ModelSpecifiedOptimizer(optimizer_name="LBFGS", learning_rate=0.004) + hparams = EvalModelTemplate.get_default_hparams() + hparams.update(optimizer_name='lbfgs', + learning_rate=0.004) + model = EvalModelTemplate(**hparams) + model.configure_optimizers = model.configure_optimizers__lbfgs tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.25) @@ -187,8 +188,8 @@ def test_default_logger_callbacks_cpu_model(tmpdir): limit_val_batches=0.01, ) - model = BoringModel() - tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.01) + model = EvalModelTemplate() + tpipes.run_model_test_without_loggers(trainer_options, model) # test freeze on cpu model.freeze() @@ -197,17 +198,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir): def test_running_test_after_fitting(tmpdir): """Verify test() on fitted model.""" - class ModelTrainValTest(BoringModel): - - def validation_epoch_end(self, outputs) -> None: - val_loss = torch.stack([x["x"] for x in outputs]).mean() - self.log('val_loss', val_loss) - - def test_epoch_end(self, outputs) -> None: - test_loss = torch.stack([x["y"] for x in outputs]).mean() - self.log('test_loss', test_loss) - - model = ModelTrainValTest() + model = EvalModelTemplate() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -233,22 +224,12 @@ def test_epoch_end(self, outputs) -> None: trainer.test() # test we have good test accuracy - tutils.assert_ok_model_acc(trainer, key='test_loss', thr=0.5) + tutils.assert_ok_model_acc(trainer, thr=0.5) def test_running_test_no_val(tmpdir): - """Verify `test()` works on a model with no `val_dataloader`. 
It performs - train and test only""" - class ModelTrainTest(BoringModel): - - def val_dataloader(self): - pass - - def test_epoch_end(self, outputs) -> None: - test_loss = torch.stack([x["y"] for x in outputs]).mean() - self.log('test_loss', test_loss) - - model = ModelTrainTest() + """Verify `test()` works on a model with no `val_loader`.""" + model = EvalModelTemplate() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -274,12 +255,12 @@ def test_epoch_end(self, outputs) -> None: trainer.test() # test we have good test accuracy - tutils.assert_ok_model_acc(trainer, key='test_loss') + tutils.assert_ok_model_acc(trainer) def test_simple_cpu(tmpdir): """Verify continue training session on CPU.""" - model = BoringModel() + model = EvalModelTemplate() # fit model trainer = Trainer( @@ -301,7 +282,7 @@ def test_cpu_model(tmpdir): progress_bar_refresh_rate=0, max_epochs=1, limit_train_batches=0.4, - limit_val_batches=0.4, + limit_val_batches=0.4 ) model = BoringModel() @@ -325,12 +306,10 @@ def __getitem__(self, i): def __len__(self): return 1 - class BpttTestModel(BoringModel): - def __init__(self, batch_size, in_features, out_features, *args, **kwargs): + class BpttTestModel(EvalModelTemplate): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.test_hidden = None - self.batch_size = batch_size - self.layer = torch.nn.Linear(in_features, out_features) def training_step(self, batch, batch_idx, hiddens): assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps" @@ -343,10 +322,11 @@ def training_step(self, batch, batch_idx, hiddens): assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed" pred = self(x_tensor.view(batch_size, truncated_bptt_steps)) - loss_val = torch.nn.functional.mse_loss(pred, y_tensor.view(batch_size, truncated_bptt_steps)) + loss_val = torch.nn.functional.mse_loss( + pred, y_tensor.view(batch_size, truncated_bptt_steps)) return { - "loss": loss_val, - "hiddens": self.test_hidden, + 'loss': loss_val, + 'hiddens': self.test_hidden, } def training_epoch_end(self, training_step_outputs): @@ -363,8 +343,15 @@ def train_dataloader(self): sampler=None, ) - model = BpttTestModel(batch_size=batch_size, - in_features=truncated_bptt_steps, out_features=truncated_bptt_steps) + hparams = EvalModelTemplate.get_default_hparams() + hparams.update( + batch_size=batch_size, + in_features=truncated_bptt_steps, + hidden_dim=truncated_bptt_steps, + out_features=truncated_bptt_steps + ) + + model = BpttTestModel(**hparams) model.example_input_array = torch.randn(5, truncated_bptt_steps) # fit model diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index b7e669dcafd4b..cd61da7c008bc 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -21,10 +21,11 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import BoringModel +from tests.base import EvalModelTemplate +from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator + PRETEND_N_OF_GPUS = 16 @@ -42,8 +43,8 @@ def test_multi_gpu_none_backend(tmpdir): gpus=2, ) - model = BoringModel() - tpipes.run_model_test(trainer_options, model, min_acc=0.20) + model = EvalModelTemplate() + 
tpipes.run_model_test(trainer_options, model) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @@ -59,7 +60,7 @@ def test_single_gpu_model(tmpdir, gpus): gpus=gpus ) - model = BoringModel() + model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model) diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 7081d450ee256..5e5fab7d0a0b4 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -20,14 +20,14 @@ import pytest import torch from fsspec.implementations.local import LocalFileSystem -from omegaconf import Container, OmegaConf +from omegaconf import OmegaConf, Container from torch.nn import functional as F from torch.utils.data import DataLoader -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml +from pytorch_lightning import Trainer, LightningModule +from pytorch_lightning.core.saving import save_hparams_to_yaml, load_hparams_from_yaml from pytorch_lightning.utilities import AttributeDict, is_picklable -from tests.base import BoringModel, EvalModelTemplate, TrialMNIST +from tests.base import EvalModelTemplate, TrialMNIST, BoringModel class SaveHparamsModel(BoringModel): @@ -595,7 +595,13 @@ def __init__(self, **kwargs): self.save_hyperparameters() -@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving]) +class RuntimeParamChangeModelAssign(BoringModel): + def __init__(self, **kwargs): + super().__init__() + self.hparams = kwargs + + +@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving, RuntimeParamChangeModelAssign]) def test_init_arg_with_runtime_change(tmpdir, cls): """Test that we save/export only the initial hparams, no other runtime change allowed""" model = cls(running_arg=123) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 47fbf7048d547..476d1db99ce60 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -142,7 +142,6 @@ def test_callbacks_references_resume_from_checkpoint(enable_pl_optimizer, tmpdir @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_running_test_pretrained_model_distrib_dp(tmpdir): """Verify `test()` on pretrained model.""" - tutils.set_random_master_port() model = EvalModelTemplate() @@ -187,7 +186,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): dataloaders = [dataloaders] for dataloader in dataloaders: - tpipes.run_prediction(pretrained_model, dataloader) + tpipes.run_prediction(dataloader, pretrained_model) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -238,7 +237,7 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): dataloaders = [dataloaders] for dataloader in dataloaders: - tpipes.run_prediction(pretrained_model, dataloader) + tpipes.run_prediction(dataloader, pretrained_model) def test_running_test_pretrained_model_cpu(tmpdir): @@ -380,7 +379,7 @@ def assert_good_acc(): dp_model.eval() dataloader = trainer.train_dataloader - tpipes.run_prediction(dp_model, dataloader, dp=True) + tpipes.run_prediction(dataloader, dp_model, dp=True) # new model model = EvalModelTemplate(**hparams) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 614b2a8e66ab8..9b42aa98c9dd0 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -128,7 +128,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure 
predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(trainer.model, dataloader) + tpipes.run_prediction(dataloader, trainer.model) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) @@ -164,7 +164,7 @@ def test_step(self, batch, batch_idx, *args, **kwargs): # make sure predictions are good for each test set for dataloader in trainer.test_dataloaders: - tpipes.run_prediction(trainer.model, dataloader) + tpipes.run_prediction(dataloader, trainer.model) # run the test method trainer.test(ckpt_path=ckpt_path) From a9589ee09108e63e78231a80c1c91c58f8eae0ac Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Tue, 5 Jan 2021 10:14:35 +0100 Subject: [PATCH 44/47] Revert "Updated val_dataloader for running_test_no_val" This reverts commit 5c0b31ce9ff18d217939f3ed01f81343894a7459. --- tests/models/test_cpu.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index f59e452080039..c4bfc9ea70565 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -229,7 +229,16 @@ def test_running_test_after_fitting(tmpdir): def test_running_test_no_val(tmpdir): """Verify `test()` works on a model with no `val_loader`.""" - model = EvalModelTemplate() + class ModelTrainTest(BoringModel): + + def val_loader(self): + pass + + def test_epoch_end(self, outputs) -> None: + test_loss = torch.stack([x["y"] for x in outputs]).mean() + self.log('test_loss', test_loss) + + model = ModelTrainTest() # logger file to get meta logger = tutils.get_default_logger(tmpdir) From 64236115a546d54e3dd40759957b3b775dcacc18 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Tue, 5 Jan 2021 10:19:02 +0100 Subject: [PATCH 45/47] Revert "Revert "Updated val_dataloader for running_test_no_val"" This reverts commit 89a25795b5082dbb672709449554f9180b96fdfe. --- tests/base/develop_pipelines.py | 46 +++++-- .../data/horovod/train_default_model.py | 6 +- tests/models/test_cpu.py | 120 +++++++++--------- tests/models/test_gpu.py | 11 +- tests/models/test_hparams.py | 16 +-- tests/models/test_restore.py | 7 +- tests/trainer/test_dataloaders.py | 4 +- 7 files changed, 112 insertions(+), 98 deletions(-) diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index 24535dc67da8e..b6289079a35ab 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -14,8 +14,8 @@ import torch from pytorch_lightning import Trainer -from tests.base.develop_utils import load_model_from_checkpoint, get_default_logger, \ - reset_seed +from tests.base import BoringModel +from tests.base.develop_utils import get_default_logger, load_model_from_checkpoint, reset_seed def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50): @@ -31,6 +31,7 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 pretrained_model = load_model_from_checkpoint( trainer.logger, trainer.checkpoint_callback.best_model_path, + type(model) ) # test new model accuracy @@ -39,7 +40,7 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 test_loaders = [test_loaders] for dataloader in test_loaders: - run_prediction(dataloader, pretrained_model, min_acc=min_acc) + run_prediction(pretrained_model, dataloader, min_acc=min_acc) if trainer.use_ddp: # on hpc this would work fine... 
but need to hack it for the purpose of the test
@@ -47,7 +48,8 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50
     trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers()
 
 
-def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True):
+def run_model_test(trainer_options, model, on_gpu: bool = True, version=None,
+                   with_hpc: bool = True, min_acc: float = 0.25):
     reset_seed()
 
     save_dir = trainer_options['default_root_dir']
@@ -56,9 +58,6 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi
     logger = get_default_logger(save_dir, version=version)
     trainer_options.update(logger=logger)
 
-    if 'checkpoint_callback' not in trainer_options:
-        trainer_options.update(checkpoint_callback=True)
-
     trainer = Trainer(**trainer_options)
     initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
     result = trainer.fit(model)
@@ -66,10 +65,11 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi
     assert result == 1, 'trainer failed'
 
     # Check that the model is actually changed post-training
-    assert torch.norm(initial_values - post_train_values) > 0.1
+    change_ratio = torch.norm(initial_values - post_train_values)
+    assert change_ratio > 0.1, f"the model changed by only {change_ratio}, expected a change greater than 0.1"
 
     # test model loading
-    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path)
+    pretrained_model = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model))
 
     # test new model accuracy
     test_loaders = model.test_dataloader()
@@ -77,14 +77,15 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi
         test_loaders = [test_loaders]
 
     for dataloader in test_loaders:
-        run_prediction(dataloader, pretrained_model)
+        run_prediction(pretrained_model, dataloader, min_acc=min_acc)
 
     if with_hpc:
         if trainer.use_ddp or trainer.use_ddp2:
             # on hpc this would work fine...
but need to hack it for the purpose of the test
             trainer.model = pretrained_model
-            trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \
-                trainer.init_optimizers(pretrained_model)
+            trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers(
+                pretrained_model
+            )
 
         # test HPC saving
         trainer.checkpoint_connector.hpc_save(save_dir, logger)
@@ -93,7 +94,14 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi
         trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu)
 
 
-def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50):
+def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25):
+    if isinstance(trained_model, BoringModel):
+        return _boring_model_run_prediction(trained_model, dataloader, dp, min_acc)
+    else:
+        return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc)
+
+
+def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50):
     # run prediction on 1 batch
     batch = next(iter(dataloader))
     x, y = batch
@@ -102,7 +110,7 @@ def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50):
     if dp:
         with torch.no_grad():
             output = trained_model(batch, 0)
-            acc = output['val_acc']
+        acc = output['val_acc']
 
         acc = torch.mean(acc).item()
 
     else:
@@ -119,3 +127,13 @@ def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50):
         acc = acc.item()
 
     assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})"
+
+
+def _boring_model_run_prediction(trained_model, dataloader, dp=False, min_acc=0.25):
+    # run prediction on 1 batch
+    batch = next(iter(dataloader))
+    with torch.no_grad():
+        output = trained_model(batch)
+        acc = trained_model.loss(batch, output)
+
+    assert acc >= min_acc, f"This model is expected to get >= {min_acc} in test set but got {acc}"
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index 62f874902b094..084c6e95993b9 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -24,6 +24,7 @@
 # this is need as e.g.
Conda do not uses `PYTHONPATH` env var as pip or/and virtualenv sys.path = os.getenv('PYTHONPATH').split(':') + sys.path + from pytorch_lightning import Trainer # noqa: E402 from pytorch_lightning.callbacks import ModelCheckpoint # noqa: E402 from pytorch_lightning.utilities import _HOROVOD_AVAILABLE # noqa: E402 @@ -35,8 +36,7 @@ from tests.base import EvalModelTemplate # noqa: E402 from tests.base.develop_pipelines import run_prediction # noqa: E402 -from tests.base.develop_utils import set_random_master_port, reset_seed # noqa: E402 - +from tests.base.develop_utils import reset_seed, set_random_master_port # noqa: E402 parser = argparse.ArgumentParser() parser.add_argument('--trainer-options', required=True) @@ -72,7 +72,7 @@ def run_test_from_config(trainer_options): test_loaders = [test_loaders] for dataloader in test_loaders: - run_prediction(dataloader, pretrained_model) + run_prediction(pretrained_model, dataloader) # test HPC saving trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index c4bfc9ea70565..cc24f6f187502 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -21,15 +21,14 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint -from tests.base import EvalModelTemplate +from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint +from tests.base import BoringModel @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): """Verify model save/load/checkpoint on CPU.""" - hparams = EvalModelTemplate.get_default_hparams() - model = EvalModelTemplate(**hparams) + model = BoringModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -61,11 +60,8 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): for batch in dataloader: break - x, y = batch - x = x.view(x.size(0), -1) - model.eval() - pred_before_saving = model(x) + pred_before_saving = model(batch) # test HPC saving # simulate snapshot on slurm @@ -75,6 +71,19 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): # new logger file to get meta logger = tutils.get_default_logger(tmpdir, version=version) + model = BoringModel() + + class _StartCallback(Callback): + # set the epoch start hook so we can predict before the model does the full training + def on_train_epoch_start(self, trainer, model): + assert trainer.global_step == real_global_step and trainer.global_step > 0 + # predict with loaded model to make sure answers are the same + mode = model.training + model.eval() + new_pred = model(batch) + assert torch.eq(pred_before_saving, new_pred).all() + model.train(mode) + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, @@ -82,19 +91,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, callbacks=[_StartCallback(), ModelCheckpoint(dirpath=tmpdir)], ) - model = EvalModelTemplate(**hparams) - - # set the epoch start hook so we can predict before the model does the full training - def assert_pred_same(): - assert trainer.global_step == real_global_step and trainer.global_step > 0 - - # predict with loaded model to make sure answers are the same - trainer.model.eval() - new_pred = trainer.model(x) - assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 - - model.on_epoch_start = 
assert_pred_same - # by calling fit again, we trigger training, loading weights from the cluster # and our hook to predict using current model before any more weight updates trainer.fit(model) @@ -103,16 +99,14 @@ def assert_pred_same(): @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): class ModelTrainVal(BoringModel): - def validation_epoch_end(self, outputs) -> None: val_loss = torch.stack([x["x"] for x in outputs]).mean() self.log('val_loss', val_loss) stopping = EarlyStopping(monitor="val_loss", min_delta=0.1) trainer_options = dict( - default_root_dir=tmpdir, callbacks=[stopping], - max_epochs=2, + default_root_dir=tmpdir, gradient_clip_val=1.0, overfit_batches=0.20, track_grad_norm=2, @@ -153,26 +147,29 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) - model = EvalModelTemplate() - tpipes.run_model_test(trainer_options, model, on_gpu=False) + model = BoringModel() + tpipes.run_model_test(trainer_options, model, on_gpu=False, min_acc=0.05) def test_lbfgs_cpu_model(tmpdir): - """Test each of the trainer options.""" + """Test each of the trainer options. Testing LBFGS optimizer""" + class ModelSpecifiedOptimizer(BoringModel): + def __init__(self, optimizer_name, learning_rate): + super().__init__() + self.optimizer_name = optimizer_name + self.learning_rate = learning_rate + self.save_hyperparameters() + trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, progress_bar_refresh_rate=0, - weights_summary='top', + weights_summary="top", limit_train_batches=0.2, limit_val_batches=0.2, ) - hparams = EvalModelTemplate.get_default_hparams() - hparams.update(optimizer_name='lbfgs', - learning_rate=0.004) - model = EvalModelTemplate(**hparams) - model.configure_optimizers = model.configure_optimizers__lbfgs + model = ModelSpecifiedOptimizer(optimizer_name="LBFGS", learning_rate=0.004) tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.25) @@ -188,8 +185,8 @@ def test_default_logger_callbacks_cpu_model(tmpdir): limit_val_batches=0.01, ) - model = EvalModelTemplate() - tpipes.run_model_test_without_loggers(trainer_options, model) + model = BoringModel() + tpipes.run_model_test_without_loggers(trainer_options, model, min_acc=0.01) # test freeze on cpu model.freeze() @@ -198,7 +195,17 @@ def test_default_logger_callbacks_cpu_model(tmpdir): def test_running_test_after_fitting(tmpdir): """Verify test() on fitted model.""" - model = EvalModelTemplate() + class ModelTrainValTest(BoringModel): + + def validation_epoch_end(self, outputs) -> None: + val_loss = torch.stack([x["x"] for x in outputs]).mean() + self.log('val_loss', val_loss) + + def test_epoch_end(self, outputs) -> None: + test_loss = torch.stack([x["y"] for x in outputs]).mean() + self.log('test_loss', test_loss) + + model = ModelTrainValTest() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -224,14 +231,15 @@ def test_running_test_after_fitting(tmpdir): trainer.test() # test we have good test accuracy - tutils.assert_ok_model_acc(trainer, thr=0.5) + tutils.assert_ok_model_acc(trainer, key='test_loss', thr=0.5) def test_running_test_no_val(tmpdir): - """Verify `test()` works on a model with no `val_loader`.""" + """Verify `test()` works on a model with no `val_dataloader`. 
It performs + train and test only""" class ModelTrainTest(BoringModel): - def val_loader(self): + def val_dataloader(self): pass def test_epoch_end(self, outputs) -> None: @@ -264,12 +272,12 @@ def test_epoch_end(self, outputs) -> None: trainer.test() # test we have good test accuracy - tutils.assert_ok_model_acc(trainer) + tutils.assert_ok_model_acc(trainer, key='test_loss') def test_simple_cpu(tmpdir): """Verify continue training session on CPU.""" - model = EvalModelTemplate() + model = BoringModel() # fit model trainer = Trainer( @@ -291,7 +299,7 @@ def test_cpu_model(tmpdir): progress_bar_refresh_rate=0, max_epochs=1, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = BoringModel() @@ -315,10 +323,12 @@ def __getitem__(self, i): def __len__(self): return 1 - class BpttTestModel(EvalModelTemplate): - def __init__(self, *args, **kwargs): + class BpttTestModel(BoringModel): + def __init__(self, batch_size, in_features, out_features, *args, **kwargs): super().__init__(*args, **kwargs) self.test_hidden = None + self.batch_size = batch_size + self.layer = torch.nn.Linear(in_features, out_features) def training_step(self, batch, batch_idx, hiddens): assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps" @@ -331,18 +341,17 @@ def training_step(self, batch, batch_idx, hiddens): assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed" pred = self(x_tensor.view(batch_size, truncated_bptt_steps)) - loss_val = torch.nn.functional.mse_loss( - pred, y_tensor.view(batch_size, truncated_bptt_steps)) + loss_val = torch.nn.functional.mse_loss(pred, y_tensor.view(batch_size, truncated_bptt_steps)) return { - 'loss': loss_val, - 'hiddens': self.test_hidden, + "loss": loss_val, + "hiddens": self.test_hidden, } def training_epoch_end(self, training_step_outputs): training_step_outputs = training_step_outputs[0] assert len(training_step_outputs) == (sequence_size / truncated_bptt_steps) - loss = torch.stack([x['loss'] for x in training_step_outputs]).mean() - self.log('train_loss', loss) + loss = torch.stack([x["loss"] for x in training_step_outputs]).mean() + self.log("train_loss", loss) def train_dataloader(self): return torch.utils.data.DataLoader( @@ -352,15 +361,8 @@ def train_dataloader(self): sampler=None, ) - hparams = EvalModelTemplate.get_default_hparams() - hparams.update( - batch_size=batch_size, - in_features=truncated_bptt_steps, - hidden_dim=truncated_bptt_steps, - out_features=truncated_bptt_steps - ) - - model = BpttTestModel(**hparams) + model = BpttTestModel(batch_size=batch_size, + in_features=truncated_bptt_steps, out_features=truncated_bptt_steps) model.example_input_array = torch.randn(5, truncated_bptt_steps) # fit model diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index cd61da7c008bc..b7e669dcafd4b 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -21,11 +21,10 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator - +from tests.base import BoringModel PRETEND_N_OF_GPUS = 16 @@ -43,8 +42,8 @@ def test_multi_gpu_none_backend(tmpdir): gpus=2, ) - model = EvalModelTemplate() 
- tpipes.run_model_test(trainer_options, model) + model = BoringModel() + tpipes.run_model_test(trainer_options, model, min_acc=0.20) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @@ -60,7 +59,7 @@ def test_single_gpu_model(tmpdir, gpus): gpus=gpus ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model) diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 5e5fab7d0a0b4..7081d450ee256 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -20,14 +20,14 @@ import pytest import torch from fsspec.implementations.local import LocalFileSystem -from omegaconf import OmegaConf, Container +from omegaconf import Container, OmegaConf from torch.nn import functional as F from torch.utils.data import DataLoader -from pytorch_lightning import Trainer, LightningModule -from pytorch_lightning.core.saving import save_hparams_to_yaml, load_hparams_from_yaml +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml from pytorch_lightning.utilities import AttributeDict, is_picklable -from tests.base import EvalModelTemplate, TrialMNIST, BoringModel +from tests.base import BoringModel, EvalModelTemplate, TrialMNIST class SaveHparamsModel(BoringModel): @@ -595,13 +595,7 @@ def __init__(self, **kwargs): self.save_hyperparameters() -class RuntimeParamChangeModelAssign(BoringModel): - def __init__(self, **kwargs): - super().__init__() - self.hparams = kwargs - - -@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving, RuntimeParamChangeModelAssign]) +@pytest.mark.parametrize("cls", [RuntimeParamChangeModelSaving]) def test_init_arg_with_runtime_change(tmpdir, cls): """Test that we save/export only the initial hparams, no other runtime change allowed""" model = cls(running_arg=123) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 476d1db99ce60..47fbf7048d547 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -142,6 +142,7 @@ def test_callbacks_references_resume_from_checkpoint(enable_pl_optimizer, tmpdir @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_running_test_pretrained_model_distrib_dp(tmpdir): """Verify `test()` on pretrained model.""" + tutils.set_random_master_port() model = EvalModelTemplate() @@ -186,7 +187,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): dataloaders = [dataloaders] for dataloader in dataloaders: - tpipes.run_prediction(dataloader, pretrained_model) + tpipes.run_prediction(pretrained_model, dataloader) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -237,7 +238,7 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): dataloaders = [dataloaders] for dataloader in dataloaders: - tpipes.run_prediction(dataloader, pretrained_model) + tpipes.run_prediction(pretrained_model, dataloader) def test_running_test_pretrained_model_cpu(tmpdir): @@ -379,7 +380,7 @@ def assert_good_acc(): dp_model.eval() dataloader = trainer.train_dataloader - tpipes.run_prediction(dataloader, dp_model, dp=True) + tpipes.run_prediction(dp_model, dataloader, dp=True) # new model model = EvalModelTemplate(**hparams) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 9b42aa98c9dd0..614b2a8e66ab8 100644 --- a/tests/trainer/test_dataloaders.py +++ 
b/tests/trainer/test_dataloaders.py @@ -128,7 +128,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(dataloader, trainer.model) + tpipes.run_prediction(trainer.model, dataloader) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) @@ -164,7 +164,7 @@ def test_step(self, batch, batch_idx, *args, **kwargs): # make sure predictions are good for each test set for dataloader in trainer.test_dataloaders: - tpipes.run_prediction(dataloader, trainer.model) + tpipes.run_prediction(trainer.model, dataloader) # run the test method trainer.test(ckpt_path=ckpt_path) From e7e5f6772dfd7b6f34a9a2fec80c49d6efe7e674 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Tue, 5 Jan 2021 10:52:36 +0100 Subject: [PATCH 46/47] reverting unwanted modifications --- .../04-transformers-text-classification.ipynb | 7 ---- notebooks/05-trainer-flags-overview.ipynb | 7 ---- pytorch_lightning/trainer/supporters.py | 36 +++++-------------- setup.py | 2 +- tests/base/model_train_dataloaders.py | 2 +- tests/base/model_train_steps.py | 1 + .../data/horovod/train_default_model.py | 4 +-- tests/trainer/test_dataloaders.py | 7 ++-- tests/trainer/test_supporters.py | 20 ++--------- tests/utilities/distributed.py | 1 - 10 files changed, 19 insertions(+), 68 deletions(-) diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb index d52af84a76d97..037b24e4ddd9d 100644 --- a/notebooks/04-transformers-text-classification.ipynb +++ b/notebooks/04-transformers-text-classification.ipynb @@ -1,12 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb index da044a9c9b5c6..6413e8239bb2e 100644 --- a/notebooks/05-trainer-flags-overview.ipynb +++ b/notebooks/05-trainer-flags-overview.ipynb @@ -1,12 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, { "cell_type": "markdown", "metadata": { diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index db51fb8014de0..81d4f0cfcbcf1 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -24,8 +24,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence from typing import Any, Union -from pytorch_lightning.utilities.exceptions import MisconfigurationException - class TensorRunningAccum(object): """Tracks a running accumulation values (min, max, mean) without graph @@ -253,34 +251,19 @@ class CombinedDataset(object): """ Combine multiple datasets and compute their statistics """ - COMPUTE_FUNCS = {'min_size': min, 'max_size_cycle': max} - - def __init__(self, datasets: Union[Sequence, Mapping], mode: str = 'min_size'): + def __init__(self, datasets: Union[Sequence, Mapping], mode: str): """ Args: datasets: a sequence/mapping datasets. Can be a collections of torch.utils.Dataset, Iterable or even None. - mode: whether to use the minimum number of batches in all samples or the maximum + mode: whether to use the minimum number of batches in all samples or the maximum number of batches in all samples. 
""" self.datasets = datasets - if mode not in self.COMPUTE_FUNCS.keys(): - raise MisconfigurationException( - f'You have selected unsupported mode "{mode}",' - f' please select one the: {list(self.COMPUTE_FUNCS.keys())}.' - ) self.mode = mode - @property - def max_len(self) -> Union[int, float]: - return self._calc_num_data(self.datasets, 'max_size_cycle') - - @property - def min_len(self) -> Union[int, float]: - return self._calc_num_data(self.datasets, 'min_size') - @staticmethod def _calc_num_data(datasets: Union[Sequence, Mapping], mode: str) -> Union[int, float]: """ @@ -296,14 +279,14 @@ def _calc_num_data(datasets: Union[Sequence, Mapping], mode: str) -> Union[int, length: the length of `CombinedDataset` """ - if mode not in CombinedDataset.COMPUTE_FUNCS.keys(): - raise MisconfigurationException(f"Invalid Mode: {mode}") + if mode not in ['min_size', 'max_size_cycle']: + raise ValueError(f"Invalid Mode: {mode}") # extract the lengths all_lengths = apply_to_collection(datasets, (Dataset, Iterable, type(None)), get_len, wrong_dtype=(Sequence, Mapping)) - compute_func = CombinedDataset.COMPUTE_FUNCS[mode] + compute_func = {'min_size': min, 'max_size_cycle': max} if isinstance(all_lengths, (int, float)): length = all_lengths @@ -324,7 +307,7 @@ def __len__(self) -> int: class CombinedLoader(object): """ Combines different dataloaders and allows sampling in parallel. - + Supported modes are 'min_size', which raises StopIteration after the shortest loader (the one with the lowest number of batches) is done, and 'max_size_cycle` which raises StopIteration after the longest loader (the one with most batches) is done, while cycling @@ -365,7 +348,7 @@ def __init__(self, loaders: Any, mode: str = 'min_size'): self.dataset = CombinedDataset(datasets, mode) if mode not in self.SUPPORTED_MODES: - raise MisconfigurationException(f"Invalid Mode: {mode}") + raise ValueError(f"Invalid Mode: {mode}") self.mode = mode @@ -403,9 +386,8 @@ def _wrap_loaders_max_size_cycle(self) -> Any: for k, v in self.loaders.items()}) elif isinstance(self.loaders, Sequence): - self.loaders = type(self.loaders)([ - CycleIterator(v, length=length) for v in self.loaders - ]) + self.loaders = type(self.loaders)([CycleIterator(v, length=length) + for v in self.loaders]) # dataloaders are iterable but not sequence elif isinstance(self.loaders, Iterable): diff --git a/setup.py b/setup.py index 1adb4237b3954..6b68d5524167d 100755 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ # the goal of the project is simplicity for researchers, don't want to add too much # engineer specific practices setup( - name='pytorch-lightning', + name="pytorch-lightning-nightly", version=pytorch_lightning.__version__, description=pytorch_lightning.__docs__, author=pytorch_lightning.__author__, diff --git a/tests/base/model_train_dataloaders.py b/tests/base/model_train_dataloaders.py index 0cc6b7e9e14db..65873cfa8d6c4 100644 --- a/tests/base/model_train_dataloaders.py +++ b/tests/base/model_train_dataloaders.py @@ -42,7 +42,7 @@ def train_dataloader__multiple_mapping(self): """Return a mapping loaders with different lengths""" return {'a': self.dataloader(train=True, num_samples=100), 'b': self.dataloader(train=True, num_samples=50)} - + def train_dataloader__multiple_sequence(self): return [self.dataloader(train=True, num_samples=100), self.dataloader(train=True, num_samples=50)] diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index e39dd47aa565b..e12d004db8f98 100644 --- a/tests/base/model_train_steps.py +++ 
b/tests/base/model_train_steps.py @@ -174,3 +174,4 @@ def training_step__multiple_dataloaders(self, batch, batch_idx, optimizer_idx=No } ) return output + diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 084c6e95993b9..c38b5b4efafe8 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -24,7 +24,6 @@ # this is need as e.g. Conda do not uses `PYTHONPATH` env var as pip or/and virtualenv sys.path = os.getenv('PYTHONPATH').split(':') + sys.path - from pytorch_lightning import Trainer # noqa: E402 from pytorch_lightning.callbacks import ModelCheckpoint # noqa: E402 from pytorch_lightning.utilities import _HOROVOD_AVAILABLE # noqa: E402 @@ -36,7 +35,8 @@ from tests.base import EvalModelTemplate # noqa: E402 from tests.base.develop_pipelines import run_prediction # noqa: E402 -from tests.base.develop_utils import reset_seed, set_random_master_port # noqa: E402 +from tests.base.develop_utils import set_random_master_port, reset_seed # noqa: E402 + parser = argparse.ArgumentParser() parser.add_argument('--trainer-options', required=True) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 614b2a8e66ab8..426a23b13a8f1 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -919,14 +919,11 @@ def test_fit_multiple_train_loaders(tmpdir, multiple_trainloader_mode, num_train """Integration test for multple train loaders""" model = EvalModelTemplate() - model.train_dataloader = model.train_dataloader__multiple_mapping - # todo: add also `train_dataloader__multiple_sequence` + model.train_dataloader = model.train_dataloader__multiple model.training_step = model.training_step__multiple_dataloaders trainer = Trainer( - max_epochs=1, - default_root_dir=tmpdir, - multiple_trainloader_mode=multiple_trainloader_mode, + max_epochs=1, default_root_dir=tmpdir, multiple_trainloader_mode=multiple_trainloader_mode ) assert 1 == trainer.fit(model) diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py index 6195d7ddeb0b0..88812f01b1a22 100644 --- a/tests/trainer/test_supporters.py +++ b/tests/trainer/test_supporters.py @@ -1,16 +1,3 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from collections import Sequence import pytest @@ -18,7 +5,6 @@ from torch.utils.data import TensorDataset from pytorch_lightning.trainer.supporters import CycleIterator, CombinedLoader, CombinedDataset, CombinedLoaderIterator -from pytorch_lightning.utilities.exceptions import MisconfigurationException def test_cycle_iterator(): @@ -59,7 +45,7 @@ def test_combined_dataset(dataset_1, dataset_2): def test_combined_dataset_length_mode_error(): - with pytest.raises(MisconfigurationException, match='Invalid Mode'): + with pytest.raises(ValueError, match='Invalid Mode'): CombinedDataset._calc_num_data([range(10)], 'test') @@ -80,8 +66,8 @@ def test_combined_loader_iterator_dict_min_size(): def test_combined_loader_init_mode_error(): """Test the ValueError when constructing `CombinedLoader`""" - with pytest.raises(MisconfigurationException, match='selected unsupported mode'): - CombinedLoader([range(10)], 'testtt') + with pytest.raises(ValueError, match='Invalid Mode'): + CombinedLoader([range(10)], 'test') def test_combined_loader_loader_type_error(): diff --git a/tests/utilities/distributed.py b/tests/utilities/distributed.py index c569f51143f25..80c0246ce6c57 100644 --- a/tests/utilities/distributed.py +++ b/tests/utilities/distributed.py @@ -41,5 +41,4 @@ def call_training_script(module_file, cli_args, method, tmpdir, timeout=60): except TimeoutExpired: p.kill() std, err = p.communicate() - return std, err From 87ef88f82b0822b37f03f64e3ddf99e65b794a90 Mon Sep 17 00:00:00 2001 From: gianscarpe Date: Tue, 5 Jan 2021 11:29:03 +0100 Subject: [PATCH 47/47] unwanted changes --- pytorch_lightning/trainer/supporters.py | 36 ++++++++++++++++++------- tests/base/model_train_dataloaders.py | 2 +- tests/base/model_train_steps.py | 1 - tests/trainer/test_dataloaders.py | 7 +++-- tests/trainer/test_supporters.py | 20 +++++++++++--- 5 files changed, 50 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 81d4f0cfcbcf1..db51fb8014de0 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -24,6 +24,8 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence from typing import Any, Union +from pytorch_lightning.utilities.exceptions import MisconfigurationException + class TensorRunningAccum(object): """Tracks a running accumulation values (min, max, mean) without graph @@ -251,19 +253,34 @@ class CombinedDataset(object): """ Combine multiple datasets and compute their statistics """ - def __init__(self, datasets: Union[Sequence, Mapping], mode: str): + COMPUTE_FUNCS = {'min_size': min, 'max_size_cycle': max} + + def __init__(self, datasets: Union[Sequence, Mapping], mode: str = 'min_size'): """ Args: datasets: a sequence/mapping datasets. Can be a collections of torch.utils.Dataset, Iterable or even None. - mode: whether to use the minimum number of batches in all samples or the maximum + mode: whether to use the minimum number of batches in all samples or the maximum number of batches in all samples. """ self.datasets = datasets + if mode not in self.COMPUTE_FUNCS.keys(): + raise MisconfigurationException( + f'You have selected unsupported mode "{mode}",' + f' please select one the: {list(self.COMPUTE_FUNCS.keys())}.' 
+ ) self.mode = mode + @property + def max_len(self) -> Union[int, float]: + return self._calc_num_data(self.datasets, 'max_size_cycle') + + @property + def min_len(self) -> Union[int, float]: + return self._calc_num_data(self.datasets, 'min_size') + @staticmethod def _calc_num_data(datasets: Union[Sequence, Mapping], mode: str) -> Union[int, float]: """ @@ -279,14 +296,14 @@ def _calc_num_data(datasets: Union[Sequence, Mapping], mode: str) -> Union[int, length: the length of `CombinedDataset` """ - if mode not in ['min_size', 'max_size_cycle']: - raise ValueError(f"Invalid Mode: {mode}") + if mode not in CombinedDataset.COMPUTE_FUNCS.keys(): + raise MisconfigurationException(f"Invalid Mode: {mode}") # extract the lengths all_lengths = apply_to_collection(datasets, (Dataset, Iterable, type(None)), get_len, wrong_dtype=(Sequence, Mapping)) - compute_func = {'min_size': min, 'max_size_cycle': max} + compute_func = CombinedDataset.COMPUTE_FUNCS[mode] if isinstance(all_lengths, (int, float)): length = all_lengths @@ -307,7 +324,7 @@ def __len__(self) -> int: class CombinedLoader(object): """ Combines different dataloaders and allows sampling in parallel. - + Supported modes are 'min_size', which raises StopIteration after the shortest loader (the one with the lowest number of batches) is done, and 'max_size_cycle` which raises StopIteration after the longest loader (the one with most batches) is done, while cycling @@ -348,7 +365,7 @@ def __init__(self, loaders: Any, mode: str = 'min_size'): self.dataset = CombinedDataset(datasets, mode) if mode not in self.SUPPORTED_MODES: - raise ValueError(f"Invalid Mode: {mode}") + raise MisconfigurationException(f"Invalid Mode: {mode}") self.mode = mode @@ -386,8 +403,9 @@ def _wrap_loaders_max_size_cycle(self) -> Any: for k, v in self.loaders.items()}) elif isinstance(self.loaders, Sequence): - self.loaders = type(self.loaders)([CycleIterator(v, length=length) - for v in self.loaders]) + self.loaders = type(self.loaders)([ + CycleIterator(v, length=length) for v in self.loaders + ]) # dataloaders are iterable but not sequence elif isinstance(self.loaders, Iterable): diff --git a/tests/base/model_train_dataloaders.py b/tests/base/model_train_dataloaders.py index 65873cfa8d6c4..0cc6b7e9e14db 100644 --- a/tests/base/model_train_dataloaders.py +++ b/tests/base/model_train_dataloaders.py @@ -42,7 +42,7 @@ def train_dataloader__multiple_mapping(self): """Return a mapping loaders with different lengths""" return {'a': self.dataloader(train=True, num_samples=100), 'b': self.dataloader(train=True, num_samples=50)} - + def train_dataloader__multiple_sequence(self): return [self.dataloader(train=True, num_samples=100), self.dataloader(train=True, num_samples=50)] diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index e12d004db8f98..e39dd47aa565b 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -174,4 +174,3 @@ def training_step__multiple_dataloaders(self, batch, batch_idx, optimizer_idx=No } ) return output - diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 426a23b13a8f1..614b2a8e66ab8 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -919,11 +919,14 @@ def test_fit_multiple_train_loaders(tmpdir, multiple_trainloader_mode, num_train """Integration test for multple train loaders""" model = EvalModelTemplate() - model.train_dataloader = model.train_dataloader__multiple + model.train_dataloader = 
model.train_dataloader__multiple_mapping + # todo: add also `train_dataloader__multiple_sequence` model.training_step = model.training_step__multiple_dataloaders trainer = Trainer( - max_epochs=1, default_root_dir=tmpdir, multiple_trainloader_mode=multiple_trainloader_mode + max_epochs=1, + default_root_dir=tmpdir, + multiple_trainloader_mode=multiple_trainloader_mode, ) assert 1 == trainer.fit(model) diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py index 88812f01b1a22..6195d7ddeb0b0 100644 --- a/tests/trainer/test_supporters.py +++ b/tests/trainer/test_supporters.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from collections import Sequence import pytest @@ -5,6 +18,7 @@ from torch.utils.data import TensorDataset from pytorch_lightning.trainer.supporters import CycleIterator, CombinedLoader, CombinedDataset, CombinedLoaderIterator +from pytorch_lightning.utilities.exceptions import MisconfigurationException def test_cycle_iterator(): @@ -45,7 +59,7 @@ def test_combined_dataset(dataset_1, dataset_2): def test_combined_dataset_length_mode_error(): - with pytest.raises(ValueError, match='Invalid Mode'): + with pytest.raises(MisconfigurationException, match='Invalid Mode'): CombinedDataset._calc_num_data([range(10)], 'test') @@ -66,8 +80,8 @@ def test_combined_loader_iterator_dict_min_size(): def test_combined_loader_init_mode_error(): """Test the ValueError when constructing `CombinedLoader`""" - with pytest.raises(ValueError, match='Invalid Mode'): - CombinedLoader([range(10)], 'test') + with pytest.raises(MisconfigurationException, match='selected unsupported mode'): + CombinedLoader([range(10)], 'testtt') def test_combined_loader_loader_type_error():
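
A note on the final shape of `run_prediction` (patches 45-47): it now dispatches on the model class, scoring BoringModel-style models through their own loss because they expose no accuracy metric. The sketch below illustrates that dispatch pattern in isolation; `SimpleBoringModel` and the driver at the bottom are hypothetical stand-ins, not code from this series.

    import torch
    from torch.utils.data import DataLoader


    class SimpleBoringModel(torch.nn.Module):
        """Hypothetical stand-in for BoringModel: a forward() plus a loss() helper."""

        def __init__(self, in_features=32, out_features=2):
            super().__init__()
            self.layer = torch.nn.Linear(in_features, out_features)

        def forward(self, x):
            return self.layer(x)

        def loss(self, batch, prediction):
            # BoringModel-style loss: score against a constant target, so no
            # labels are needed to evaluate random data
            return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))


    def _boring_model_run_prediction(trained_model, dataloader, min_acc=0.25):
        # score a single batch through the model's own loss
        batch = next(iter(dataloader))
        with torch.no_grad():
            output = trained_model(batch)
            score = trained_model.loss(batch, output)
        assert score >= min_acc, f"expected a score of at least {min_acc}, got {score}"


    def run_prediction(trained_model, dataloader, min_acc=0.25):
        # dispatch on the model class; the real helper falls back to an
        # accuracy-based check for EvalModelTemplate instances
        if isinstance(trained_model, SimpleBoringModel):
            return _boring_model_run_prediction(trained_model, dataloader, min_acc)
        raise TypeError(f"no prediction runner for {type(trained_model).__name__}")


    if __name__ == "__main__":
        run_prediction(SimpleBoringModel(), DataLoader(torch.randn(64, 32), batch_size=8))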
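
Similarly, the `test_cpu_slurm_save_load` rework in patch 45 hinges on one invariant: predictions captured before an HPC-style snapshot must match predictions made after the weights are restored into a fresh model. Stripped of the Trainer and `_StartCallback` machinery, that invariant is a plain state-dict round trip; the sketch below uses only torch and is illustrative, not the test itself.

    import os
    import tempfile

    import torch

    model = torch.nn.Linear(32, 2)
    batch = torch.randn(4, 32)

    model.eval()
    with torch.no_grad():
        pred_before_saving = model(batch)

    # simulate the snapshot: save, rebuild from scratch, restore the weights
    with tempfile.TemporaryDirectory() as tmpdir:
        ckpt_path = os.path.join(tmpdir, "ckpt.pt")
        torch.save(model.state_dict(), ckpt_path)
        restored = torch.nn.Linear(32, 2)
        restored.load_state_dict(torch.load(ckpt_path))

    restored.eval()
    with torch.no_grad():
        new_pred = restored(batch)

    # the same equality check _StartCallback performs inside the resumed run
    assert torch.eq(pred_before_saving, new_pred).all()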
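
Finally, patch 47 restores the `COMPUTE_FUNCS`-based mode validation in `pytorch_lightning/trainer/supporters.py` that patch 46 had stripped back out. The pattern is worth noting: one dict doubles as the registry of supported modes and the reducer for each mode. Below is a toy analogue with assumed names, using a plain `ValueError` where the real code raises `MisconfigurationException`.

    from collections.abc import Mapping

    # keys are the supported modes, values reduce the per-dataset lengths
    COMPUTE_FUNCS = {"min_size": min, "max_size_cycle": max}


    class CombinedLengths:
        """Toy analogue of CombinedDataset's length computation."""

        def __init__(self, lengths, mode="min_size"):
            if mode not in COMPUTE_FUNCS:
                raise ValueError(
                    f'You have selected unsupported mode "{mode}",'
                    f' please select one of: {list(COMPUTE_FUNCS)}.'
                )
            self.lengths = lengths
            self.mode = mode

        def __len__(self):
            values = list(self.lengths.values()) if isinstance(self.lengths, Mapping) else list(self.lengths)
            # 'min_size' stops at the shortest dataset, 'max_size_cycle' at the longest
            return COMPUTE_FUNCS[self.mode](values)


    assert len(CombinedLengths([100, 50])) == 50
    assert len(CombinedLengths({"a": 100, "b": 50}, mode="max_size_cycle")) == 100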