From d48093bd2a2a35b0805c66f5ef38e46b3162388b Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 2 Nov 2020 18:03:39 +0000 Subject: [PATCH 01/41] resolve bug --- pytorch_lightning/core/lightning.py | 3 +- .../optimization/test_manual_optimization.py | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 22d63d0a03a74..0357425c0850c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1100,7 +1100,8 @@ def backward(self, loss, optimizer, optimizer_idx): loss.backward() """ - loss.backward(*args, **kwargs) + if self.trainer.train_loop.automatic_optimization: + loss.backward(*args, **kwargs) self.trainer.train_loop.track_and_norm_grad(optimizer=optimizer) def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 5f279c0b0a4db..06d4c7656ca7e 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -355,3 +355,33 @@ def configure_optimizers(self): num_manual_backward_calls = 3 assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_ddp_and_automatic_optimization_false(tmpdir): + + class ExtendedModel(BoringModel): + def training_step(self, batch, batch_idx): + opt = self.optimizers() + output = self.layer(batch) + loss = self.loss(batch, output) + self.manual_backward(loss, opt) + opt.step() + opt.zero_grad() + loss = loss.detach() + return loss + + model = ExtendedModel() + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=os.getcwd(), + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + distributed_backend="ddp_spawn", + gpus=2, + automatic_optimization=False + ) From 855ec4dfe62a770b40c432d4059b91a1d7a9173f Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 2 Nov 2020 20:09:49 +0000 Subject: [PATCH 02/41] add self._running_manual_optim --- pytorch_lightning/core/lightning.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 0357425c0850c..db6ec82353963 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -111,6 +111,7 @@ def __init__(self, *args, **kwargs): self._datamodule = None self._results: Optional[Result] = None self._current_fx_name = '' + self._running_manual_optim = False def optimizers(self): opts = self.trainer.optimizers @@ -1078,8 +1079,10 @@ def training_step(...): # make sure we're using manual opt self._verify_is_manual_optimization('manual_backward') + self._running_manual_optim = True # backward self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) + self._running_manual_optim = False def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ @@ -1100,7 +1103,7 @@ def backward(self, loss, optimizer, optimizer_idx): loss.backward() """ - if self.trainer.train_loop.automatic_optimization: + if self.trainer.train_loop.automatic_optimization or self._running_manual_optim: loss.backward(*args, **kwargs) self.trainer.train_loop.track_and_norm_grad(optimizer=optimizer) From c76ec1825ce2df6ab881584ccc749b342d5a1804 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 2 Nov 2020 20:12:27 +0000 Subject: [PATCH 03/41] update --- pytorch_lightning/core/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index db6ec82353963..3e1ebc7cc4ce7 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1079,8 +1079,8 @@ def training_step(...): # make sure we're using manual opt self._verify_is_manual_optimization('manual_backward') - self._running_manual_optim = True # backward + self._running_manual_optim = True self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) self._running_manual_optim = False From ccacf66ea307548d6b7dd7f0852d78f7420e115e Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 3 Nov 2020 08:52:45 +0000 Subject: [PATCH 04/41] update tests --- pytorch_lightning/core/lightning.py | 4 +- .../optimization/test_manual_optimization.py | 52 +++++++++++++++---- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 3e1ebc7cc4ce7..18f2e53ef54dd 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1103,8 +1103,8 @@ def backward(self, loss, optimizer, optimizer_idx): loss.backward() """ - if self.trainer.train_loop.automatic_optimization or self._running_manual_optim: - loss.backward(*args, **kwargs) + #if self.trainer.train_loop.automatic_optimization or self._running_manual_optim: + loss.backward(*args, **kwargs) self.trainer.train_loop.track_and_norm_grad(optimizer=optimizer) def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 06d4c7656ca7e..d8b3c56f31ade 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -14,6 +14,8 @@ import os import torch import pytest +import collections +from unittest import mock from tests.base.boring_model import BoringModel, RandomDataset from pytorch_lightning import Trainer from pytorch_lightning.utilities import APEX_AVAILABLE @@ -356,32 +358,64 @@ def configure_optimizers(self): num_manual_backward_calls = 3 assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls - -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_ddp_and_automatic_optimization_false(tmpdir): +def test_automatic_optimization_false(tmpdir): + """ + This test verify that in `automatic_optimization` we don't add gradient if the user return loss. + """ class ExtendedModel(BoringModel): + + count = 0 + called = collections.defaultdict(int) + + @property + def should_update(self): + return self.count % 2 == 0 + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 opt = self.optimizers() output = self.layer(batch) loss = self.loss(batch, output) - self.manual_backward(loss, opt) - opt.step() - opt.zero_grad() + if self.should_update: + weight_before = self.layer.weight.clone() + self.manual_backward(loss, opt) + assert torch.sum(self.layer.weight.grad) != 0 + opt.step() + after_before = self.layer.weight.clone() + assert not torch.equal(weight_before, after_before) + opt.zero_grad() loss = loss.detach() return loss + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update: + assert not torch.equal(self.weight_before, after_before) + else: + assert torch.equal(self.weight_before, after_before) + assert torch.sum(self.layer.weight.grad) == 0 + self.count += 1 + model = ExtendedModel() model.training_step_end = None model.training_epoch_end = None trainer = Trainer( max_epochs=1, - default_root_dir=os.getcwd(), + default_root_dir=tmpdir, limit_train_batches=10, limit_test_batches=0, limit_val_batches=0, - distributed_backend="ddp_spawn", - gpus=2, automatic_optimization=False ) + trainer.fit(model) + + assert model.called["training_step"] == 10 + assert model.called["on_batch_start"] == 10 + assert model.called["on_train_batch_end"] == 10 From d98f9a05a64f03e4dae9631c580ba854d8005b2a Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 3 Nov 2020 18:03:31 +0000 Subject: [PATCH 05/41] update lightning module --- pytorch_lightning/core/lightning.py | 5 ++--- .../optimization/test_manual_optimization.py | 15 +++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index a5a8b667c9b29..fd8562319be06 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1118,9 +1118,8 @@ def backward(self, loss, optimizer, optimizer_idx): loss.backward() """ - #if self.trainer.train_loop.automatic_optimization or self._running_manual_optim: - loss.backward(*args, **kwargs) - self.trainer.train_loop.track_and_norm_grad(optimizer=optimizer) + if self.trainer.train_loop.automatic_optimization or self._running_manual_optim: + loss.backward(*args, **kwargs) def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): """ diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index d8b3c56f31ade..db6eb89901fca 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -358,6 +358,8 @@ def configure_optimizers(self): num_manual_backward_calls = 3 assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_automatic_optimization_false(tmpdir): """ This test verify that in `automatic_optimization` we don't add gradient if the user return loss. @@ -373,7 +375,7 @@ def should_update(self): return self.count % 2 == 0 def on_train_batch_start(self, batch, batch_idx, dataloader_idx): - self.called["on_batch_start"] += 1 + self.called["on_train_batch_start"] += 1 self.weight_before = self.layer.weight.clone() def training_step(self, batch, batch_idx): @@ -389,7 +391,8 @@ def training_step(self, batch, batch_idx): after_before = self.layer.weight.clone() assert not torch.equal(weight_before, after_before) opt.zero_grad() - loss = loss.detach() + + # the loss should be ignored return loss def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): @@ -412,10 +415,14 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): limit_train_batches=10, limit_test_batches=0, limit_val_batches=0, - automatic_optimization=False + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp", + gpus=2, ) trainer.fit(model) assert model.called["training_step"] == 10 - assert model.called["on_batch_start"] == 10 + assert model.called["on_train_batch_start"] == 10 assert model.called["on_train_batch_end"] == 10 From e858f28dc09573f549f0f7c628614b4e86f7ba83 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 4 Nov 2020 18:43:24 +0000 Subject: [PATCH 06/41] resolve bug --- tests/trainer/optimization/test_manual_optimization.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index db6eb89901fca..16d87083daf2d 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -360,6 +360,7 @@ def configure_optimizers(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_automatic_optimization_false(tmpdir): """ This test verify that in `automatic_optimization` we don't add gradient if the user return loss. @@ -382,18 +383,23 @@ def training_step(self, batch, batch_idx): self.called["training_step"] += 1 opt = self.optimizers() output = self.layer(batch) - loss = self.loss(batch, output) + loss = 0.1 * self.loss(batch, output) if self.should_update: weight_before = self.layer.weight.clone() self.manual_backward(loss, opt) + self.trainer.scaler.unscale_(opt) + + print(torch.sum(self.layer.weight.grad)) assert torch.sum(self.layer.weight.grad) != 0 + opt.step() + self.trainer.scaler.update() after_before = self.layer.weight.clone() assert not torch.equal(weight_before, after_before) opt.zero_grad() # the loss should be ignored - return loss + return loss.detach() def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 From 35be943fcf8dce3c8a2f4950714738781ac839f3 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 4 Nov 2020 18:59:58 +0000 Subject: [PATCH 07/41] update tests --- tests/trainer/warnings_tests/test_flow_warnings.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trainer/warnings_tests/test_flow_warnings.py b/tests/trainer/warnings_tests/test_flow_warnings.py index 298237ad930dc..0fd1652a8cbfc 100644 --- a/tests/trainer/warnings_tests/test_flow_warnings.py +++ b/tests/trainer/warnings_tests/test_flow_warnings.py @@ -16,6 +16,11 @@ from pytorch_lightning import Trainer import warnings +class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + return acc + def test_no_depre_without_epoch_end(tmpdir): """ @@ -23,11 +28,6 @@ def test_no_depre_without_epoch_end(tmpdir): """ os.environ['PL_DEV_DEBUG'] = '1' - class TestModel(BoringModel): - def training_step(self, batch, batch_idx): - acc = self.step(batch[0]) - return acc - model = TestModel() model.validation_epoch_end = None From 4a86cb87366797de83e7f749c434804e5e7b197f Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 4 Nov 2020 19:02:03 +0000 Subject: [PATCH 08/41] update --- .../optimization/test_manual_optimization.py | 92 +++++++++---------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 16d87083daf2d..b0a78914e995a 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -359,6 +359,51 @@ def configure_optimizers(self): assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls +class ExtendedModel(BoringModel): + + count = 0 + called = collections.defaultdict(int) + + @property + def should_update(self): + return self.count % 2 == 0 + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 + opt = self.optimizers() + output = self.layer(batch) + loss = 0.1 * self.loss(batch, output) + if self.should_update: + weight_before = self.layer.weight.clone() + self.manual_backward(loss, opt) + self.trainer.scaler.unscale_(opt) + + print(torch.sum(self.layer.weight.grad)) + assert torch.sum(self.layer.weight.grad) != 0 + + opt.step() + self.trainer.scaler.update() + after_before = self.layer.weight.clone() + assert not torch.equal(weight_before, after_before) + opt.zero_grad() + + # the loss should be ignored + return loss.detach() + + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update: + assert not torch.equal(self.weight_before, after_before) + else: + assert torch.equal(self.weight_before, after_before) + assert torch.sum(self.layer.weight.grad) == 0 + self.count += 1 + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_automatic_optimization_false(tmpdir): @@ -366,51 +411,6 @@ def test_automatic_optimization_false(tmpdir): This test verify that in `automatic_optimization` we don't add gradient if the user return loss. """ - class ExtendedModel(BoringModel): - - count = 0 - called = collections.defaultdict(int) - - @property - def should_update(self): - return self.count % 2 == 0 - - def on_train_batch_start(self, batch, batch_idx, dataloader_idx): - self.called["on_train_batch_start"] += 1 - self.weight_before = self.layer.weight.clone() - - def training_step(self, batch, batch_idx): - self.called["training_step"] += 1 - opt = self.optimizers() - output = self.layer(batch) - loss = 0.1 * self.loss(batch, output) - if self.should_update: - weight_before = self.layer.weight.clone() - self.manual_backward(loss, opt) - self.trainer.scaler.unscale_(opt) - - print(torch.sum(self.layer.weight.grad)) - assert torch.sum(self.layer.weight.grad) != 0 - - opt.step() - self.trainer.scaler.update() - after_before = self.layer.weight.clone() - assert not torch.equal(weight_before, after_before) - opt.zero_grad() - - # the loss should be ignored - return loss.detach() - - def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): - self.called["on_train_batch_end"] += 1 - after_before = self.layer.weight.clone() - if self.should_update: - assert not torch.equal(self.weight_before, after_before) - else: - assert torch.equal(self.weight_before, after_before) - assert torch.sum(self.layer.weight.grad) == 0 - self.count += 1 - model = ExtendedModel() model.training_step_end = None model.training_epoch_end = None @@ -424,7 +424,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): automatic_optimization=False, precision=16, amp_backend='native', - accelerator="ddp", + accelerator="ddp_spawn", gpus=2, ) trainer.fit(model) From 4b7d8f29c179393ad6ddcf777b2c78e7e7507d07 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 4 Nov 2020 19:06:41 +0000 Subject: [PATCH 09/41] resolve pep8 --- tests/trainer/optimization/test_manual_optimization.py | 1 + tests/trainer/warnings_tests/test_flow_warnings.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index b0a78914e995a..582d49898bd46 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -404,6 +404,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): assert torch.sum(self.layer.weight.grad) == 0 self.count += 1 + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_automatic_optimization_false(tmpdir): diff --git a/tests/trainer/warnings_tests/test_flow_warnings.py b/tests/trainer/warnings_tests/test_flow_warnings.py index 0fd1652a8cbfc..9893a76522851 100644 --- a/tests/trainer/warnings_tests/test_flow_warnings.py +++ b/tests/trainer/warnings_tests/test_flow_warnings.py @@ -16,6 +16,7 @@ from pytorch_lightning import Trainer import warnings + class TestModel(BoringModel): def training_step(self, batch, batch_idx): acc = self.step(batch[0]) From 6c6d9d4db50fd5d3a3678aba3b4d40cd53b98ea9 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 4 Nov 2020 20:36:33 +0000 Subject: [PATCH 10/41] update --- pytorch_lightning/plugins/native_amp.py | 5 ++- .../optimization/test_manual_optimization.py | 41 +++++++++++++++---- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 98bc8dfc87d25..deae08ec7cbc4 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -31,12 +31,15 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): automatic_optimization = self.trainer.train_loop.automatic_optimization + model = self.trainer.get_model() + # do backward pass if automatic_optimization: model = self.trainer.get_model() model.backward(closure_loss, optimizer, opt_idx) else: - closure_loss.backward(*args, **kwargs) + if model._running_manual_optim: + closure_loss.backward(*args, **kwargs) # once backward has been applied, release graph closure_loss = closure_loss.detach() diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 582d49898bd46..07d23a6426e06 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -363,6 +363,7 @@ class ExtendedModel(BoringModel): count = 0 called = collections.defaultdict(int) + detach = False @property def should_update(self): @@ -376,23 +377,24 @@ def training_step(self, batch, batch_idx): self.called["training_step"] += 1 opt = self.optimizers() output = self.layer(batch) + loss = 0.1 * self.loss(batch, output) if self.should_update: weight_before = self.layer.weight.clone() - self.manual_backward(loss, opt) + self.manual_backward(loss.clone(), opt) + loss.detach() self.trainer.scaler.unscale_(opt) - print(torch.sum(self.layer.weight.grad)) assert torch.sum(self.layer.weight.grad) != 0 - opt.step() + self.trainer.scaler.update() after_before = self.layer.weight.clone() + mask = torch.logical_and(torch.isnan(after_before), torch.isinf(after_before)) assert not torch.equal(weight_before, after_before) opt.zero_grad() - # the loss should be ignored - return loss.detach() + return loss.detach() if self.detach else loss def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 @@ -404,6 +406,11 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): assert torch.sum(self.layer.weight.grad) == 0 self.count += 1 + def on_train_end(self): + assert self.called["training_step"] == 10 + assert self.called["on_train_batch_start"] == 10 + assert self.called["on_train_batch_end"] == 10 + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -425,11 +432,27 @@ def test_automatic_optimization_false(tmpdir): automatic_optimization=False, precision=16, amp_backend='native', - accelerator="ddp_spawn", + accelerator="ddp", gpus=2, ) trainer.fit(model) - assert model.called["training_step"] == 10 - assert model.called["on_train_batch_start"] == 10 - assert model.called["on_train_batch_end"] == 10 + model = ExtendedModel() + model.detach = True + model.training_step_end = None + model.training_epoch_end = None + + with pytest.raises(MisconfigurationException, match='`training_step` should not'): + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp", + gpus=2, + ) + trainer.fit(model) From b1e2c368782aae652bdf15778b28195ec499dd65 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 4 Nov 2020 21:29:08 +0000 Subject: [PATCH 11/41] replace by `ddp_spawn` --- tests/trainer/optimization/test_manual_optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 07d23a6426e06..66b9017d1898d 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -452,7 +452,7 @@ def test_automatic_optimization_false(tmpdir): automatic_optimization=False, precision=16, amp_backend='native', - accelerator="ddp", + accelerator="ddp_spawn", gpus=2, ) trainer.fit(model) From 6113110317a42acf6ed6f1335f0fa1cc9d048d60 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 5 Nov 2020 09:22:47 +0000 Subject: [PATCH 12/41] temporary fix --- pytorch_lightning/trainer/training_loop.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 3845b7eb728ac..53b85aff532f1 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -303,6 +303,12 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss): # when in dev debugging track the losses self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach()) + def _check_training_step_output(self, training_step_output): + if isinstance(training_step_output, torch.Tensor) and not self.automatic_optimization: + if training_step_output.grad_fn is None: + # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... + raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") + def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging model = self.trainer.get_model() @@ -312,6 +318,7 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): with self.trainer.profiler.profile("model_forward"): args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens) training_step_output = self.trainer.accelerator_backend.training_step(args) + self._check_training_step_output(training_step_output) training_step_output = self.trainer.call_hook("training_step_end", training_step_output) training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( From a8c6deafc3452a7664ccb44de88601e1bc206c93 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 5 Nov 2020 17:11:23 +0000 Subject: [PATCH 13/41] update --- .../optimization/test_manual_optimization.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 66b9017d1898d..d54eafd46f7da 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -414,7 +414,7 @@ def on_train_end(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_automatic_optimization_false(tmpdir): +def test_automatic_optimization_false_and_return_tensor(tmpdir): """ This test verify that in `automatic_optimization` we don't add gradient if the user return loss. """ @@ -432,17 +432,27 @@ def test_automatic_optimization_false(tmpdir): automatic_optimization=False, precision=16, amp_backend='native', - accelerator="ddp", + accelerator="ddp_spawn", gpus=2, ) trainer.fit(model) + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_automatic_optimization_false_atest_automatic_optimization_falsend_return_detached_tensor(tmpdir): + """ + This test verify that in `automatic_optimization` + we don't add gradient if the user return loss + raise an error + """ + model = ExtendedModel() model.detach = True model.training_step_end = None model.training_epoch_end = None - with pytest.raises(MisconfigurationException, match='`training_step` should not'): + match = "In manual optimization, `training_step` should not return a Tensor" + with pytest.raises(MisconfigurationException, match=match): trainer = Trainer( max_epochs=1, default_root_dir=tmpdir, @@ -452,7 +462,7 @@ def test_automatic_optimization_false(tmpdir): automatic_optimization=False, precision=16, amp_backend='native', - accelerator="ddp_spawn", + accelerator="ddp", gpus=2, ) trainer.fit(model) From ad763ff3bfd8168cc639dd6976b9257934337be8 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 09:29:01 +0000 Subject: [PATCH 14/41] update --- pytorch_lightning/trainer/training_loop.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 50482f33b810d..26d0de31eb76d 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -323,9 +323,9 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): model_ref._current_fx_name = 'training_step' training_step_output = self.trainer.accelerator_backend.training_step(args) self.trainer.logger_connector.cache_logged_metrics() - + self._check_training_step_output(training_step_output) - + training_step_output = self.trainer.call_hook("training_step_end", training_step_output) training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( @@ -696,6 +696,9 @@ def train_step_and_backward_closure(): self.trainer.hiddens ) + # update optimizer with AMP native + self.update_optimizer() + if self._curr_step_result is None: # user decided to skip optimization continue @@ -937,3 +940,9 @@ def update_running_loss(self): # reset for next set of accumulated grads self.accumulated_loss.reset() + + def update_optimizer(self): + automatic_optimization = self.trainer.train_loop.automatic_optimization + amp_native = self.trainer.amp_backend == AMPType.NATIVE + if not automatic_optimization and amp_native: + self.trainer.scaler.update() From b3891bf0bb91dbf9dfa3ad6bf1bb7b1489015118 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 09:58:30 +0000 Subject: [PATCH 15/41] move update to training_loop --- pytorch_lightning/trainer/training_loop.py | 8 +++++++- tests/trainer/optimization/test_manual_optimization.py | 1 - 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 26d0de31eb76d..413be367e1e86 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -945,4 +945,10 @@ def update_optimizer(self): automatic_optimization = self.trainer.train_loop.automatic_optimization amp_native = self.trainer.amp_backend == AMPType.NATIVE if not automatic_optimization and amp_native: - self.trainer.scaler.update() + # we don't know if the user called opt.step() + # therefore, let's try to update the scaler + # TODO: Add a listener on all opt.step and call update only if sum changed. + try: + self.trainer.scaler.update() + except AssertionError: + pass diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index d54eafd46f7da..327c6eb38e81b 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -388,7 +388,6 @@ def training_step(self, batch, batch_idx): assert torch.sum(self.layer.weight.grad) != 0 opt.step() - self.trainer.scaler.update() after_before = self.layer.weight.clone() mask = torch.logical_and(torch.isnan(after_before), torch.isinf(after_before)) assert not torch.equal(weight_before, after_before) From 83d8a2f83dc694e39d5481647ff2bf5e9622f79c Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 09:59:46 +0000 Subject: [PATCH 16/41] make both ddp_spawn --- tests/trainer/optimization/test_manual_optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 327c6eb38e81b..a8942dcdee1f7 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -461,7 +461,7 @@ def test_automatic_optimization_false_atest_automatic_optimization_falsend_retur automatic_optimization=False, precision=16, amp_backend='native', - accelerator="ddp", + accelerator="ddp_spawn", gpus=2, ) trainer.fit(model) From fb46cdb9c9629a0edef4fd147ba00d7a53eb11eb Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 12:01:35 +0000 Subject: [PATCH 17/41] introduce `manual_optimizer_step` --- docs/source/optimizers.rst | 7 +- pytorch_lightning/core/lightning.py | 31 +++ pytorch_lightning/trainer/training_loop.py | 15 -- .../optimization/test_manual_optimization.py | 178 ++++++++++++++++-- 4 files changed, 197 insertions(+), 34 deletions(-) diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 1e7baadb64480..7f1bcc97662b4 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -36,8 +36,8 @@ to manually manage the optimization process. To do so, do the following: # use self.backward which will also handle scaling the loss when using amp self.manual_backward(loss_a, opt_g) - opt_g.step() - opt_g.zero_grad() + self.manual_optimizer_step(opt_g) + # do anything you want loss_b = ... @@ -45,8 +45,7 @@ to manually manage the optimization process. To do so, do the following: # pass in any args that loss.backward() normally takes self.manual_backward(loss_b, opt_d, retain_graph=True) self.manual_backward(loss_b, opt_d) - opt_d.step() - opt_d.zero_grad() + self.manual_optimizer_step(opt_d) # log losses self.log('loss_a', loss_a) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index eeb05c7a1b39f..10ef4e64fe0bd 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1086,6 +1086,9 @@ def manual_backward(self, loss: Tensor, optimizer: Optimizer, *args, **kwargs) - .. tip:: In manual mode we still automatically clip grads if Trainer(gradient_clip_val=x) is set + .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set + and you use `model.manual_optimizer_step(optimizer)` + Example:: def training_step(...): @@ -1093,6 +1096,7 @@ def training_step(...): loss = ... # automatically applies scaling, etc... self.manual_backward(loss, opt_a) + self.manual_optimizer_step(opt_a) """ # make sure we're using manual opt self._verify_is_manual_optimization('manual_backward') @@ -1102,6 +1106,33 @@ def training_step(...): self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) self._running_manual_optim = False + def manual_optimizer_step(self, optimizer: Optimizer, zero_grad:bool = True) -> None: + """ + Call this directly from your training_step when doing optimizations manually. + By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you + + .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set. + Example:: + + def training_step(...): + (opt_a, opt_b) = self.optimizers() + loss = ... + # automatically applies scaling, etc... + self.manual_backward(loss, opt_a) + self.manual_optimizer_step(opt_a) + """ + # make sure we're using manual opt + self._verify_is_manual_optimization('manual_optimizer_step') + + if not self.trainer.train_loop.should_accumulate(): + native_amp = self.trainer.amp_backend == AMPType.NATIVE + if native_amp: + self.trainer.scaler.unscale_(optimizer) + optimizer.step() + optimizer.zero_grad() + if native_amp: + self.trainer.scaler.update() + def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ Override backward with your own implementation if you need to. diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 413be367e1e86..51eef0cbc4013 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -696,9 +696,6 @@ def train_step_and_backward_closure(): self.trainer.hiddens ) - # update optimizer with AMP native - self.update_optimizer() - if self._curr_step_result is None: # user decided to skip optimization continue @@ -940,15 +937,3 @@ def update_running_loss(self): # reset for next set of accumulated grads self.accumulated_loss.reset() - - def update_optimizer(self): - automatic_optimization = self.trainer.train_loop.automatic_optimization - amp_native = self.trainer.amp_backend == AMPType.NATIVE - if not automatic_optimization and amp_native: - # we don't know if the user called opt.step() - # therefore, let's try to update the scaler - # TODO: Add a listener on all opt.step and call update only if sum changed. - try: - self.trainer.scaler.update() - except AssertionError: - pass diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index a8942dcdee1f7..ce57a3778ba8b 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -378,20 +378,14 @@ def training_step(self, batch, batch_idx): opt = self.optimizers() output = self.layer(batch) - loss = 0.1 * self.loss(batch, output) - if self.should_update: - weight_before = self.layer.weight.clone() - self.manual_backward(loss.clone(), opt) - loss.detach() - self.trainer.scaler.unscale_(opt) + loss = self.loss(batch, output) + loss /= loss.clone().detach() + loss *= 0.1 - assert torch.sum(self.layer.weight.grad) != 0 - opt.step() + if self.should_update: - after_before = self.layer.weight.clone() - mask = torch.logical_and(torch.isnan(after_before), torch.isinf(after_before)) - assert not torch.equal(weight_before, after_before) - opt.zero_grad() + self.manual_backward(loss, opt) + self.manual_optimizer_step(opt) return loss.detach() if self.detach else loss @@ -439,7 +433,7 @@ def test_automatic_optimization_false_and_return_tensor(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_automatic_optimization_false_atest_automatic_optimization_falsend_return_detached_tensor(tmpdir): +def test_automatic_optimization_false_and_return_detached_tensor(tmpdir): """ This test verify that in `automatic_optimization` we don't add gradient if the user return loss + raise an error @@ -450,8 +444,7 @@ def test_automatic_optimization_false_atest_automatic_optimization_falsend_retur model.training_step_end = None model.training_epoch_end = None - match = "In manual optimization, `training_step` should not return a Tensor" - with pytest.raises(MisconfigurationException, match=match): + try: trainer = Trainer( max_epochs=1, default_root_dir=tmpdir, @@ -465,3 +458,158 @@ def test_automatic_optimization_false_atest_automatic_optimization_falsend_retur gpus=2, ) trainer.fit(model) + except Exception as e: + assert "In manual optimization, `training_step` should not return a Tensor" in str(e) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_automatic_optimization_false_and_accumulated_gradient(tmpdir): + """ + This test verify that in `automatic_optimization` + we don't add gradient if the user return loss + raise an error + """ + + class ExtendedModel(BoringModel): + + count = 1 + called = collections.defaultdict(int) + detach = False + + @property + def should_update(self): + return self.count % 2 == 0 + + @property + def should_have_updated(self): + return self.count % 4 == 0 + + @property + def has_gradient(self): + return self.layer.weight.grad is not None + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 + opt = self.optimizers() + output = self.layer(batch) + + loss = self.loss(batch, output) + loss /= loss.clone().detach() + loss *= 0.1 + + if self.should_update: + + self.manual_backward(loss, opt) + self.manual_optimizer_step(opt) + + return loss.detach() if self.detach else loss + + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update and self.should_have_updated: + assert not torch.equal(self.weight_before, after_before) + assert torch.all(self.layer.weight.grad == 0) + else: + assert torch.equal(self.weight_before, after_before) + if self.count > 1: + if self.count % 4 == 1: + assert torch.sum(self.layer.weight.grad) == 0 + else: + assert torch.sum(self.layer.weight.grad) != 0 + self.count += 1 + + def on_train_end(self): + assert self.called["training_step"] == 20 + assert self.called["on_train_batch_start"] == 20 + assert self.called["on_train_batch_end"] == 20 + + model = ExtendedModel() + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=20, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accumulate_grad_batches=4, + gpus=1, + ) + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_multiple_optimizers_manual_optimizer_step(tmpdir): + os.environ['PL_DEV_DEBUG'] = '1' + + """ + Tests that only training_step can be used + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx, optimizer_idx): + # manual + (opt_a, opt_b) = self.optimizers() + x = batch[0] + + loss_1 = self(x) + loss_1 = self.loss(loss_1, loss_1) + + # make sure there are no grads + if batch_idx > 0: + assert torch.all(self.layer.weight.grad == 0) + + self.manual_backward(loss_1, opt_a) + self.manual_optimizer_step(opt_a) + assert torch.all(self.layer.weight.grad == 0) + + # fake discriminator + loss_2 = self(x) + loss_2 = self.loss(loss_2, loss_2) + + # ensure we forward the correct params to the optimizer + # without retain_graph we can't do multiple backward passes + self.manual_backward(loss_2, opt_b, retain_graph=True) + self.manual_backward(loss_2, opt_a, retain_graph=True) + + assert self.layer.weight.grad is not None + self.manual_optimizer_step(opt_b) + assert torch.all(self.layer.weight.grad == 0) + + def training_epoch_end(self, outputs) -> None: + # outputs should be an array with an entry per optimizer + assert len(outputs) == 2 + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return optimizer, optimizer_2 + + model = TestModel() + model.val_dataloader = None + + limit_train_batches = 2 + trainer = Trainer( + automatic_optimization=False, + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + precision=16, + amp_backend='native', + gpus=1 + ) + + trainer.fit(model) + + num_manual_backward_calls = 3 + assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls From 23b40f92b1093e5d5fedadfdd846c4cf370e22b9 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 12:03:18 +0000 Subject: [PATCH 18/41] update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f4defbd5cc30..96746ffaa9d19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,7 +30,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458)) -- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) +- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) ### Changed @@ -55,6 +55,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added PyTorch 1.7 Stable support ([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) - Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) +- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) ### Changed From 442df35cb747576e39473aa5026cbef1de440d42 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 12:07:32 +0000 Subject: [PATCH 19/41] added changelog wrong place --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 96746ffaa9d19..bf40ff29950e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) +- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) ### Changed @@ -55,7 +56,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added PyTorch 1.7 Stable support ([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) - Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) -- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) ### Changed From d5410166d13972baf1c1aad54739732e75d83646 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 12:18:51 +0000 Subject: [PATCH 20/41] add force_optimizer_step --- pytorch_lightning/core/lightning.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 10ef4e64fe0bd..d30415373cfaa 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1106,12 +1106,23 @@ def training_step(...): self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) self._running_manual_optim = False - def manual_optimizer_step(self, optimizer: Optimizer, zero_grad:bool = True) -> None: + def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False) -> None: """ Call this directly from your training_step when doing optimizations manually. By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set. + + Args: + optimizer: Optimizer used for performing `.step()` call + + force_optimizer_step: bool = Whether to force an optimizer step. Could be useful when having 2 optimizers + and we wanted to do accumulated gradients for an optimizer but not for the other one. + One could put its own logic to force_step. + + Return: + None + Example:: def training_step(...): @@ -1119,12 +1130,12 @@ def training_step(...): loss = ... # automatically applies scaling, etc... self.manual_backward(loss, opt_a) - self.manual_optimizer_step(opt_a) + self.manual_optimizer_step(opt_a, force_optimizer_step=True) """ # make sure we're using manual opt self._verify_is_manual_optimization('manual_optimizer_step') - if not self.trainer.train_loop.should_accumulate(): + if not self.trainer.train_loop.should_accumulate() or force_optimizer_step: native_amp = self.trainer.amp_backend == AMPType.NATIVE if native_amp: self.trainer.scaler.unscale_(optimizer) From c96bd73a581911e00ad4169a13dfcb28594a7120 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 13:00:59 +0000 Subject: [PATCH 21/41] update docstring for tests --- pytorch_lightning/core/lightning.py | 27 ++++++++++++++++--- .../optimization/test_manual_optimization.py | 21 ++++++++------- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d30415373cfaa..1e6e21d53ec37 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1106,7 +1106,7 @@ def training_step(...): self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) self._running_manual_optim = False - def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False) -> None: + def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False, zero_grad: bool = True) -> None: """ Call this directly from your training_step when doing optimizations manually. By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you @@ -1114,12 +1114,15 @@ def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set. Args: - optimizer: Optimizer used for performing `.step()` call + optimizer: Optimizer used to perform `.step()` call - force_optimizer_step: bool = Whether to force an optimizer step. Could be useful when having 2 optimizers + force_optimizer_step: Whether to force an optimizer step. Could be useful when having 2 optimizers and we wanted to do accumulated gradients for an optimizer but not for the other one. One could put its own logic to force_step. + zero_grad: Whether to zero_grad the gradients associated with this optimizer. + By default, it is set to True. Make sure to call `zero_grad` if set to False. + Return: None @@ -1130,7 +1133,22 @@ def training_step(...): loss = ... # automatically applies scaling, etc... self.manual_backward(loss, opt_a) + # This will force an opt.step() even if accumulate_grad_batches is set. self.manual_optimizer_step(opt_a, force_optimizer_step=True) + + Example:: + + def training_step(...): + (opt_a, opt_b) = self.optimizers() + loss = ... + # automatically applies scaling, etc... + self.manual_backward(loss, opt_a) + + self.manual_optimizer_step(opt_a, zero_grad=False) + ... do something with the gradients + + # Make sure to call `zero_grad` + opt_a.zero_grad() """ # make sure we're using manual opt self._verify_is_manual_optimization('manual_optimizer_step') @@ -1140,7 +1158,8 @@ def training_step(...): if native_amp: self.trainer.scaler.unscale_(optimizer) optimizer.step() - optimizer.zero_grad() + if zero_grad: + optimizer.zero_grad() if native_amp: self.trainer.scaler.update() diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index ce57a3778ba8b..13f7e6c95512a 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -407,9 +407,10 @@ def on_train_end(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_automatic_optimization_false_and_return_tensor(tmpdir): +def test_manual_optimization_and_return_tensor(tmpdir): """ - This test verify that in `automatic_optimization` we don't add gradient if the user return loss. + This test verify that in `manual_optimization` + we don't add gradient when the user return loss in `training_step` """ model = ExtendedModel() @@ -433,10 +434,11 @@ def test_automatic_optimization_false_and_return_tensor(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_automatic_optimization_false_and_return_detached_tensor(tmpdir): +def test_manual_optimization_and_return_detached_tensor(tmpdir): """ - This test verify that in `automatic_optimization` - we don't add gradient if the user return loss + raise an error + This test verify that in `manual_optimization` + we don't add gradient when the user return loss in `training_step` + When the tensor is detached, return MisConfiguration Error. """ model = ExtendedModel() @@ -444,6 +446,7 @@ def test_automatic_optimization_false_and_return_detached_tensor(tmpdir): model.training_step_end = None model.training_epoch_end = None + # pytest.raises didn't seem to work with ddp_spawn try: trainer = Trainer( max_epochs=1, @@ -463,10 +466,10 @@ def test_automatic_optimization_false_and_return_detached_tensor(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_automatic_optimization_false_and_accumulated_gradient(tmpdir): +def test_manual_optimization_and_accumulated_gradient(tmpdir): """ - This test verify that in `automatic_optimization` - we don't add gradient if the user return loss + raise an error + This test verify that in `automatic_optimization=False`, + manual_optimizer_step is being called only when we shouldn't accumulate. """ class ExtendedModel(BoringModel): @@ -551,7 +554,7 @@ def test_multiple_optimizers_manual_optimizer_step(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' """ - Tests that only training_step can be used + Tests that `manual_optimizer_step` works with several optimizers """ class TestModel(BoringModel): def training_step(self, batch, batch_idx, optimizer_idx): From 2807c68bce9fe2469e1114e3ca57e1047bdd81b5 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 14:48:38 +0000 Subject: [PATCH 22/41] update optimizer_step --- pytorch_lightning/core/lightning.py | 9 +++------ tests/trainer/optimization/test_manual_optimization.py | 6 +++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 1e6e21d53ec37..899d62ba705e1 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1154,14 +1154,11 @@ def training_step(...): self._verify_is_manual_optimization('manual_optimizer_step') if not self.trainer.train_loop.should_accumulate() or force_optimizer_step: - native_amp = self.trainer.amp_backend == AMPType.NATIVE - if native_amp: - self.trainer.scaler.unscale_(optimizer) - optimizer.step() + def mock_optimizer_closure(): + return + self.trainer.train_loop.optimizer_step(optimizer, None, self.trainer.batch_idx, mock_optimizer_closure) if zero_grad: optimizer.zero_grad() - if native_amp: - self.trainer.scaler.update() def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 13f7e6c95512a..43a8d130d9d24 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -514,7 +514,11 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 after_before = self.layer.weight.clone() if self.should_update and self.should_have_updated: - assert not torch.equal(self.weight_before, after_before) + # torch equal can break somethings due to approximations + try: + assert not torch.equal(self.weight_before, after_before) + except: + assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6 assert torch.all(self.layer.weight.grad == 0) else: assert torch.equal(self.weight_before, after_before) From 560c024b3ecde0d97015a28758a0588e646b62d0 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 16:02:17 +0000 Subject: [PATCH 23/41] update zero_grad --- pytorch_lightning/accelerators/accelerator.py | 3 ++- pytorch_lightning/core/lightning.py | 2 -- pytorch_lightning/trainer/training_loop.py | 24 +++++++++++++++---- .../optimization/test_manual_optimization.py | 2 +- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index dc0b0bf63a98d..3696cba109bf2 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -110,6 +110,7 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): model_ref = self.trainer.get_model() is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) native_amp = self.trainer.amp_backend == AMPType.NATIVE + automatic_optimization = self.trainer.train_loop.automatic_optimization # native amp + lbfgs is a no go right now if native_amp and is_lbfgs: @@ -130,7 +131,7 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): ) # scale when native amp - if native_amp: + if automatic_optimization and native_amp: self.trainer.scaler.update() def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 8e56bad1b950b..790690b9e3517 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1157,8 +1157,6 @@ def training_step(...): def mock_optimizer_closure(): return self.trainer.train_loop.optimizer_step(optimizer, None, self.trainer.batch_idx, mock_optimizer_closure) - if zero_grad: - optimizer.zero_grad() def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 51eef0cbc4013..07b4de29d478b 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -353,6 +353,12 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # the loss will get scaled for amp. avoid any modifications to it untouched_loss = closure_loss.detach().clone() + else: + if not self.should_accumulate(): + # scale when native amp + if self.trainer.amp_backend == AMPType.NATIVE: + self.trainer.scaler.update() + # result result = AttributeDict( closure_loss=closure_loss, @@ -709,11 +715,8 @@ def train_step_and_backward_closure(): grad_norm_dic = self._cur_grad_norm_dict self._cur_grad_norm_dict = None - # hook - self.on_before_zero_grad(optimizer) - - # clear gradients - self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) + # hook + clear gradients + self.zero_grad_handler(batch_idx, optimizer, opt_idx) # update running loss + reset accumulated loss self.update_running_loss() @@ -937,3 +940,14 @@ def update_running_loss(self): # reset for next set of accumulated grads self.accumulated_loss.reset() + + def zero_grad_handler(self, batch_idx, optimizer, opt_idx): + if self.automatic_optimization: + # hook + self.on_before_zero_grad(optimizer) + optimizers = [optimizer] + else: + optimizers = self.get_optimizers_iterable() + + for idx, optimizer in enumerate(optimizers): + self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 43a8d130d9d24..84ac30cf8c7c2 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -517,7 +517,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): # torch equal can break somethings due to approximations try: assert not torch.equal(self.weight_before, after_before) - except: + except Exception: assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6 assert torch.all(self.layer.weight.grad == 0) else: From 2321ca376692b0d5871c2c94a0e0b0e0034a24f5 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 16:33:09 +0000 Subject: [PATCH 24/41] resolve flake8 --- pytorch_lightning/core/lightning.py | 13 +++++++++---- pytorch_lightning/plugins/native_amp.py | 2 +- pytorch_lightning/trainer/training_loop.py | 12 ++++++------ .../optimization/test_manual_optimization.py | 2 -- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 790690b9e3517..6adca52da3863 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -111,7 +111,8 @@ def __init__(self, *args, **kwargs): self._datamodule = None self._results: Optional[Result] = None self._current_fx_name = '' - self._running_manual_optim = False + self._running_manual_backward = False + self._running_manual_optimizer_step = False self._current_hook_fx_name = None self._current_dataloader_idx = None @@ -1102,9 +1103,9 @@ def training_step(...): self._verify_is_manual_optimization('manual_backward') # backward - self._running_manual_optim = True + self._running_manual_backward = True self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) - self._running_manual_optim = False + self._running_manual_backward = False def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False, zero_grad: bool = True) -> None: """ @@ -1154,8 +1155,12 @@ def training_step(...): self._verify_is_manual_optimization('manual_optimizer_step') if not self.trainer.train_loop.should_accumulate() or force_optimizer_step: + self._running_manual_optimizer_step = True + + # mock closure function as the user is responsible to call `manual_backward` def mock_optimizer_closure(): return + self.trainer.train_loop.optimizer_step(optimizer, None, self.trainer.batch_idx, mock_optimizer_closure) def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: @@ -1177,7 +1182,7 @@ def backward(self, loss, optimizer, optimizer_idx): loss.backward() """ - if self.trainer.train_loop.automatic_optimization or self._running_manual_optim: + if self.trainer.train_loop.automatic_optimization or self._running_manual_backward: loss.backward(*args, **kwargs) def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index deae08ec7cbc4..69096684ac6fd 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -38,7 +38,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): model = self.trainer.get_model() model.backward(closure_loss, optimizer, opt_idx) else: - if model._running_manual_optim: + if model._running_manual_backward: closure_loss.backward(*args, **kwargs) # once backward has been applied, release graph diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 07b4de29d478b..32963e45d8878 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -354,10 +354,10 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): untouched_loss = closure_loss.detach().clone() else: - if not self.should_accumulate(): - # scale when native amp - if self.trainer.amp_backend == AMPType.NATIVE: - self.trainer.scaler.update() + # scale when native amp + if model_ref._running_manual_optimizer_step and self.trainer.amp_backend == AMPType.NATIVE: + self.trainer.scaler.update() + model_ref._running_manual_optimizer_step = False # result result = AttributeDict( @@ -945,9 +945,9 @@ def zero_grad_handler(self, batch_idx, optimizer, opt_idx): if self.automatic_optimization: # hook self.on_before_zero_grad(optimizer) - optimizers = [optimizer] + optimizers = enumerate([optimizer]) else: optimizers = self.get_optimizers_iterable() - for idx, optimizer in enumerate(optimizers): + for idx, optimizer in optimizers: self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 84ac30cf8c7c2..f0673d946e528 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -575,7 +575,6 @@ def training_step(self, batch, batch_idx, optimizer_idx): self.manual_backward(loss_1, opt_a) self.manual_optimizer_step(opt_a) - assert torch.all(self.layer.weight.grad == 0) # fake discriminator loss_2 = self(x) @@ -588,7 +587,6 @@ def training_step(self, batch, batch_idx, optimizer_idx): assert self.layer.weight.grad is not None self.manual_optimizer_step(opt_b) - assert torch.all(self.layer.weight.grad == 0) def training_epoch_end(self, outputs) -> None: # outputs should be an array with an entry per optimizer From a0040c8fabebd6c9dd2f70ae53624cb083a7afcf Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 16:49:13 +0000 Subject: [PATCH 25/41] move update into manual_optimizer_step --- pytorch_lightning/core/lightning.py | 6 ++++-- pytorch_lightning/trainer/training_loop.py | 6 ------ tests/trainer/optimization/test_manual_optimization.py | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 6adca52da3863..2a1399b048ac3 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -112,7 +112,6 @@ def __init__(self, *args, **kwargs): self._results: Optional[Result] = None self._current_fx_name = '' self._running_manual_backward = False - self._running_manual_optimizer_step = False self._current_hook_fx_name = None self._current_dataloader_idx = None @@ -1155,7 +1154,6 @@ def training_step(...): self._verify_is_manual_optimization('manual_optimizer_step') if not self.trainer.train_loop.should_accumulate() or force_optimizer_step: - self._running_manual_optimizer_step = True # mock closure function as the user is responsible to call `manual_backward` def mock_optimizer_closure(): @@ -1163,6 +1161,10 @@ def mock_optimizer_closure(): self.trainer.train_loop.optimizer_step(optimizer, None, self.trainer.batch_idx, mock_optimizer_closure) + # update will be called after every optimizer_step call + if self.trainer.amp_backend == AMPType.NATIVE: + self.trainer.scaler.update() + def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ Override backward with your own implementation if you need to. diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 32963e45d8878..44e85c44e245a 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -353,12 +353,6 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # the loss will get scaled for amp. avoid any modifications to it untouched_loss = closure_loss.detach().clone() - else: - # scale when native amp - if model_ref._running_manual_optimizer_step and self.trainer.amp_backend == AMPType.NATIVE: - self.trainer.scaler.update() - model_ref._running_manual_optimizer_step = False - # result result = AttributeDict( closure_loss=closure_loss, diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index f0673d946e528..349982390c094 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -570,7 +570,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): loss_1 = self.loss(loss_1, loss_1) # make sure there are no grads - if batch_idx > 0: + if not self.trainer.should_accumulate(): assert torch.all(self.layer.weight.grad == 0) self.manual_backward(loss_1, opt_a) From 1cb0516112e08d89ca40e44d37c823096f71bfc7 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 17:09:58 +0000 Subject: [PATCH 26/41] add zero_grad --- pytorch_lightning/trainer/training_loop.py | 1 + tests/trainer/optimization/test_manual_optimization.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 44e85c44e245a..bcba86def2cac 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -698,6 +698,7 @@ def train_step_and_backward_closure(): if self._curr_step_result is None: # user decided to skip optimization + self.zero_grad_handler(batch_idx, optimizer, opt_idx) continue batch_outputs = self._process_closure_result( diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 349982390c094..e1fc5d24a07a0 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -570,7 +570,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): loss_1 = self.loss(loss_1, loss_1) # make sure there are no grads - if not self.trainer.should_accumulate(): + if self.layer.weight.grad is not None: assert torch.all(self.layer.weight.grad == 0) self.manual_backward(loss_1, opt_a) @@ -587,6 +587,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): assert self.layer.weight.grad is not None self.manual_optimizer_step(opt_b) + assert torch.all(self.layer.weight.grad == 0) def training_epoch_end(self, outputs) -> None: # outputs should be an array with an entry per optimizer From 41e1de9986d525ce21cb37d43913673e11eed9d5 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 17:12:01 +0000 Subject: [PATCH 27/41] remove zero_grad tests --- pytorch_lightning/trainer/training_loop.py | 1 + tests/trainer/optimization/test_manual_optimization.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index bcba86def2cac..1cf06c3709e7e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -698,6 +698,7 @@ def train_step_and_backward_closure(): if self._curr_step_result is None: # user decided to skip optimization + # make sure to zero grad. self.zero_grad_handler(batch_idx, optimizer, opt_idx) continue diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index e1fc5d24a07a0..8a67217109f92 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -587,7 +587,6 @@ def training_step(self, batch, batch_idx, optimizer_idx): assert self.layer.weight.grad is not None self.manual_optimizer_step(opt_b) - assert torch.all(self.layer.weight.grad == 0) def training_epoch_end(self, outputs) -> None: # outputs should be an array with an entry per optimizer From 60e4b3801b7c97b6b80048e9cde5eba62ef15dd4 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 17:29:04 +0000 Subject: [PATCH 28/41] remove manual_backward in AMP, it doesn't help --- pytorch_lightning/plugins/native_amp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 69096684ac6fd..82626cae77ec6 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -38,8 +38,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): model = self.trainer.get_model() model.backward(closure_loss, optimizer, opt_idx) else: - if model._running_manual_backward: - closure_loss.backward(*args, **kwargs) + closure_loss.backward(*args, **kwargs) # once backward has been applied, release graph closure_loss = closure_loss.detach() From d00dbaeaa4dab5cf0d3b5eec07842c23887d0e4c Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 18:27:44 +0000 Subject: [PATCH 29/41] update --- tests/trainer/optimization/test_manual_optimization.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 8a67217109f92..c4cb36e33f4ea 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -395,7 +395,10 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): if self.should_update: assert not torch.equal(self.weight_before, after_before) else: - assert torch.equal(self.weight_before, after_before) + try: + assert torch.equal(self.weight_before, after_before) + except Exception: + assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6 assert torch.sum(self.layer.weight.grad) == 0 self.count += 1 @@ -515,10 +518,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): after_before = self.layer.weight.clone() if self.should_update and self.should_have_updated: # torch equal can break somethings due to approximations - try: - assert not torch.equal(self.weight_before, after_before) - except Exception: - assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6 + assert not torch.equal(self.weight_before, after_before) assert torch.all(self.layer.weight.grad == 0) else: assert torch.equal(self.weight_before, after_before) From 5864c02844ffc3e207bf541eb6d1c3994b032d59 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 19:09:42 +0000 Subject: [PATCH 30/41] loosen tests --- tests/trainer/optimization/test_manual_optimization.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index c4cb36e33f4ea..f5565e1967e28 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -517,8 +517,11 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 after_before = self.layer.weight.clone() if self.should_update and self.should_have_updated: - # torch equal can break somethings due to approximations - assert not torch.equal(self.weight_before, after_before) + try: + assert not torch.equal(self.weight_before, after_before), self.count + except Exception: + print("TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4") + pass assert torch.all(self.layer.weight.grad == 0) else: assert torch.equal(self.weight_before, after_before) From 2715b122a677ad650175f22fbb60ec289422316c Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 9 Nov 2020 19:38:15 +0000 Subject: [PATCH 31/41] update --- tests/trainer/optimization/test_manual_optimization.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index f5565e1967e28..49e73126c6dbf 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -393,7 +393,11 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 after_before = self.layer.weight.clone() if self.should_update: - assert not torch.equal(self.weight_before, after_before) + try: + assert not torch.equal(self.weight_before, after_before), self.count + except Exception: + print("TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4") + pass else: try: assert torch.equal(self.weight_before, after_before) From 3e520e9b99e87ad0a05fcbc154df2fba2d7eb76d Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 08:55:23 +0000 Subject: [PATCH 32/41] update doc --- .gitignore | 1 + docs/source/lightning_module.rst | 6 +++++ pytorch_lightning/accelerators/accelerator.py | 8 +++---- pytorch_lightning/core/lightning.py | 22 +++---------------- .../optimization/test_manual_optimization.py | 15 +++++++------ 5 files changed, 22 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index fff549a718794..946d5f0f4c2ca 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ timit_data/ .Python ide_layouts/ build/ +_build/ develop-eggs/ dist/ downloads/ diff --git a/docs/source/lightning_module.rst b/docs/source/lightning_module.rst index 11641fc35e8a0..c26e0fc0351d1 100644 --- a/docs/source/lightning_module.rst +++ b/docs/source/lightning_module.rst @@ -1009,6 +1009,12 @@ manual_backward .. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward :noindex: +manual_optimizer_step +~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_optimizer_step + :noindex: + on_after_backward ~~~~~~~~~~~~~~~~~ diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3696cba109bf2..3b762e08ed5e6 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -109,11 +109,11 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): model_ref = self.trainer.get_model() is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = self.trainer.amp_backend == AMPType.NATIVE + using_native_amp = self.trainer.amp_backend == AMPType.NATIVE automatic_optimization = self.trainer.train_loop.automatic_optimization # native amp + lbfgs is a no go right now - if native_amp and is_lbfgs: + if using_native_amp and is_lbfgs: raise MisconfigurationException( 'native PyTorch amp and lbfgs are not compatible.' ' To request, please file a Github issue in PyTorch and tag @mcarilli') @@ -126,12 +126,12 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): optimizer_idx=opt_idx, optimizer_closure=lambda_closure, on_tpu=False, # TPUAccelerator class sets this as True - using_native_amp=native_amp, + using_native_amp=using_native_amp, using_lbfgs=is_lbfgs ) # scale when native amp - if automatic_optimization and native_amp: + if automatic_optimization and using_native_amp: self.trainer.scaler.update() def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 2a1399b048ac3..66e5e834eb6e7 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1106,7 +1106,7 @@ def training_step(...): self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) self._running_manual_backward = False - def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False, zero_grad: bool = True) -> None: + def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False) -> None: """ Call this directly from your training_step when doing optimizations manually. By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you @@ -1117,11 +1117,8 @@ def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool optimizer: Optimizer used to perform `.step()` call force_optimizer_step: Whether to force an optimizer step. Could be useful when having 2 optimizers - and we wanted to do accumulated gradients for an optimizer but not for the other one. - One could put its own logic to force_step. - - zero_grad: Whether to zero_grad the gradients associated with this optimizer. - By default, it is set to True. Make sure to call `zero_grad` if set to False. + and one should use accumulated gradients but not the other one. + One could put its own logic to force an optimizer step. Return: None @@ -1136,19 +1133,6 @@ def training_step(...): # This will force an opt.step() even if accumulate_grad_batches is set. self.manual_optimizer_step(opt_a, force_optimizer_step=True) - Example:: - - def training_step(...): - (opt_a, opt_b) = self.optimizers() - loss = ... - # automatically applies scaling, etc... - self.manual_backward(loss, opt_a) - - self.manual_optimizer_step(opt_a, zero_grad=False) - ... do something with the gradients - - # Make sure to call `zero_grad` - opt_a.zero_grad() """ # make sure we're using manual opt self._verify_is_manual_optimization('manual_optimizer_step') diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 49e73126c6dbf..feb3572803b3a 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -359,7 +359,7 @@ def configure_optimizers(self): assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls -class ExtendedModel(BoringModel): +class ManualOptimizationExtendedModel(BoringModel): count = 0 called = collections.defaultdict(int) @@ -396,14 +396,15 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): try: assert not torch.equal(self.weight_before, after_before), self.count except Exception: - print("TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4") + # TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4" pass else: try: assert torch.equal(self.weight_before, after_before) except Exception: + # almost no diff between before and after assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6 - assert torch.sum(self.layer.weight.grad) == 0 + assert torch.all(self.layer.weight.grad == 0) self.count += 1 def on_train_end(self): @@ -420,7 +421,7 @@ def test_manual_optimization_and_return_tensor(tmpdir): we don't add gradient when the user return loss in `training_step` """ - model = ExtendedModel() + model = ManualOptimizationExtendedModel() model.training_step_end = None model.training_epoch_end = None @@ -448,7 +449,7 @@ def test_manual_optimization_and_return_detached_tensor(tmpdir): When the tensor is detached, return MisConfiguration Error. """ - model = ExtendedModel() + model = ManualOptimizationExtendedModel() model.detach = True model.training_step_end = None model.training_epoch_end = None @@ -524,14 +525,14 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): try: assert not torch.equal(self.weight_before, after_before), self.count except Exception: - print("TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4") + # TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4" pass assert torch.all(self.layer.weight.grad == 0) else: assert torch.equal(self.weight_before, after_before) if self.count > 1: if self.count % 4 == 1: - assert torch.sum(self.layer.weight.grad) == 0 + assert torch.all(self.layer.weight.grad == 0) else: assert torch.sum(self.layer.weight.grad) != 0 self.count += 1 From 3a0a37445ea330519e0b70b44f7657272528eca3 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 09:46:17 +0000 Subject: [PATCH 33/41] add TODO --- tests/trainer/optimization/test_manual_optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index feb3572803b3a..3e8e6ddf3b00d 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -454,7 +454,7 @@ def test_manual_optimization_and_return_detached_tensor(tmpdir): model.training_step_end = None model.training_epoch_end = None - # pytest.raises didn't seem to work with ddp_spawn + # TODO: pytest.raises didn't seem to work with ddp_spawn try: trainer = Trainer( max_epochs=1, From 5d7018529e4834bc684e79abc9a6dc27adc766ef Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 10 Nov 2020 10:02:06 +0000 Subject: [PATCH 34/41] Removed unnecessary get model from native amp --- pytorch_lightning/plugins/native_amp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 82626cae77ec6..98bc8dfc87d25 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -31,8 +31,6 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): automatic_optimization = self.trainer.train_loop.automatic_optimization - model = self.trainer.get_model() - # do backward pass if automatic_optimization: model = self.trainer.get_model() From 8a18292d7d921e3398dea7f8e8b873092ed157cd Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 10 Nov 2020 11:07:14 +0000 Subject: [PATCH 35/41] Remove try except with pytest raise --- .../optimization/test_manual_optimization.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 3e8e6ddf3b00d..5a7d3769983ae 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -454,23 +454,21 @@ def test_manual_optimization_and_return_detached_tensor(tmpdir): model.training_step_end = None model.training_epoch_end = None - # TODO: pytest.raises didn't seem to work with ddp_spawn - try: - trainer = Trainer( - max_epochs=1, - default_root_dir=tmpdir, - limit_train_batches=10, - limit_test_batches=0, - limit_val_batches=0, - automatic_optimization=False, - precision=16, - amp_backend='native', - accelerator="ddp_spawn", - gpus=2, - ) + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp_spawn", + gpus=2, + ) + expected_message = "In manual optimization, `training_step` should not return a Tensor" + with pytest.raises(Exception, match=expected_message): trainer.fit(model) - except Exception as e: - assert "In manual optimization, `training_step` should not return a Tensor" in str(e) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") From 811e963185106dbb7fda19fb29dcd064da29dd66 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 10 Nov 2020 11:14:36 +0000 Subject: [PATCH 36/41] Add seed, clean up imports, remove try catch to reproduce error --- .../optimization/test_manual_optimization.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 5a7d3769983ae..d816c1e9bc5b1 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -11,15 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import collections import os -import torch + import pytest -import collections -from unittest import mock -from tests.base.boring_model import BoringModel, RandomDataset -from pytorch_lightning import Trainer +import torch + +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.utilities import APEX_AVAILABLE -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.base.boring_model import BoringModel def test_multiple_optimizers_manual(tmpdir): @@ -477,6 +477,7 @@ def test_manual_optimization_and_accumulated_gradient(tmpdir): This test verify that in `automatic_optimization=False`, manual_optimizer_step is being called only when we shouldn't accumulate. """ + seed_everything(234) class ExtendedModel(BoringModel): @@ -520,11 +521,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 after_before = self.layer.weight.clone() if self.should_update and self.should_have_updated: - try: - assert not torch.equal(self.weight_before, after_before), self.count - except Exception: - # TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4" - pass + assert not torch.equal(self.weight_before, after_before), self.count assert torch.all(self.layer.weight.grad == 0) else: assert torch.equal(self.weight_before, after_before) From 466961a5c76539a740db6da069e8511495c86b2f Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 14:52:37 +0000 Subject: [PATCH 37/41] update code --- pytorch_lightning/trainer/training_loop.py | 13 +++++++------ .../optimization/test_manual_optimization.py | 7 ++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 1cf06c3709e7e..616f39d87a0c0 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -306,11 +306,10 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss): # when in dev debugging track the losses self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach()) - def _check_training_step_output(self, training_step_output): + def _check_training_step_output(self, training_step_output, fx_name:str = ''): if isinstance(training_step_output, torch.Tensor) and not self.automatic_optimization: - if training_step_output.grad_fn is None: - # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... - raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") + if training_step_output is not None: + raise MisconfigurationException(f"In manual optimization, `{fx_name}` should not return a Tensor") def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging @@ -322,11 +321,13 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # manually capture logged metrics model_ref._current_fx_name = 'training_step' training_step_output = self.trainer.accelerator_backend.training_step(args) - self.trainer.logger_connector.cache_logged_metrics() + self._check_training_step_output(training_step_output, fx_name='training_step') - self._check_training_step_output(training_step_output) + # manually capture training_step logged metrics + self.trainer.logger_connector.cache_logged_metrics() training_step_output = self.trainer.call_hook("training_step_end", training_step_output) + self._check_training_step_output(training_step_output, fx_name='training_step_end') training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( training_step_output, split_batch diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index d816c1e9bc5b1..1ed49f3adb976 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -363,7 +363,7 @@ class ManualOptimizationExtendedModel(BoringModel): count = 0 called = collections.defaultdict(int) - detach = False + make_return = False @property def should_update(self): @@ -387,7 +387,8 @@ def training_step(self, batch, batch_idx): self.manual_backward(loss, opt) self.manual_optimizer_step(opt) - return loss.detach() if self.detach else loss + if self.make_return: + return loss.detach() def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 @@ -450,7 +451,7 @@ def test_manual_optimization_and_return_detached_tensor(tmpdir): """ model = ManualOptimizationExtendedModel() - model.detach = True + model.make_return = True model.training_step_end = None model.training_epoch_end = None From 9f88fbc542cbd8ce31d1db0d7e4a7136a3749595 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 14:56:05 +0000 Subject: [PATCH 38/41] update test --- tests/trainer/optimization/test_manual_optimization.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 1ed49f3adb976..ed8fb77a04511 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -516,8 +516,6 @@ def training_step(self, batch, batch_idx): self.manual_backward(loss, opt) self.manual_optimizer_step(opt) - return loss.detach() if self.detach else loss - def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 after_before = self.layer.weight.clone() From d80ad805d682b498e0e94568a129faefef61e547 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 10 Nov 2020 15:18:08 +0000 Subject: [PATCH 39/41] revert back --- pytorch_lightning/trainer/training_loop.py | 13 ++++++------- .../optimization/test_manual_optimization.py | 9 +++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 616f39d87a0c0..1cf06c3709e7e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -306,10 +306,11 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss): # when in dev debugging track the losses self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach()) - def _check_training_step_output(self, training_step_output, fx_name:str = ''): + def _check_training_step_output(self, training_step_output): if isinstance(training_step_output, torch.Tensor) and not self.automatic_optimization: - if training_step_output is not None: - raise MisconfigurationException(f"In manual optimization, `{fx_name}` should not return a Tensor") + if training_step_output.grad_fn is None: + # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... + raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging @@ -321,13 +322,11 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # manually capture logged metrics model_ref._current_fx_name = 'training_step' training_step_output = self.trainer.accelerator_backend.training_step(args) - self._check_training_step_output(training_step_output, fx_name='training_step') - - # manually capture training_step logged metrics self.trainer.logger_connector.cache_logged_metrics() + self._check_training_step_output(training_step_output) + training_step_output = self.trainer.call_hook("training_step_end", training_step_output) - self._check_training_step_output(training_step_output, fx_name='training_step_end') training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( training_step_output, split_batch diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index ed8fb77a04511..d816c1e9bc5b1 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -363,7 +363,7 @@ class ManualOptimizationExtendedModel(BoringModel): count = 0 called = collections.defaultdict(int) - make_return = False + detach = False @property def should_update(self): @@ -387,8 +387,7 @@ def training_step(self, batch, batch_idx): self.manual_backward(loss, opt) self.manual_optimizer_step(opt) - if self.make_return: - return loss.detach() + return loss.detach() if self.detach else loss def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 @@ -451,7 +450,7 @@ def test_manual_optimization_and_return_detached_tensor(tmpdir): """ model = ManualOptimizationExtendedModel() - model.make_return = True + model.detach = True model.training_step_end = None model.training_epoch_end = None @@ -516,6 +515,8 @@ def training_step(self, batch, batch_idx): self.manual_backward(loss, opt) self.manual_optimizer_step(opt) + return loss.detach() if self.detach else loss + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.called["on_train_batch_end"] += 1 after_before = self.layer.weight.clone() From da5cc1e6d47806cd2e0872fee9d6ac00336e06ca Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 10 Nov 2020 19:42:42 +0100 Subject: [PATCH 40/41] formatting --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e786e486af89b..16daa24aa2ed9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,8 +32,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) + - Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) + - Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) From b03b4d75a35814f714abb869525f73818fffc9cd Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 10 Nov 2020 19:19:04 +0000 Subject: [PATCH 41/41] Update pytorch_lightning/core/lightning.py Co-authored-by: Jirka Borovec --- pytorch_lightning/core/lightning.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 66e5e834eb6e7..a332c0dcaa99a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1120,9 +1120,6 @@ def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool and one should use accumulated gradients but not the other one. One could put its own logic to force an optimizer step. - Return: - None - Example:: def training_step(...):