From 8058a81404bbc46e51e070088f3468287c36ca9f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 9 Apr 2021 07:32:22 +0900 Subject: [PATCH 01/14] . --- docs/source/common/optimizers.rst | 506 ++++++++++++++++-------------- 1 file changed, 277 insertions(+), 229 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 422302ea8987e..3aa843daea1b8 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -3,14 +3,12 @@ ************ Optimization ************ - Lightning offers two modes for managing the optimization process: -- automatic optimization (AutoOpt) +- automatic optimization - manual optimization -For the majority of research cases, **automatic optimization** will do the right thing for you and it is what -most users should use. +For the majority of research cases, **automatic optimization** will do the right thing for you and it is what most users should use. For advanced/expert users who want to do esoteric optimization schedules or techniques, use **manual optimization**. @@ -18,223 +16,309 @@ For advanced/expert users who want to do esoteric optimization schedules or tech Manual optimization =================== -For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable -to manually manage the optimization process. To do so, do the following: +For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable to manually manage the optimization process. + +This is only recommended for experts who need ultimate flexibility. +Lightning will handle only precision and accelerators logic. +The users are left with ``optimizer.zero_grad()``, gradient accumulation, model toggling, etc.. -* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule`` ``__init__`` function +To manually optimize, do the following: + +* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule``'s ``__init__`` function. * Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. +Here is a basic example of manual optimization. + .. testcode:: python - from pytorch_lightning import LightningModule + from pytorch_lightning import LightningModule + + class MyModel(LightningModule): - class MyModel(LightningModule): + def __init__(self): + super().__init__() + # Important: This property activates manual optimization. + self.automatic_optimization = False - def __init__(self): - super().__init__() - # Important: This property activate ``manual optimization`` for your model - self.automatic_optimization = False + def training_step(batch, batch_idx): + opt = self.optimizers() - def training_step(batch, batch_idx): - opt = self.optimizers() - loss = self.compute_loss(batch) - self.manual_backward(loss) + loss = self.compute_loss(batch) -.. note:: This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. The users are left with ``optimizer.zero_grad()``, gradient accumulation, model toggling, etc.. + opt.zero_grad() + self.manual_backward(loss) + opt.step() -.. warning:: Before 1.2, ``optimzer.step`` was calling ``optimizer.zero_grad()`` internally. From 1.2, it is left to the users expertise. -.. tip:: To perform ``accumulate_grad_batches`` with one optimizer, you can do as such. +.. warning:: Before 1.2, ``optimzer.step()`` was calling ``optimizer.zero_grad()`` internally. 
From 1.2, it is left to the users expertise. .. tip:: ``self.optimizers()`` will return ``LightningOptimizer`` objects. You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to support accelerators and precision for you. -.. code-block:: python +.. tip:: Be careful where you call ``zero_grad`` or your model won't converge. It is good pratice to call ``zero_grad`` before ``manual_backward``. - def __init__(self): - self.automatic_optimization = False - def training_step(self, batch, batch_idx): - opt = self.optimizers() +Learning rate scheduling +------------------------ +From 1.3, Lightning supports learning rate scheduling in manual optimization. +``lr_scheduler.step()`` is disabled in manual optimization so that you can call it at arbitrary intervals. Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined in ``LightningModule.configure_optimizers()``. - loss = self.compute_loss(batch) - self.manual_backward(loss) +.. deprecated:: Before 1.3, ``lr_scheduler.step`` was called automatically in both manual and automatic optimization. - # accumulate gradient batches - if batch_idx % 2 == 0: - opt.step() - opt.zero_grad() +.. warning:: The learning rate scheduler keys are ignored in manual optimization. -.. tip:: It is a good practice to provide the optimizer with a ``closure`` function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure. See also `the PyTorch docs `_. +.. testcode:: python + + # step every batch + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx): + # do foward, backward, and optimization + ... -Here is the same example as above using a ``closure``. + # single scheduler + sch = self.lr_schedulers() + sch.step() + + # multiple schedulers + sch1, sch2 = self.lr_schedulers() + sch1.step() + sch2.step() .. testcode:: python - def __init__(self): - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - opt = self.optimizers() + def training_step(self, batch, batch_idx): + # do forward, backward, and optimization + ... - def closure(): - # Only zero_grad on the first batch to accumulate gradients - is_first_batch_to_accumulate = batch_idx % 2 == 0 - if is_first_batch_to_accumulate: - opt.zero_grad() + sch = self.lr_schedulers() - loss = self.compute_loss(batch) - self.manual_backward(loss) - return loss + # step every `n` batches + if (batch_idx + 1) % n == 0: + sch.step() - opt.step(closure=closure) + # step every `n` epochs + if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: + sch.step() -.. tip:: Be careful where you call ``zero_grad`` or your model won't converge. It is good pratice to call ``zero_grad`` before ``manual_backward``. + +Gradient accumulation +--------------------- +You can accumulate gradients over batches similarly to :attr:`~pytorch_lightning.trainer.Trainer.accumulate_grad_batches` of automatic optimization. +To perform gradient accumulation with one optimizer, you can do as such. + +.. 
testcode:: python + + # accumulate gradients over 2 batches + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + + loss = self.compute_loss(batch) + self.manual_backward(loss) + + # accumulate gradients of 2 batches + if (batch_idx + 1) % 2 == 0: + opt.step() + opt.zero_grad() + + +Use closure for LBFGS-like optimizers +------------------------------------- +It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. +It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure such as :class:`torch.optim.LBFGS`. +See `the PyTorch docs `_ for more details about the closure. + +Here is an example using a closure function. + +.. testcode:: python + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def configure_optimizers(self): + return torch.optim.LBFGS(...) + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + + def closure(): + loss = self.compute_loss(batch) + opt.zero_grad() + self.manual_backward(loss) + return loss + + opt.step(closure=closure) + + +Multiple optimizers +------------------- .. testcode:: python - import torch - from torch import Tensor - from pytorch_lightning import LightningModule + import torch + from torch import Tensor + from pytorch_lightning import LightningModule + + class SimpleGAN(LightningModule): - class SimpleGAN(LightningModule): + def __init__(self): + super().__init__() + self.G = Generator() + self.D = Discriminator() - def __init__(self): - super().__init__() - self.G = Generator() - self.D = Discriminator() + # Important: This property activates manual optimization. 
+ self.automatic_optimization = False - # Important: This property activate ``manual optimization`` for this model - self.automatic_optimization = False + def sample_z(self, n) -> Tensor: + sample = self._Z.sample((n,)) + return sample - def sample_z(self, n) -> Tensor: - sample = self._Z.sample((n,)) - return sample + def sample_G(self, n) -> Tensor: + z = self.sample_z(n) + return self.G(z) - def sample_G(self, n) -> Tensor: - z = self.sample_z(n) - return self.G(z) + def training_step(self, batch, batch_idx): + # Implementation follows the PyTorch tutorial: + # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + g_opt, d_opt = self.optimizers() - def training_step(self, batch, batch_idx): - # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() + X, _ = batch + batch_size = X.shape[0] - X, _ = batch - batch_size = X.shape[0] + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) + g_X = self.sample_G(batch_size) - g_X = self.sample_G(batch_size) + ########################### + # Optimize Discriminator # + ########################### + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - ########################### - # Optimize Discriminator # - ########################### - d_opt.zero_grad() - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + errD = (errD_real + errD_fake) - errD = (errD_real + errD_fake) + d_opt.zero_grad() + self.manual_backward(errD) + d_opt.step() - self.manual_backward(errD) - d_opt.step() + ####################### + # Optimize Generator # + ####################### + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - ####################### - # Optimize Generator # - ####################### - g_opt.zero_grad() + g_opt.zero_grad() + self.manual_backward(errG) + g_opt.step() - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - self.manual_backward(errG) - g_opt.step() + def configure_optimizers(self): + g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) + d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) + return g_opt, d_opt - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - def configure_optimizers(self): - g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) - d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) - return g_opt, d_opt +Toggle models for faster training +--------------------------------- +If you're trying to: -.. note:: ``LightningOptimizer`` provides a ``toggle_model`` function as a ``@context_manager`` for advanced users. It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. +* use multiple optimizers +* accumulate gradients over batches + +:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a :func:`contextlib.contextmanager` for advanced users. +It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. 
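+
+As a minimal sketch (assuming two optimizers ``opt_a`` and ``opt_b`` returned from ``configure_optimizers``, and a hypothetical ``compute_loss_a`` helper), the context manager wraps the backward pass and step of the optimizer it belongs to:
+
+.. testcode:: python
+
+    def training_step(self, batch, batch_idx):
+        opt_a, opt_b = self.optimizers()
+
+        # Parameters owned only by `opt_b` get `requires_grad=False` inside this block,
+        # so no gradients are computed for them during this backward pass.
+        # Their `requires_grad` state is restored when the block exits.
+        with opt_a.toggle_model():
+            loss_a = self.compute_loss_a(batch)
+            self.manual_backward(loss_a)
+            opt_a.step()
+            opt_a.zero_grad()
+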
Here is an explanation of what it does: -Considering the current optimizer as A and all other optimizers as B. -Toggling means that all parameters from B exclusive to A will have their ``requires_grad`` attribute set to ``False``. Their original state will be restored when exiting the context manager. +* Considering the current optimizer as A and all other optimizers as B. +* Toggling means that all parameters from B exclusive to A will have their ``requires_grad`` attribute set to ``False``. +* Their original state will be restored when exiting the context manager. When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase. Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. - Here is an example for advanced use-case. .. testcode:: python - # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. + # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. - class SimpleGAN(LightningModule): + class SimpleGAN(LightningModule): - ... + def __init__(self): + super().__init__() + self.automatic_optimization = False - def __init__(self): - self.automatic_optimization = False + def training_step(self, batch, batch_idx): + # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + g_opt, d_opt = self.optimizers() - def training_step(self, batch, batch_idx): - # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() + X, _ = batch + X.requires_grad = True + batch_size = X.shape[0] - X, _ = batch - X.requires_grad = True - batch_size = X.shape[0] + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) + # Sync and clear gradients only at the end of accumulation. 
+ is_last_batch_to_accumulate = (batch_idx + 1) % 2 == 0 - accumulated_grad_batches = batch_idx % 2 == 0 + g_X = self.sample_G(batch_size) - g_X = self.sample_G(batch_size) + ########################### + # Optimize Discriminator # + ########################### + with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - ########################### - # Optimize Discriminator # - ########################### - with d_opt.toggle_model(sync_grad=accumulated_grad_batches): - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + errD = (errD_real + errD_fake) - errD = (errD_real + errD_fake) + self.manual_backward(errD) + if is_last_batch_to_accumulate: + d_opt.step() + d_opt.zero_grad() - self.manual_backward(errD) - if accumulated_grad_batches: - d_opt.step() - d_opt.zero_grad() + ####################### + # Optimize Generator # + ####################### + with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - ####################### - # Optimize Generator # - ####################### - with g_opt.toggle_model(sync_grad=accumulated_grad_batches): - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + self.manual_backward(errG) + if is_last_batch_to_accumulate: + g_opt.step() + g_opt.zero_grad() - self.manual_backward(errG) - if accumulated_grad_batches: - g_opt.step() - g_opt.zero_grad() + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) ------ Automatic optimization ====================== -With Lightning most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` -since Lightning automates that for you. +With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` since Lightning automates that for you. .. warning:: Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. @@ -273,27 +357,27 @@ In the case of multiple optimizers, Lightning does the following: Learning rate scheduling ------------------------ Every optimizer you use can be paired with any `Learning Rate Scheduler `_. -In the basic use-case, the scheduler (or multiple schedulers) should be returned as the second output from the ``.configure_optimizers`` method: +In the basic use-case, the scheduler(s) should be returned as the second output from the :meth:`~pytorch_lightning.core.LightningModule.configure_optimizers` method: .. testcode:: # no LR scheduler def configure_optimizers(self): - return Adam(...) + return Adam(...) # Adam + LR scheduler def configure_optimizers(self): - optimizer = Adam(...) - scheduler = LambdaLR(optimizer, ...) - return [optimizer], [scheduler] + optimizer = Adam(...) + scheduler = LambdaLR(optimizer, ...) + return [optimizer], [scheduler] # Two optimizers each with a scheduler def configure_optimizers(self): - optimizer1 = Adam(...) - optimizer2 = SGD(...) - scheduler1 = LambdaLR(optimizer1, ...) - scheduler2 = LambdaLR(optimizer2, ...) - return [optimizer1, optimizer2], [scheduler1, scheduler2] + optimizer1 = Adam(...) + optimizer2 = SGD(...) + scheduler1 = LambdaLR(optimizer1, ...) + scheduler2 = LambdaLR(optimizer2, ...) 
+ return [optimizer1, optimizer2], [scheduler1, scheduler2] When there are schedulers in which the ``.step()`` method is conditioned on a metric value (for example the :class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler), Lightning requires that the output @@ -304,11 +388,12 @@ set to metric that the scheduler should be conditioned on. # The ReduceLROnPlateau scheduler requires a monitor def configure_optimizers(self): - return { - 'optimizer': Adam(...), - 'lr_scheduler': ReduceLROnPlateau(optimizer, ...), - 'monitor': 'metric_to_track' - } + optimizer = Adam(...) + return { + 'optimizer': optimizer, + 'lr_scheduler': ReduceLROnPlateau(optimizer, ...), + 'monitor': 'metric_to_track', + } # In the case of two optimizers, only one using the ReduceLROnPlateau scheduler def configure_optimizers(self): @@ -321,12 +406,10 @@ set to metric that the scheduler should be conditioned on. {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, ) -.. note:: - Metrics can be made availble to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` - in your lightning module. +.. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your lightning module. -By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration should be -returned as a dict which can contain the following keywords: +By default, all schedulers will be called after each epoch ends. +To change this behaviour, a scheduler configuration should be returned as a dict which can contain the following keywords: * ``scheduler`` (required): the actual scheduler object * ``monitor`` (optional): metric to condition @@ -362,21 +445,21 @@ returned as a dict which can contain the following keywords: Use multiple optimizers (like GANs) ----------------------------------- -To use multiple optimizers return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers` +To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. .. testcode:: # one optimizer def configure_optimizers(self): - return Adam(...) + return Adam(...) # two optimizers, no schedulers def configure_optimizers(self): - return Adam(...), SGD(...) + return Adam(...), SGD(...) # Two optimizers, one scheduler for adam only def configure_optimizers(self): - return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} + return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} Lightning will call each optimizer sequentially: @@ -390,8 +473,8 @@ Lightning will call each optimizer sequentially: loss.backward() opt.step() - for lr_scheduler in lr_schedulers: - lr_scheduler.step() + for lr_scheduler in lr_schedulers: + lr_scheduler.step() ---------- @@ -404,86 +487,51 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch .. 
testcode:: - def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): - optimizer.zero_grad() - - # Alternating schedule for optimizer steps (ie: GANs) - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # update generator opt every 2 steps - if optimizer_idx == 0: - if batch_nb % 2 == 0 : - optimizer.step(closure=closure) + # Alternating schedule for optimizer steps (ie: GANs) + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # update generator opt every 2 steps + if optimizer_idx == 0: + if batch_nb % 2 == 0: + optimizer.step(closure=closure) - # update discriminator opt every 4 steps - if optimizer_idx == 1: - if batch_nb % 4 == 0 : + # update discriminator opt every 4 steps + if optimizer_idx == 1: + if batch_nb % 4 == 0: optimizer.step(closure=closure) Here we add a learning-rate warm up .. testcode:: - # learning rate warm-up - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # warm up lr - if self.trainer.global_step < 500: - lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) - for pg in optimizer.param_groups: - pg['lr'] = lr_scale * self.hparams.learning_rate + # learning rate warm-up + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # warm up lr + if self.trainer.global_step < 500: + lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) + for pg in optimizer.param_groups: + pg['lr'] = lr_scale * self.hparams.learning_rate - # update params - optimizer.step(closure=closure) + # update params + optimizer.step(closure=optimizer_closure) .. note:: The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches and much more ... .. testcode:: - # function hook in LightningModule - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - optimizer.step(closure=closure) + # function hook in LightningModule + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + optimizer.step(closure=optimizer_closure) .. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follow. .. testcode:: - # function hook in LightningModule - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - - # `optimizer is a ``LightningOptimizer`` wrapping the optimizer. - # To access it, do as follow: - optimizer = optimizer.optimizer - - # run step. However, it won't work on TPU, AMP, etc... - optimizer.step(closure=closure) - - ----------- - -Using the closure functions for optimization --------------------------------------------- - -When using optimization schemes such as LBFGS, the `second_order_closure` needs to be enabled. By default, this function is defined by wrapping the `training_step` and the backward steps as follows - -.. warning:: - Before 1.2.2, ``.zero_grad()`` was called outside the closure internally. 
- From 1.2.2, the closure calls ``.zero_grad()`` inside, so there is no need to define your own closure - when using similar optimizers to :class:`torch.optim.LBFGS` which requires reevaluation of the loss with the closure in ``optimizer.step()``. - -.. testcode:: - - def second_order_closure(pl_module, split_batch, batch_idx, opt_idx, optimizer, hidden): - # Model training step on a given batch - result = pl_module.training_step(split_batch, batch_idx, opt_idx, hidden) - - # Model backward pass - pl_module.backward(result, optimizer, opt_idx) - - # on_after_backward callback - pl_module.on_after_backward(result.training_step_output, batch_idx, result.loss) + # function hook in LightningModule + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - return result + # `optimizer` is a `LightningOptimizer` wrapping the optimizer. + # To access it, do as follow: + optimizer = optimizer.optimizer - # This default `second_order_closure` function can be enabled by passing it directly into the `optimizer.step` - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, second_order_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # update params - optimizer.step(second_order_closure) + # run step. However, it won't work on TPU, AMP, etc... + optimizer.step(closure=optimizer_closure) From 4477f46ee9fe002339d9a05f18470ebffb752d09 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Apr 2021 09:04:00 +0900 Subject: [PATCH 02/14] . --- docs/source/common/optimizers.rst | 234 ++++++++++++++++-------------- 1 file changed, 124 insertions(+), 110 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 3aa843daea1b8..c94bd33e81d89 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -12,7 +12,7 @@ For the majority of research cases, **automatic optimization** will do the right For advanced/expert users who want to do esoteric optimization schedules or techniques, use **manual optimization**. ------- +----- Manual optimization =================== @@ -27,7 +27,7 @@ To manually optimize, do the following: * Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule``'s ``__init__`` function. * Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. -Here is a basic example of manual optimization. +Here is a minimal example of manual optimization. .. testcode:: python @@ -49,22 +49,30 @@ Here is a basic example of manual optimization. self.manual_backward(loss) opt.step() +.. warning:: + Before 1.2, ``optimizer.step()`` was calling ``optimizer.zero_grad()`` internally. + From 1.2, it is left to the users expertise. -.. warning:: Before 1.2, ``optimzer.step()`` was calling ``optimizer.zero_grad()`` internally. From 1.2, it is left to the users expertise. - -.. tip:: ``self.optimizers()`` will return ``LightningOptimizer`` objects. You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to support accelerators and precision for you. - -.. tip:: Be careful where you call ``zero_grad`` or your model won't converge. It is good pratice to call ``zero_grad`` before ``manual_backward``. +.. tip:: + * ``self.optimizers()`` will return :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` objects. You can + access your own optimizer with ``optimizer.optimizer``. 
However, if you use your own optimizer to perform a step, + Lightning won't be able to support accelerators and precision for you. + * Be careful where you call ``zero_grad``, or your model won't converge. + It is good practice to call ``zero_grad`` before ``manual_backward``. +----- -Learning rate scheduling ------------------------- -From 1.3, Lightning supports learning rate scheduling in manual optimization. -``lr_scheduler.step()`` is disabled in manual optimization so that you can call it at arbitrary intervals. Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined in ``LightningModule.configure_optimizers()``. +Learning rate scheduling [manual] +--------------------------------- +You can obtain learning schedulers to call ``lr_scheduler.step()`` at arbitrary intervals. +Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined in ``LightningModule.configure_optimizers()``. -.. deprecated:: Before 1.3, ``lr_scheduler.step`` was called automatically in both manual and automatic optimization. +.. warning:: + * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored. + * Before 1.3, ``lr_scheduler.step()`` was automatically called in manual optimization. + From 1.3, ``lr_scheduler.step()`` is disabled so that you can call it at arbitrary intervals. -.. warning:: The learning rate scheduler keys are ignored in manual optimization. +Here is a example calling ``step()`` every step. .. testcode:: python @@ -87,6 +95,8 @@ From 1.3, Lightning supports learning rate scheduling in manual optimization. sch1.step() sch2.step() +If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the following. + .. testcode:: python def __init__(self): @@ -107,6 +117,7 @@ From 1.3, Lightning supports learning rate scheduling in manual optimization. if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: sch.step() +----- Gradient accumulation --------------------- @@ -132,38 +143,10 @@ To perform gradient accumulation with one optimizer, you can do as such. opt.step() opt.zero_grad() +----- -Use closure for LBFGS-like optimizers -------------------------------------- -It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. -It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure such as :class:`torch.optim.LBFGS`. -See `the PyTorch docs `_ for more details about the closure. - -Here is an example using a closure function. - -.. testcode:: python - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def configure_optimizers(self): - return torch.optim.LBFGS(...) - - def training_step(self, batch, batch_idx): - opt = self.optimizers() - - def closure(): - loss = self.compute_loss(batch) - opt.zero_grad() - self.manual_backward(loss) - return loss - - opt.step(closure=closure) - - -Multiple optimizers -------------------- +Use multiple optimizers [manual] +-------------------------------- .. 
testcode:: python @@ -202,9 +185,9 @@ Multiple optimizers g_X = self.sample_G(batch_size) - ########################### - # Optimize Discriminator # - ########################### + ########################## + # Optimize Discriminator # + ########################## d_x = self.D(X) errD_real = self.criterion(d_x, real_label) @@ -217,9 +200,9 @@ Multiple optimizers self.manual_backward(errD) d_opt.step() - ####################### - # Optimize Generator # - ####################### + ###################### + # Optimize Generator # + ###################### d_z = self.D(g_X) errG = self.criterion(d_z, real_label) @@ -234,16 +217,12 @@ Multiple optimizers d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) return g_opt, d_opt +----- -Toggle models for faster training ---------------------------------- -If you're trying to: - -* use multiple optimizers -* accumulate gradients over batches - -:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a :func:`contextlib.contextmanager` for advanced users. -It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. +Improve training time with model toggling +----------------------------------------- +Toggling models can improve your training speed when performing gradient accumulation with multiple optimizers +in a distributed setting. Here is an explanation of what it does: @@ -254,12 +233,13 @@ Here is an explanation of what it does: When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase. Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. +:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a :func:`contextlib.contextmanager` for advanced users. + Here is an example for advanced use-case. .. testcode:: python # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. - class SimpleGAN(LightningModule): def __init__(self): @@ -267,7 +247,8 @@ Here is an example for advanced use-case. self.automatic_optimization = False def training_step(self, batch, batch_idx): - # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + # Implementation follows the PyTorch tutorial: + # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html g_opt, d_opt = self.optimizers() X, _ = batch @@ -282,9 +263,9 @@ Here is an example for advanced use-case. g_X = self.sample_G(batch_size) - ########################### - # Optimize Discriminator # - ########################### + ########################## + # Optimize Discriminator # + ########################## with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): d_x = self.D(X) errD_real = self.criterion(d_x, real_label) @@ -299,9 +280,9 @@ Here is an example for advanced use-case. d_opt.step() d_opt.zero_grad() - ####################### - # Optimize Generator # - ####################### + ###################### + # Optimize Generator # + ###################### with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): d_z = self.D(g_X) errG = self.criterion(d_z, real_label) @@ -313,6 +294,35 @@ Here is an example for advanced use-case. 
self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) +----- + +Use closure for LBFGS-like optimizers +------------------------------------- +It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. +It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure such as :class:`torch.optim.LBFGS`. +See `the PyTorch docs `_ for more details about the closure. + +Here is an example using a closure function. + +.. testcode:: python + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def configure_optimizers(self): + return torch.optim.LBFGS(...) + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + + def closure(): + loss = self.compute_loss(batch) + opt.zero_grad() + self.manual_backward(loss) + return loss + + opt.step(closure=closure) ------ @@ -320,11 +330,7 @@ Automatic optimization ====================== With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` since Lightning automates that for you. -.. warning:: - Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. - From 1.2.2, Lightning calls ``.zero_grad()`` before ``.backward()``. - -Under the hood Lightning does the following: +Under the hood, Lightning does the following: .. code-block:: python @@ -353,11 +359,16 @@ In the case of multiple optimizers, Lightning does the following: for lr_scheduler in lr_schedulers: lr_scheduler.step() +.. warning:: + Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. + From 1.2.2, Lightning calls ``.zero_grad()`` before ``.backward()``. + +----- Learning rate scheduling ------------------------ Every optimizer you use can be paired with any `Learning Rate Scheduler `_. -In the basic use-case, the scheduler(s) should be returned as the second output from the :meth:`~pytorch_lightning.core.LightningModule.configure_optimizers` method: +In the basic use-case, the scheduler(s) should be returned as the second output from the :meth:`~pytorch_lightning.LightningModule.configure_optimizers` method: .. testcode:: @@ -381,8 +392,8 @@ In the basic use-case, the scheduler(s) should be returned as the second output When there are schedulers in which the ``.step()`` method is conditioned on a metric value (for example the :class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler), Lightning requires that the output -from ``configure_optimizers`` should be dicts, one for each optimizer, with the keyword ``monitor`` -set to metric that the scheduler should be conditioned on. +from :meth:`~pytorch_lightning.LightningModule.configure_optimizers` should be dicts, one for each optimizer, +with the keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. .. testcode:: @@ -406,24 +417,25 @@ set to metric that the scheduler should be conditioned on. {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, ) -.. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your lightning module. +.. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.LightningModule`. By default, all schedulers will be called after each epoch ends. 
-To change this behaviour, a scheduler configuration should be returned as a dict which can contain the following keywords: +To change this behaviour, +a scheduler configuration should be returned as a dict which can contain the following keywords: -* ``scheduler`` (required): the actual scheduler object -* ``monitor`` (optional): metric to condition -* ``interval`` (optional): either ``epoch`` (default) for stepping after each epoch ends or ``step`` for stepping +* ``"scheduler"`` (required): the actual scheduler object +* ``"monitor"`` (optional): metric to condition +* ``"interval"`` (optional): either ``epoch`` (default) for stepping after each epoch ends or ``step`` for stepping after each optimization step -* ``frequency`` (optional): how many epochs/steps should pass between calls to ``scheduler.step()``. Default is 1, +* ``"frequency"`` (optional): how many epochs/steps should pass between calls to ``scheduler.step()``. Default is 1, corresponding to updating the learning rate after every epoch/step. -* ``strict`` (optional): if set to ``True`` will enforce that value specified in ``monitor`` is available while trying - to call ``scheduler.step()``, and stop training if not found. If ``False`` will only give a warning and continue training - (without calling the scheduler). -* ``name`` (optional): if using the :class:`~pytorch_lightning.callbacks.LearningRateMonitor` callback to monitor the - learning rate progress, this keyword can be used to specify a specific name the learning rate should be logged as. +* ``"strict"`` (optional): if set to ``True`` will enforce that value specified in ``monitor`` is available while trying + to call ``scheduler.step()``, and stop training if not found. If ``False`` will only give a warning and continue + training without calling the scheduler. +* ``"name"`` (optional): if using the :class:`~pytorch_lightning.callbacks.LearningRateMonitor` callback to monitor the + learning rate progress, this keyword can be used to specify a name the learning rate should be logged as. -.. testcode:: +.. testcode:: python # Same as the above example with additional params passed to the first scheduler # In this case the ReduceLROnPlateau will step after every 10 processed batches @@ -441,27 +453,29 @@ To change this behaviour, a scheduler configuration should be returned as a dict ] return optimizers, schedulers ----------- +----- -Use multiple optimizers (like GANs) ------------------------------------ +Multiple optimizers (e.g. GANs) +------------------------------- To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. -.. testcode:: - - # one optimizer - def configure_optimizers(self): - return Adam(...) +.. testcode:: python # two optimizers, no schedulers def configure_optimizers(self): return Adam(...), SGD(...) - # Two optimizers, one scheduler for adam only + # two optimizers, one scheduler for adam only def configure_optimizers(self): return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} -Lightning will call each optimizer sequentially: + # two optimizers, two schedulers + def configure_optimizers(self): + opt1 = Adam(...) + opt2 = SGD(...) + return [opt1, opt2], [StepLR(opt1, ...), OneCycleLR(opt2, ...)] + +Under the hood, Lightning will call each optimizer sequentially: .. 
code-block:: python @@ -476,36 +490,36 @@ Lightning will call each optimizer sequentially: for lr_scheduler in lr_schedulers: lr_scheduler.step() ----------- +----- Step optimizers at arbitrary intervals -------------------------------------- To do more interesting things with your optimizers such as learning rate warm-up or odd scheduling, -override the :meth:`optimizer_step` function. +override the :meth:`~pytorch_lightning.LightningModule.optimizer_step` function. -For example, here step optimizer A every 2 batches and optimizer B every 4 batches +For example, here step optimizer A every 2 batches and optimizer B every 4 batches. .. testcode:: - # Alternating schedule for optimizer steps (ie: GANs) + # Alternating schedule for optimizer steps (e.g. GANs) def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): # update generator opt every 2 steps if optimizer_idx == 0: - if batch_nb % 2 == 0: - optimizer.step(closure=closure) + if batch_idx % 2 == 0: + optimizer.step(closure=optimizer_closure) # update discriminator opt every 4 steps if optimizer_idx == 1: - if batch_nb % 4 == 0: - optimizer.step(closure=closure) + if batch_idx % 4 == 0: + optimizer.step(closure=optimizer_closure) -Here we add a learning-rate warm up +Here we add a learning rate warm-up. .. testcode:: # learning rate warm-up def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # warm up lr + # skip the first 500 steps if self.trainer.global_step < 500: lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) for pg in optimizer.param_groups: @@ -514,9 +528,9 @@ Here we add a learning-rate warm up # update params optimizer.step(closure=optimizer_closure) -.. note:: The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches and much more ... +.. note:: The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, gradient accumulation and much more ... -.. testcode:: +.. testcode:: python # function hook in LightningModule def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): @@ -524,7 +538,7 @@ Here we add a learning-rate warm up .. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follow. -.. testcode:: +.. testcode:: python # function hook in LightningModule def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): From 05b6304ab898bd72e537e916c1a8658c7efb38a1 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Apr 2021 09:26:01 +0900 Subject: [PATCH 03/14] Fix link to the section --- docs/source/common/optimizers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index c94bd33e81d89..ee3c405cffa1c 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -455,7 +455,7 @@ a scheduler configuration should be returned as a dict which can contain the fol ----- -Multiple optimizers (e.g. 
GANs) +Use multiple optimizers (like GANs) ------------------------------- To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. From cf3074161647fe6f91b39664dba48fae9d07f4ff Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Apr 2021 09:29:11 +0900 Subject: [PATCH 04/14] Fix link to the section --- docs/source/common/optimizers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index ee3c405cffa1c..94f77837c1e08 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -456,7 +456,7 @@ a scheduler configuration should be returned as a dict which can contain the fol ----- Use multiple optimizers (like GANs) -------------------------------- +----------------------------------- To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. .. testcode:: python From 1627da4d6ab4e7202203d42716d805c4fd3dcf3b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Apr 2021 09:42:09 +0900 Subject: [PATCH 05/14] Consistent indent --- docs/source/common/optimizers.rst | 473 +++++++++++++++--------------- 1 file changed, 236 insertions(+), 237 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 94f77837c1e08..39b56666900f4 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -31,23 +31,23 @@ Here is a minimal example of manual optimization. .. testcode:: python - from pytorch_lightning import LightningModule + from pytorch_lightning import LightningModule - class MyModel(LightningModule): + class MyModel(LightningModule): - def __init__(self): - super().__init__() - # Important: This property activates manual optimization. - self.automatic_optimization = False + def __init__(self): + super().__init__() + # Important: This property activates manual optimization. + self.automatic_optimization = False - def training_step(batch, batch_idx): - opt = self.optimizers() + def training_step(batch, batch_idx): + opt = self.optimizers() - loss = self.compute_loss(batch) + loss = self.compute_loss(batch) - opt.zero_grad() - self.manual_backward(loss) - opt.step() + opt.zero_grad() + self.manual_backward(loss) + opt.step() .. warning:: Before 1.2, ``optimizer.step()`` was calling ``optimizer.zero_grad()`` internally. @@ -76,46 +76,46 @@ Here is a example calling ``step()`` every step. .. testcode:: python - # step every batch + # step every batch - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - # do foward, backward, and optimization - ... + def training_step(self, batch, batch_idx): + # do foward, backward, and optimization + ... - # single scheduler - sch = self.lr_schedulers() - sch.step() + # single scheduler + sch = self.lr_schedulers() + sch.step() - # multiple schedulers - sch1, sch2 = self.lr_schedulers() - sch1.step() - sch2.step() + # multiple schedulers + sch1, sch2 = self.lr_schedulers() + sch1.step() + sch2.step() If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the following. .. 
testcode:: python - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - # do forward, backward, and optimization - ... + def training_step(self, batch, batch_idx): + # do forward, backward, and optimization + ... - sch = self.lr_schedulers() + sch = self.lr_schedulers() - # step every `n` batches - if (batch_idx + 1) % n == 0: - sch.step() + # step every `n` batches + if (batch_idx + 1) % n == 0: + sch.step() - # step every `n` epochs - if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: - sch.step() + # step every `n` epochs + if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: + sch.step() ----- @@ -126,22 +126,22 @@ To perform gradient accumulation with one optimizer, you can do as such. .. testcode:: python - # accumulate gradients over 2 batches + # accumulate gradients over 2 batches - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - opt = self.optimizers() + def training_step(self, batch, batch_idx): + opt = self.optimizers() - loss = self.compute_loss(batch) - self.manual_backward(loss) + loss = self.compute_loss(batch) + self.manual_backward(loss) - # accumulate gradients of 2 batches - if (batch_idx + 1) % 2 == 0: - opt.step() - opt.zero_grad() + # accumulate gradients of 2 batches + if (batch_idx + 1) % 2 == 0: + opt.step() + opt.zero_grad() ----- @@ -150,72 +150,71 @@ Use multiple optimizers [manual] .. testcode:: python - import torch - from torch import Tensor - from pytorch_lightning import LightningModule + import torch + from torch import Tensor + from pytorch_lightning import LightningModule - class SimpleGAN(LightningModule): + class SimpleGAN(LightningModule): + def __init__(self): + super().__init__() + self.G = Generator() + self.D = Discriminator() - def __init__(self): - super().__init__() - self.G = Generator() - self.D = Discriminator() + # Important: This property activates manual optimization. + self.automatic_optimization = False - # Important: This property activates manual optimization. 
- self.automatic_optimization = False + def sample_z(self, n) -> Tensor: + sample = self._Z.sample((n,)) + return sample - def sample_z(self, n) -> Tensor: - sample = self._Z.sample((n,)) - return sample + def sample_G(self, n) -> Tensor: + z = self.sample_z(n) + return self.G(z) - def sample_G(self, n) -> Tensor: - z = self.sample_z(n) - return self.G(z) + def training_step(self, batch, batch_idx): + # Implementation follows the PyTorch tutorial: + # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + g_opt, d_opt = self.optimizers() - def training_step(self, batch, batch_idx): - # Implementation follows the PyTorch tutorial: - # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() + X, _ = batch + batch_size = X.shape[0] - X, _ = batch - batch_size = X.shape[0] + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) + g_X = self.sample_G(batch_size) - g_X = self.sample_G(batch_size) + ########################## + # Optimize Discriminator # + ########################## + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - ########################## - # Optimize Discriminator # - ########################## - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + errD = (errD_real + errD_fake) - errD = (errD_real + errD_fake) + d_opt.zero_grad() + self.manual_backward(errD) + d_opt.step() - d_opt.zero_grad() - self.manual_backward(errD) - d_opt.step() + ###################### + # Optimize Generator # + ###################### + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - ###################### - # Optimize Generator # - ###################### - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + g_opt.zero_grad() + self.manual_backward(errG) + g_opt.step() - g_opt.zero_grad() - self.manual_backward(errG) - g_opt.step() + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - - def configure_optimizers(self): - g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) - d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) - return g_opt, d_opt + def configure_optimizers(self): + g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) + d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) + return g_opt, d_opt ----- @@ -239,60 +238,60 @@ Here is an example for advanced use-case. .. testcode:: python - # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. - class SimpleGAN(LightningModule): + # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. 
+ class SimpleGAN(LightningModule): - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - # Implementation follows the PyTorch tutorial: - # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() + def training_step(self, batch, batch_idx): + # Implementation follows the PyTorch tutorial: + # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + g_opt, d_opt = self.optimizers() - X, _ = batch - X.requires_grad = True - batch_size = X.shape[0] + X, _ = batch + X.requires_grad = True + batch_size = X.shape[0] - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - # Sync and clear gradients only at the end of accumulation. - is_last_batch_to_accumulate = (batch_idx + 1) % 2 == 0 + # Sync and clear gradients only at the end of accumulation. + is_last_batch_to_accumulate = (batch_idx + 1) % 2 == 0 - g_X = self.sample_G(batch_size) + g_X = self.sample_G(batch_size) - ########################## - # Optimize Discriminator # - ########################## - with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + ########################## + # Optimize Discriminator # + ########################## + with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - errD = (errD_real + errD_fake) + errD = (errD_real + errD_fake) - self.manual_backward(errD) - if is_last_batch_to_accumulate: - d_opt.step() - d_opt.zero_grad() + self.manual_backward(errD) + if is_last_batch_to_accumulate: + d_opt.step() + d_opt.zero_grad() - ###################### - # Optimize Generator # - ###################### - with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + ###################### + # Optimize Generator # + ###################### + with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - self.manual_backward(errG) - if is_last_batch_to_accumulate: - g_opt.step() - g_opt.zero_grad() + self.manual_backward(errG) + if is_last_batch_to_accumulate: + g_opt.step() + g_opt.zero_grad() - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) ----- @@ -306,23 +305,23 @@ Here is an example using a closure function. .. testcode:: python - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def configure_optimizers(self): - return torch.optim.LBFGS(...) + def configure_optimizers(self): + return torch.optim.LBFGS(...) 
- def training_step(self, batch, batch_idx): - opt = self.optimizers() + def training_step(self, batch, batch_idx): + opt = self.optimizers() - def closure(): - loss = self.compute_loss(batch) - opt.zero_grad() - self.manual_backward(loss) - return loss + def closure(): + loss = self.compute_loss(batch) + opt.zero_grad() + self.manual_backward(loss) + return loss - opt.step(closure=closure) + opt.step(closure=closure) ------ @@ -397,25 +396,25 @@ with the keyword ``"monitor"`` set to metric that the scheduler should be condit .. testcode:: - # The ReduceLROnPlateau scheduler requires a monitor - def configure_optimizers(self): - optimizer = Adam(...) - return { - 'optimizer': optimizer, - 'lr_scheduler': ReduceLROnPlateau(optimizer, ...), - 'monitor': 'metric_to_track', - } + # The ReduceLROnPlateau scheduler requires a monitor + def configure_optimizers(self): + optimizer = Adam(...) + return { + 'optimizer': optimizer, + 'lr_scheduler': ReduceLROnPlateau(optimizer, ...), + 'monitor': 'metric_to_track', + } - # In the case of two optimizers, only one using the ReduceLROnPlateau scheduler - def configure_optimizers(self): - optimizer1 = Adam(...) - optimizer2 = SGD(...) - scheduler1 = ReduceLROnPlateau(optimizer1, ...) - scheduler2 = LambdaLR(optimizer2, ...) - return ( - {'optimizer': optimizer1, 'lr_scheduler': scheduler1, 'monitor': 'metric_to_track'}, - {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, - ) + # In the case of two optimizers, only one using the ReduceLROnPlateau scheduler + def configure_optimizers(self): + optimizer1 = Adam(...) + optimizer2 = SGD(...) + scheduler1 = ReduceLROnPlateau(optimizer1, ...) + scheduler2 = LambdaLR(optimizer2, ...) + return ( + {'optimizer': optimizer1, 'lr_scheduler': scheduler1, 'monitor': 'metric_to_track'}, + {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, + ) .. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.LightningModule`. @@ -437,21 +436,21 @@ a scheduler configuration should be returned as a dict which can contain the fol .. testcode:: python - # Same as the above example with additional params passed to the first scheduler - # In this case the ReduceLROnPlateau will step after every 10 processed batches - def configure_optimizers(self): - optimizers = [Adam(...), SGD(...)] - schedulers = [ - { - 'scheduler': ReduceLROnPlateau(optimizers[0], ...), - 'monitor': 'metric_to_track', - 'interval': 'step', - 'frequency': 10, - 'strict': True, - }, - LambdaLR(optimizers[1], ...) - ] - return optimizers, schedulers + # Same as the above example with additional params passed to the first scheduler + # In this case the ReduceLROnPlateau will step after every 10 processed batches + def configure_optimizers(self): + optimizers = [Adam(...), SGD(...)] + schedulers = [ + { + 'scheduler': ReduceLROnPlateau(optimizers[0], ...), + 'monitor': 'metric_to_track', + 'interval': 'step', + 'frequency': 10, + 'strict': True, + }, + LambdaLR(optimizers[1], ...) + ] + return optimizers, schedulers ----- @@ -461,34 +460,34 @@ To use multiple optimizers, return two or more optimizers from :meth:`pytorch_li .. testcode:: python - # two optimizers, no schedulers - def configure_optimizers(self): - return Adam(...), SGD(...) + # two optimizers, no schedulers + def configure_optimizers(self): + return Adam(...), SGD(...) 
- # two optimizers, one scheduler for adam only - def configure_optimizers(self): - return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} + # two optimizers, one scheduler for adam only + def configure_optimizers(self): + return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} - # two optimizers, two schedulers - def configure_optimizers(self): - opt1 = Adam(...) - opt2 = SGD(...) - return [opt1, opt2], [StepLR(opt1, ...), OneCycleLR(opt2, ...)] + # two optimizers, two schedulers + def configure_optimizers(self): + opt1 = Adam(...) + opt2 = SGD(...) + return [opt1, opt2], [StepLR(opt1, ...), OneCycleLR(opt2, ...)] Under the hood, Lightning will call each optimizer sequentially: .. code-block:: python - for epoch in epochs: - for batch in data: - for opt in optimizers: - loss = train_step(batch, batch_idx, optimizer_idx) - opt.zero_grad() - loss.backward() - opt.step() + for epoch in epochs: + for batch in data: + for opt in optimizers: + loss = train_step(batch, batch_idx, optimizer_idx) + opt.zero_grad() + loss.backward() + opt.step() - for lr_scheduler in lr_schedulers: - lr_scheduler.step() + for lr_scheduler in lr_schedulers: + lr_scheduler.step() ----- @@ -501,51 +500,51 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch .. testcode:: - # Alternating schedule for optimizer steps (e.g. GANs) - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # update generator opt every 2 steps - if optimizer_idx == 0: - if batch_idx % 2 == 0: - optimizer.step(closure=optimizer_closure) - - # update discriminator opt every 4 steps - if optimizer_idx == 1: - if batch_idx % 4 == 0: + # Alternating schedule for optimizer steps (e.g. GANs) + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # update generator opt every 2 steps + if optimizer_idx == 0: + if batch_idx % 2 == 0: optimizer.step(closure=optimizer_closure) + # update discriminator opt every 4 steps + if optimizer_idx == 1: + if batch_idx % 4 == 0: + optimizer.step(closure=optimizer_closure) + Here we add a learning rate warm-up. .. testcode:: - # learning rate warm-up - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # skip the first 500 steps - if self.trainer.global_step < 500: - lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) - for pg in optimizer.param_groups: - pg['lr'] = lr_scale * self.hparams.learning_rate + # learning rate warm-up + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # skip the first 500 steps + if self.trainer.global_step < 500: + lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) + for pg in optimizer.param_groups: + pg['lr'] = lr_scale * self.hparams.learning_rate - # update params - optimizer.step(closure=optimizer_closure) + # update params + optimizer.step(closure=optimizer_closure) .. note:: The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, gradient accumulation and much more ... .. 
testcode:: python - # function hook in LightningModule - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - optimizer.step(closure=optimizer_closure) + # function hook in LightningModule + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + optimizer.step(closure=optimizer_closure) .. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follow. .. testcode:: python - # function hook in LightningModule - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # function hook in LightningModule + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # `optimizer` is a `LightningOptimizer` wrapping the optimizer. - # To access it, do as follow: - optimizer = optimizer.optimizer + # `optimizer` is a `LightningOptimizer` wrapping the optimizer. + # To access it, do as follow: + optimizer = optimizer.optimizer - # run step. However, it won't work on TPU, AMP, etc... - optimizer.step(closure=optimizer_closure) + # run step. However, it won't work on TPU, AMP, etc... + optimizer.step(closure=optimizer_closure) From 43e8ee692be9cada7966ac3021d667201dc90a9c Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 14 Apr 2021 06:01:32 +0900 Subject: [PATCH 06/14] Update docs --- docs/source/common/optimizers.rst | 250 +++++++++++++++++------------- 1 file changed, 140 insertions(+), 110 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 39b56666900f4..57098738b02e6 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -8,7 +8,8 @@ Lightning offers two modes for managing the optimization process: - automatic optimization - manual optimization -For the majority of research cases, **automatic optimization** will do the right thing for you and it is what most users should use. +For the majority of research cases, **automatic optimization** will do the right thing for you and it is what most +users should use. For advanced/expert users who want to do esoteric optimization schedules or techniques, use **manual optimization**. @@ -16,7 +17,8 @@ For advanced/expert users who want to do esoteric optimization schedules or tech Manual optimization =================== -For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable to manually manage the optimization process. +For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable to +manually manage the optimization process. This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. @@ -24,8 +26,12 @@ The users are left with ``optimizer.zero_grad()``, gradient accumulation, model To manually optimize, do the following: -* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule``'s ``__init__`` function. -* Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. +* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule`` ``__init__`` function. 
+* Use the following functions and call them manually: + + * ``optimizer.zero_grad()`` to clear the gradients from the previous training step + * ``self.manual_backward(loss)`` instead of ``loss.backward()`` + * ``optimizer.step()`` to update your model parameters Here is a minimal example of manual optimization. @@ -42,10 +48,8 @@ Here is a minimal example of manual optimization. def training_step(batch, batch_idx): opt = self.optimizers() - - loss = self.compute_loss(batch) - opt.zero_grad() + loss = self.compute_loss(batch) self.manual_backward(loss) opt.step() @@ -57,77 +61,20 @@ Here is a minimal example of manual optimization. * ``self.optimizers()`` will return :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` objects. You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to support accelerators and precision for you. - * Be careful where you call ``zero_grad``, or your model won't converge. - It is good practice to call ``zero_grad`` before ``manual_backward``. - ------ - -Learning rate scheduling [manual] ---------------------------------- -You can obtain learning schedulers to call ``lr_scheduler.step()`` at arbitrary intervals. -Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined in ``LightningModule.configure_optimizers()``. - -.. warning:: - * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored. - * Before 1.3, ``lr_scheduler.step()`` was automatically called in manual optimization. - From 1.3, ``lr_scheduler.step()`` is disabled so that you can call it at arbitrary intervals. - -Here is a example calling ``step()`` every step. - -.. testcode:: python - - # step every batch - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def training_step(self, batch, batch_idx): - # do foward, backward, and optimization - ... - - # single scheduler - sch = self.lr_schedulers() - sch.step() - - # multiple schedulers - sch1, sch2 = self.lr_schedulers() - sch1.step() - sch2.step() - -If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the following. - -.. testcode:: python - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def training_step(self, batch, batch_idx): - # do forward, backward, and optimization - ... - - sch = self.lr_schedulers() - - # step every `n` batches - if (batch_idx + 1) % n == 0: - sch.step() - - # step every `n` epochs - if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: - sch.step() + * Be careful where you call ``optimizer.zero_grad()``, or your model won't converge. + It is good practice to call ``optimizer.zero_grad()`` before ``self.manual_backward(loss)``. ----- Gradient accumulation --------------------- -You can accumulate gradients over batches similarly to :attr:`~pytorch_lightning.trainer.Trainer.accumulate_grad_batches` of automatic optimization. +You can accumulate gradients over batches similarly to +:attr:`~pytorch_lightning.trainer.Trainer.accumulate_grad_batches` of automatic optimization. To perform gradient accumulation with one optimizer, you can do as such. .. testcode:: python - # accumulate gradients over 2 batches - + # accumulate gradients over `n` batches def __init__(self): super().__init__() self.automatic_optimization = False @@ -138,15 +85,16 @@ To perform gradient accumulation with one optimizer, you can do as such. 
loss = self.compute_loss(batch) self.manual_backward(loss) - # accumulate gradients of 2 batches - if (batch_idx + 1) % 2 == 0: + # accumulate gradients of `n` batches + if (batch_idx + 1) % n == 0: opt.step() opt.zero_grad() ----- -Use multiple optimizers [manual] --------------------------------- +Use multiple optimizers (like GANs) [manual] +-------------------------------------------- +Here is an example training a simple GAN with multiple optimizers. .. testcode:: python @@ -218,10 +166,68 @@ Use multiple optimizers [manual] ----- -Improve training time with model toggling ------------------------------------------ -Toggling models can improve your training speed when performing gradient accumulation with multiple optimizers -in a distributed setting. +Learning rate scheduling [manual] +--------------------------------- +You can call ``lr_scheduler.step()`` at arbitrary intervals. +Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined +in your ``LightningModule.configure_optimizers()``. + +.. warning:: + * Before 1.3, Lightning automatically calls ``lr_scheduler.step()`` in both automatic and manual optimization. From + 1.3, ``lr_scheduler.step()`` is disabled in manual optimization so that you can call it at arbitrary intervals. + * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored even if they are provided in + your ``configure_optimizers()`` in manual optimization. + +Here is an example calling ``lr_scheduler.step()`` every step. + +.. testcode:: python + + # step every batch + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx): + # do forward, backward, and optimization + ... + + # single scheduler + sch = self.lr_schedulers() + sch.step() + + # multiple schedulers + sch1, sch2 = self.lr_schedulers() + sch1.step() + sch2.step() + +If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the following. + +.. testcode:: python + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx): + # do forward, backward, and optimization + ... + + sch = self.lr_schedulers() + + # step every `n` batches + if (batch_idx + 1) % n == 0: + sch.step() + + # step every `n` epochs + if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: + sch.step() + +----- + +Improve training speed with model toggling +------------------------------------------ +Toggling models can improve your training speed when performing gradient accumulation with multiple optimizers in a +distributed setting. Here is an explanation of what it does: @@ -232,7 +238,9 @@ Here is an explanation of what it does: When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase. Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. -:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a :func:`contextlib.contextmanager` for advanced users. +:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a +:meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a +:func:`contextlib.contextmanager` for advanced users. Here is an example for advanced use-case. @@ -257,8 +265,11 @@ Here is an example for advanced use-case. 
real_label = torch.ones((batch_size, 1), device=self.device) fake_label = torch.zeros((batch_size, 1), device=self.device) - # Sync and clear gradients only at the end of accumulation. - is_last_batch_to_accumulate = (batch_idx + 1) % 2 == 0 + # Sync and clear gradients + # at the end of accumulation or + # at the end of an epoch. + is_last_batch_to_accumulate = \ + (batch_idx + 1) % 2 == 0 or self.trainer.is_last_batch g_X = self.sample_G(batch_size) @@ -297,9 +308,11 @@ Here is an example for advanced use-case. Use closure for LBFGS-like optimizers ------------------------------------- -It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. -It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure such as :class:`torch.optim.LBFGS`. -See `the PyTorch docs `_ for more details about the closure. +It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and +``backward`` of your model. It is optional for most optimizers, but makes your code compatible if you switch to an +optimizer which requires a closure, such as :class:`torch.optim.LBFGS`. + +See `the PyTorch docs `_ for more about the closure. Here is an example using a closure function. @@ -327,7 +340,8 @@ Here is an example using a closure function. Automatic optimization ====================== -With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` since Lightning automates that for you. +With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` +since Lightning automates that for you. Under the hood, Lightning does the following: @@ -359,17 +373,19 @@ In the case of multiple optimizers, Lightning does the following: lr_scheduler.step() .. warning:: - Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. - From 1.2.2, Lightning calls ``.zero_grad()`` before ``.backward()``. + Before 1.2.2, Lightning internally calls ``backward``, ``step`` and ``zero_grad`` in the order. + From 1.2.2, the order is changed to ``zero_grad``, ``backward`` and ``step``. ----- Learning rate scheduling ------------------------ -Every optimizer you use can be paired with any `Learning Rate Scheduler `_. -In the basic use-case, the scheduler(s) should be returned as the second output from the :meth:`~pytorch_lightning.LightningModule.configure_optimizers` method: +Every optimizer you use can be paired with any +`Learning Rate Scheduler `_. In the basic +use-case, the scheduler(s) should be returned as the second output from the +:meth:`~pytorch_lightning.LightningModule.configure_optimizers` method: -.. testcode:: +.. testcode:: python # no LR scheduler def configure_optimizers(self): @@ -389,10 +405,10 @@ In the basic use-case, the scheduler(s) should be returned as the second output scheduler2 = LambdaLR(optimizer2, ...) return [optimizer1, optimizer2], [scheduler1, scheduler2] -When there are schedulers in which the ``.step()`` method is conditioned on a metric value (for example the -:class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler), Lightning requires that the output -from :meth:`~pytorch_lightning.LightningModule.configure_optimizers` should be dicts, one for each optimizer, -with the keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. 
+When there are schedulers in which the ``.step()`` method is conditioned on a metric value, such as the +:class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler, Lightning requires that the output from +:meth:`~pytorch_lightning.LightningModule.configure_optimizers` should be dicts, one for each optimizer, with the +keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. .. testcode:: @@ -416,21 +432,22 @@ with the keyword ``"monitor"`` set to metric that the scheduler should be condit {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, ) -.. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.LightningModule`. +.. note:: + Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` + in your :class:`~pytorch_lightning.LightningModule`. -By default, all schedulers will be called after each epoch ends. -To change this behaviour, -a scheduler configuration should be returned as a dict which can contain the following keywords: +By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration +should be returned as a dict which can contain the following keywords: * ``"scheduler"`` (required): the actual scheduler object * ``"monitor"`` (optional): metric to condition -* ``"interval"`` (optional): either ``epoch`` (default) for stepping after each epoch ends or ``step`` for stepping +* ``"interval"`` (optional): either ``"epoch"`` (default) for stepping after each epoch ends or ``"step"`` for stepping after each optimization step * ``"frequency"`` (optional): how many epochs/steps should pass between calls to ``scheduler.step()``. Default is 1, corresponding to updating the learning rate after every epoch/step. -* ``"strict"`` (optional): if set to ``True`` will enforce that value specified in ``monitor`` is available while trying - to call ``scheduler.step()``, and stop training if not found. If ``False`` will only give a warning and continue - training without calling the scheduler. +* ``"strict"`` (optional): if set to ``True``, will enforce that value specified in ``"monitor"`` is available while + trying to call ``scheduler.step()``, and stop training if not found. If ``False``, it will only give a warning and + continue training without calling the scheduler. * ``"name"`` (optional): if using the :class:`~pytorch_lightning.callbacks.LearningRateMonitor` callback to monitor the learning rate progress, this keyword can be used to specify a name the learning rate should be logged as. @@ -456,7 +473,8 @@ a scheduler configuration should be returned as a dict which can contain the fol Use multiple optimizers (like GANs) ----------------------------------- -To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. +To use multiple optimizers, return two or more optimizers from +:meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. .. testcode:: python @@ -498,10 +516,13 @@ override the :meth:`~pytorch_lightning.LightningModule.optimizer_step` function. For example, here step optimizer A every 2 batches and optimizer B every 4 batches. -.. testcode:: +.. testcode:: python # Alternating schedule for optimizer steps (e.g. 
GANs) - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False, + ): # update generator opt every 2 steps if optimizer_idx == 0: if batch_idx % 2 == 0: @@ -514,10 +535,13 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch Here we add a learning rate warm-up. -.. testcode:: +.. testcode:: python # learning rate warm-up - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False, + ): # skip the first 500 steps if self.trainer.global_step < 500: lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) @@ -527,7 +551,10 @@ Here we add a learning rate warm-up. # update params optimizer.step(closure=optimizer_closure) -.. note:: The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, gradient accumulation and much more ... +.. note:: + The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal + :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, + gradient accumulation and much more ... .. testcode:: python @@ -535,13 +562,16 @@ Here we add a learning rate warm-up. def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): optimizer.step(closure=optimizer_closure) -.. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follow. +.. note:: + To access your wrapped Optimizer from :class:`~pytorch_lightning.core.optimizer.LightningOptimizer`, do as follow. .. testcode:: python # function hook in LightningModule - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False, + ): # `optimizer` is a `LightningOptimizer` wrapping the optimizer. # To access it, do as follow: optimizer = optimizer.optimizer From 6e9bd4e85d08549832eecf8e594f0977e9ec6cf6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 15 Apr 2021 11:31:15 +0900 Subject: [PATCH 07/14] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Adrian Wälchli --- docs/source/common/optimizers.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 57098738b02e6..75ae5118ab848 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -26,9 +26,10 @@ The users are left with ``optimizer.zero_grad()``, gradient accumulation, model To manually optimize, do the following: -* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule`` ``__init__`` function. 
+* Set the ``self.automatic_optimization = False`` in your ``LightningModule``'s ``__init__``. * Use the following functions and call them manually: + * ``self.optimizers()`` to access your optimizers (one or multiple) * ``optimizer.zero_grad()`` to clear the gradients from the previous training step * ``self.manual_backward(loss)`` instead of ``loss.backward()`` * ``optimizer.step()`` to update your model parameters @@ -55,7 +56,7 @@ Here is a minimal example of manual optimization. .. warning:: Before 1.2, ``optimizer.step()`` was calling ``optimizer.zero_grad()`` internally. - From 1.2, it is left to the users expertise. + From 1.2, it is left to the user's expertise. .. tip:: * ``self.optimizers()`` will return :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` objects. You can @@ -169,14 +170,14 @@ Here is an example training a simple GAN with multiple optimizers. Learning rate scheduling [manual] --------------------------------- You can call ``lr_scheduler.step()`` at arbitrary intervals. -Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined -in your ``LightningModule.configure_optimizers()``. +Use ``self.lr_schedulers()`` in your :class:`~pytorch_lightning.LightningModule` to access any learning rate schedulers defined +in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. .. warning:: - * Before 1.3, Lightning automatically calls ``lr_scheduler.step()`` in both automatic and manual optimization. From - 1.3, ``lr_scheduler.step()`` is disabled in manual optimization so that you can call it at arbitrary intervals. + * Before 1.3, Lightning automatically called ``lr_scheduler.step()`` in both automatic and manual optimization. From + 1.3, ``lr_scheduler.step()`` is now for the user to call at arbitrary intervals. * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored even if they are provided in - your ``configure_optimizers()`` in manual optimization. + your ``configure_optimizers()`` during manual optimization. Here is an example calling ``lr_scheduler.step()`` every step. @@ -433,7 +434,7 @@ keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. ) .. note:: - Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` + Metrics can be made available to monitor by simply logging it using ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.LightningModule`. By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration From 29f70c01da70fe61de128e8267148f1ef3c32a15 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 15 Apr 2021 11:46:07 +0900 Subject: [PATCH 08/14] Add note for optimizer.optimizer --- docs/source/common/optimizers.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 75ae5118ab848..a73b80791d1e2 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -564,7 +564,10 @@ Here we add a learning rate warm-up. optimizer.step(closure=optimizer_closure) .. note:: - To access your wrapped Optimizer from :class:`~pytorch_lightning.core.optimizer.LightningOptimizer`, do as follow. + ``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer + configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. 
You can access your own + optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be + able to support accelerators and precision for you. .. testcode:: python From 5d6bb275b14a3aefd0507f66fbfb22cee1a84cb5 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:03:17 +0900 Subject: [PATCH 09/14] . --- docs/source/common/lightning_module.rst | 9 +++- docs/source/common/optimizers.rst | 72 ++++++++++++++----------- 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 95f7c0a6dcb7e..97d9fc6192710 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -916,7 +916,10 @@ True if using Automatic Mixed Precision (AMP) automatic_optimization ~~~~~~~~~~~~~~~~~~~~~~ -When set to ``False``, Lightning does not automate the optimization process. This means you are responsible for handling your optimizers. However, we do take care of precision and any accelerators used. +When set to ``False``, Lightning does not automate the optimization process. This means you are responsible for handling +your optimizers. However, we do take care of precision and any accelerators used. + +See :ref:`manual optimization` for details. .. code-block:: python @@ -931,7 +934,9 @@ When set to ``False``, Lightning does not automate the optimization process. Thi self.manual_backward(loss) opt.step() -This is recommended only if using 2+ optimizers AND if you know how to perform the optimization procedure properly. Note that automatic optimization can still be used with multiple optimizers by relying on the ``optimizer_idx`` parameter. Manual optimization is most useful for research topics like reinforcement learning, sparse coding, and GAN research. +This is recommended only if using 2+ optimizers AND if you know how to perform the optimization procedure properly. Note +that automatic optimization can still be used with multiple optimizers by relying on the ``optimizer_idx`` parameter. +Manual optimization is most useful for research topics like reinforcement learning, sparse coding, and GAN research. .. code-block:: python diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index a73b80791d1e2..98eb26187b967 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -26,7 +26,7 @@ The users are left with ``optimizer.zero_grad()``, gradient accumulation, model To manually optimize, do the following: -* Set the ``self.automatic_optimization = False`` in your ``LightningModule``'s ``__init__``. +* Set ``self.automatic_optimization=False`` in your ``LightningModule``'s ``__init__``. * Use the following functions and call them manually: * ``self.optimizers()`` to access your optimizers (one or multiple) @@ -170,8 +170,8 @@ Here is an example training a simple GAN with multiple optimizers. Learning rate scheduling [manual] --------------------------------- You can call ``lr_scheduler.step()`` at arbitrary intervals. -Use ``self.lr_schedulers()`` in your :class:`~pytorch_lightning.LightningModule` to access any learning rate schedulers defined -in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. +Use ``self.lr_schedulers()`` in your :class:`~pytorch_lightning.LightningModule` to access any learning rate schedulers +defined in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. .. 
warning:: * Before 1.3, Lightning automatically called ``lr_scheduler.step()`` in both automatic and manual optimization. From @@ -434,8 +434,8 @@ keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. ) .. note:: - Metrics can be made available to monitor by simply logging it using ``self.log('metric_to_track', metric_val)`` - in your :class:`~pytorch_lightning.LightningModule`. + Metrics can be made available to monitor by simply logging it using ``self.log('metric_to_track', metric_val)`` in + your :class:`~pytorch_lightning.LightningModule`. By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration should be returned as a dict which can contain the following keywords: @@ -474,8 +474,8 @@ should be returned as a dict which can contain the following keywords: Use multiple optimizers (like GANs) ----------------------------------- -To use multiple optimizers, return two or more optimizers from -:meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. +To use multiple optimizers (optionally with learning rate schedulers), return two or more optimizers from +:meth:`~pytorch_lightning.core.LightningModule.configure_optimizers`. .. testcode:: python @@ -485,7 +485,11 @@ To use multiple optimizers, return two or more optimizers from # two optimizers, one scheduler for adam only def configure_optimizers(self): - return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} + opt1 = Adam(...) + opt2 = SGD(...) + optimizers = [opt1, opt2] + lr_schedulers = {'scheduler': ReduceLROnPlateau(opt1, ...), 'monitor': 'metric_to_track'} + return optimizers, lr_schedulers # two optimizers, two schedulers def configure_optimizers(self): @@ -515,7 +519,12 @@ Step optimizers at arbitrary intervals To do more interesting things with your optimizers such as learning rate warm-up or odd scheduling, override the :meth:`~pytorch_lightning.LightningModule.optimizer_step` function. -For example, here step optimizer A every 2 batches and optimizer B every 4 batches. +.. warning:: + If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter to + ``optimizer.step()`` function as shown in the examples because ``training_step()``, ``optimizer.zero_grad()``, + ``backward()`` are called in the closure function. + +For example, here step optimizer A every batch and optimizer B every 2 batches. .. testcode:: python @@ -524,16 +533,18 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False, ): - # update generator opt every 2 steps + # update generator every step if optimizer_idx == 0: - if batch_idx % 2 == 0: - optimizer.step(closure=optimizer_closure) + optimizer.step(closure=optimizer_closure) - # update discriminator opt every 4 steps + # update discriminator every 2 steps if optimizer_idx == 1: - if batch_idx % 4 == 0: + if (batch_idx + 1) % 2 == 0: optimizer.step(closure=optimizer_closure) + # ... + # add as many optimizers as you want + Here we add a learning rate warm-up. .. testcode:: python @@ -552,33 +563,30 @@ Here we add a learning rate warm-up. # update params optimizer.step(closure=optimizer_closure) -.. 
note:: - The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal - :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, - gradient accumulation and much more ... +----- + +Access your own optimizer +------------------------- +``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer +configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. You can access your own optimizer +with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to +support accelerators and precision for you. .. testcode:: python # function hook in LightningModule - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False, + ): optimizer.step(closure=optimizer_closure) -.. note:: - ``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer - configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. You can access your own - optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be - able to support accelerators and precision for you. - -.. testcode:: python - - # function hook in LightningModule + # `optimizer` is a `LightningOptimizer` wrapping the optimizer. + # To access it, do the following. + # However, It won't work on TPU, AMP, etc... def optimizer_step( self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False, ): - # `optimizer` is a `LightningOptimizer` wrapping the optimizer. - # To access it, do as follow: optimizer = optimizer.optimizer - - # run step. However, it won't work on TPU, AMP, etc... optimizer.step(closure=optimizer_closure) From e0b3633e8222f9703e635d639f881a025d76c0ab Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:10:57 +0900 Subject: [PATCH 10/14] Update hooks --- pytorch_lightning/core/hooks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index b320a9b223840..9830e6ca38fa6 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -260,7 +260,7 @@ def on_predict_end(self) -> None: def on_before_zero_grad(self, optimizer: Optimizer) -> None: """ - Called after optimizer.step() and before optimizer.zero_grad(). + Called after ``training_step()`` and before ``optimizer.zero_grad()``. Called in the training loop after taking an optimizer step and before zeroing grads. Good place to inspect weight information with weights updated. @@ -268,10 +268,13 @@ def on_before_zero_grad(self, optimizer: Optimizer) -> None: This is where it is called:: for optimizer in optimizers: - optimizer.step() + out = training_step(...) + model.on_before_zero_grad(optimizer) # < ---- called here optimizer.zero_grad() + backward() + Args: optimizer: The optimizer for which grads should be zeroed. 
""" From 451233486c9bac0374dfc79992f4d5252985b1bd Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:11:23 +0900 Subject: [PATCH 11/14] Update closure docstring --- pytorch_lightning/trainer/training_loop.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 71d9407062001..8dbc41821b24a 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -651,9 +651,7 @@ def _process_closure_result(self, batch_outputs: list, opt_idx: int) -> list: return batch_outputs def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens): - """ - wrap the forward step in a closure so second order methods work - """ + """Wrap forward, zero_grad and backward in a closure so second order methods work""" with self.trainer.profiler.profile("training_step_and_backward"): # lightning module hook result = self.training_step(split_batch, batch_idx, opt_idx, hiddens) From eb9ecc3447927c3650549f603a73fae1db8f5921 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:25:11 +0900 Subject: [PATCH 12/14] Update optimizer methods --- docs/source/common/lightning_module.rst | 13 +- pytorch_lightning/core/lightning.py | 156 +++++++++++++----------- 2 files changed, 88 insertions(+), 81 deletions(-) diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 97d9fc6192710..2f61fdc47397f 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -698,6 +698,12 @@ log_dict .. automethod:: pytorch_lightning.core.lightning.LightningModule.log_dict :noindex: +manual_backward +~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward + :noindex: + print ~~~~~ @@ -1091,13 +1097,6 @@ get_progress_bar_dict .. automethod:: pytorch_lightning.core.lightning.LightningModule.get_progress_bar_dict :noindex: -manual_backward -~~~~~~~~~~~~~~~ - -.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward - :noindex: - - on_after_backward ~~~~~~~~~~~~~~~~~ diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 7efe88515b37e..46543a12a435f 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1097,28 +1097,22 @@ def configure_optimizers(self): Return: Any of these 6 options. - - Single optimizer. - - List or Tuple - List of optimizers. - - Two lists - The first list has multiple optimizers, the second a list of LR schedulers (or lr_dict). - - Dictionary, with an 'optimizer' key, and (optionally) a 'lr_scheduler' + - **Single optimizer**. + - **List or Tuple** of optimizers. + - **Two lists** - The first list has multiple optimizers, and the second has multiple LR schedulers (or + multiple lr_dict). + - **Dictionary**, with an ``"optimizer"`` key, and (optionally) a ``"lr_scheduler"`` key whose value is a single LR scheduler or lr_dict. - - Tuple of dictionaries as described, with an optional 'frequency' key. - - None - Fit will run without any optimizer. + - **Tuple of dictionaries** as described above, with an optional ``"frequency"`` key. + - **None** - Fit will run without any optimizer. Note: - The 'frequency' value is an int corresponding to the number of sequential batches - optimized with the specific optimizer. It should be given to none or to all of the optimizers. 
- There is a difference between passing multiple optimizers in a list, - and passing multiple optimizers in dictionaries with a frequency of 1: - In the former case, all optimizers will operate on the given batch in each optimization step. - In the latter, only one optimizer will operate on the given batch at every step. - - The lr_dict is a dictionary which contains the scheduler and its associated configuration. - The default configuration is shown below. + The lr_dict is a dictionary which contains the scheduler and its associated configuration. The default + configuration is shown below. .. code-block:: python - { + lr_dict = { 'scheduler': lr_scheduler, # The LR scheduler instance (required) 'interval': 'epoch', # The unit of the scheduler's step size 'frequency': 1, # The frequency of the scheduler @@ -1128,43 +1122,51 @@ def configure_optimizers(self): 'name': None, # Custom name for LearningRateMonitor to use } - Only the ``scheduler`` key is required, the rest will be set to the defaults above. + Only the ``"scheduler"`` key is required, the rest will be set to the defaults above. + + Note: + The ``"frequency"`` value is an ``int`` corresponding to the number of sequential batches optimized with the + specific optimizer. It should be given to none or to all of the optimizers. + + There is a difference between passing multiple optimizers in a list and passing multiple optimizers in + dictionaries with a frequency of 1: + In the former case, all optimizers will operate on the given batch in each optimization step. + In the latter, only one optimizer will operate on the given batch at every step. Examples:: # most cases def configure_optimizers(self): - opt = Adam(self.parameters(), lr=1e-3) - return opt + return Adam(self.parameters(), lr=1e-3) # multiple optimizer case (e.g.: GAN) def configure_optimizers(self): - generator_opt = Adam(self.model_gen.parameters(), lr=0.01) - disriminator_opt = Adam(self.model_disc.parameters(), lr=0.02) - return generator_opt, disriminator_opt + gen_opt = Adam(self.model_gen.parameters(), lr=0.01) + dis_opt = Adam(self.model_dis.parameters(), lr=0.02) + return gen_opt, dis_opt # example with learning rate schedulers def configure_optimizers(self): - generator_opt = Adam(self.model_gen.parameters(), lr=0.01) - disriminator_opt = Adam(self.model_disc.parameters(), lr=0.02) - discriminator_sched = CosineAnnealing(discriminator_opt, T_max=10) - return [generator_opt, disriminator_opt], [discriminator_sched] + gen_opt = Adam(self.model_gen.parameters(), lr=0.01) + dis_opt = Adam(self.model_dis.parameters(), lr=0.02) + dis_sch = CosineAnnealing(dis_opt, T_max=10) + return [gen_opt, dis_opt], [dis_sch] # example with step-based learning rate schedulers def configure_optimizers(self): gen_opt = Adam(self.model_gen.parameters(), lr=0.01) - dis_opt = Adam(self.model_disc.parameters(), lr=0.02) - gen_sched = {'scheduler': ExponentialLR(gen_opt, 0.99), - 'interval': 'step'} # called after each training step - dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called every epoch - return [gen_opt, dis_opt], [gen_sched, dis_sched] + dis_opt = Adam(self.model_dis.parameters(), lr=0.02) + gen_sch = {'scheduler': ExponentialLR(gen_opt, 0.99), + 'interval': 'step'} # called after each training step + dis_sch = CosineAnnealing(dis_opt, T_max=10) # called every epoch + return [gen_opt, dis_opt], [gen_sch, dis_sch] # example with optimizer frequencies # see training procedure in `Improved Training of Wasserstein GANs`, Algorithm 1 # 
https://arxiv.org/abs/1704.00028 def configure_optimizers(self): gen_opt = Adam(self.model_gen.parameters(), lr=0.01) - dis_opt = Adam(self.model_disc.parameters(), lr=0.02) + dis_opt = Adam(self.model_dis.parameters(), lr=0.02) n_critic = 5 return ( {'optimizer': dis_opt, 'frequency': n_critic}, @@ -1172,32 +1174,22 @@ def configure_optimizers(self): ) Note: - Some things to know: - - Lightning calls ``.backward()`` and ``.step()`` on each optimizer - and learning rate scheduler as needed. - - - If you use 16-bit precision (``precision=16``), Lightning will automatically - handle the optimizers for you. - - - If you use multiple optimizers, :meth:`training_step` will have an additional - ``optimizer_idx`` parameter. - - - If you use LBFGS Lightning handles the closure function automatically for you. - - - If you use multiple optimizers, gradients will be calculated only - for the parameters of current optimizer at each training step. - - - If you need to control how often those optimizers step or override the - default ``.step()`` schedule, override the :meth:`optimizer_step` hook. - - - If you only want to call a learning rate scheduler every ``x`` step or epoch, - or want to monitor a custom metric, you can specify these in a lr_dict: + - Lightning calls ``.backward()`` and ``.step()`` on each optimizer and learning rate scheduler as needed. + - If you use 16-bit precision (``precision=16``), Lightning will automatically handle the optimizers. + - If you use multiple optimizers, :meth:`training_step` will have an additional ``optimizer_idx`` parameter. + - If you use :class:`torch.optim.LBFGS`, Lightning handles the closure function automatically for you. + - If you use multiple optimizers, gradients will be calculated only for the parameters of current optimizer + at each training step. + - If you need to control how often those optimizers step or override the default ``.step()`` schedule, + override the :meth:`optimizer_step` hook. + - If you only want to call a learning rate scheduler every ``x`` step or epoch, or want to monitor a custom + metric, you can specify these in a lr_dict: .. code-block:: python - { + lr_dict = { 'scheduler': lr_scheduler, 'interval': 'step', # or 'epoch' 'monitor': 'val_f1', @@ -1210,23 +1202,24 @@ def configure_optimizers(self): def manual_backward(self, loss: Tensor, optimizer: Optional[Optimizer] = None, *args, **kwargs) -> None: """ Call this directly from your training_step when doing optimizations manually. - By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you + By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you. This function forwards all args to the .backward() call as well. - .. tip:: In manual mode we still automatically clip grads if Trainer(gradient_clip_val=x) is set + See :ref:`manual optimization` for more examples. - .. tip:: In manual mode we still automatically accumulate grad over batches if - Trainer(accumulate_grad_batches=x) is set and you use `optimizer.step()` + .. tip:: + In manual mode, we still automatically clip grads if ``Trainer(gradient_clip_val=x)`` is set. Example:: def training_step(...): - opt_a, opt_b = self.optimizers() + opt = self.optimizers() loss = ... + opt.zero_grad() # automatically applies scaling, etc... 
self.manual_backward(loss) - opt_a.step() + opt.step() """ if optimizer is not None: rank_zero_deprecation( @@ -1336,18 +1329,18 @@ def optimizer_step( Warning: If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter to ``optimizer.step()`` function as shown in the examples. This ensures that - ``train_step_and_backward_closure`` is called within + ``training_step()``, ``optimizer.zero_grad()``, ``backward()`` are called within :meth:`~pytorch_lightning.trainer.training_loop.TrainLoop.run_training_batch`. Args: epoch: Current epoch batch_idx: Index of current batch optimizer: A PyTorch optimizer - optimizer_idx: If you used multiple optimizers this indexes into that list. - optimizer_closure: closure for all optimizers - on_tpu: true if TPU backward is required - using_native_amp: True if using native amp - using_lbfgs: True if the matching optimizer is lbfgs + optimizer_idx: If you used multiple optimizers, this indexes into that list. + optimizer_closure: Closure for all optimizers + on_tpu: ``True`` if TPU backward is required + using_native_amp: ``True`` if using native amp + using_lbfgs: True if the matching optimizer is :class:`torch.optim.LBFGS` Examples:: @@ -1359,22 +1352,18 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, # Alternating schedule for optimizer steps (i.e.: GANs) def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): - # update generator opt every 2 steps + # update generator opt every step if optimizer_idx == 0: - if batch_idx % 2 == 0 : - optimizer.step(closure=optimizer_closure) - optimizer.zero_grad() + optimizer.step(closure=optimizer_closure) - # update discriminator opt every 4 steps + # update discriminator opt every 2 steps if optimizer_idx == 1: - if batch_idx % 4 == 0 : + if (batch_idx + 1) % 2 == 0 : optimizer.step(closure=optimizer_closure) - optimizer.zero_grad() # ... # add as many optimizers as you want - Here's another example showing how to use this for more advanced things such as learning rate warm-up: @@ -1391,7 +1380,6 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, # update params optimizer.step(closure=optimizer_closure) - optimizer.zero_grad() """ if not isinstance(optimizer, LightningOptimizer): @@ -1400,6 +1388,26 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer.step(closure=optimizer_closure) def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int): + """Override this method to change the default behaviour of ``optimizer.zero_grad()``. + + Args: + epoch: Current epoch + batch_idx: Index of current batch + optimizer: A PyTorch optimizer + optimizer_idx: If you used multiple optimizers this indexes into that list. + + Examples:: + + # DEFAULT + def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx): + optimizer.zero_grad() + + # Set gradients to `None` instead of zero to improve performance. + def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx): + optimizer.zero_grad(set_to_none=True) + + See :meth:`torch.optim.Optimizer.zero_grad` for the explanation of the above example. 
+ """ optimizer.zero_grad() def tbptt_split_batch(self, batch: Tensor, split_size: int) -> list: From 715bb2c6f84376d8d99a00743b075a10f7760194 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:33:20 +0900 Subject: [PATCH 13/14] Update optimizer --- docs/source/common/optimizers.rst | 33 +++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 98eb26187b967..d9b8d25911009 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -59,11 +59,8 @@ Here is a minimal example of manual optimization. From 1.2, it is left to the user's expertise. .. tip:: - * ``self.optimizers()`` will return :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` objects. You can - access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, - Lightning won't be able to support accelerators and precision for you. - * Be careful where you call ``optimizer.zero_grad()``, or your model won't converge. - It is good practice to call ``optimizer.zero_grad()`` before ``self.manual_backward(loss)``. + Be careful where you call ``optimizer.zero_grad()``, or your model won't converge. + It is good practice to call ``optimizer.zero_grad()`` before ``self.manual_backward(loss)``. ----- @@ -339,6 +336,30 @@ Here is an example using a closure function. ------ +Access your own optimizer [manual] +---------------------------------- +``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer +configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. You can access your own optimizer +with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to +support accelerators and precision for you. + +.. testcode:: python + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(batch, batch_idx): + optimizer = self.optimizers() + + # `optimizer` is a `LightningOptimizer` wrapping the optimizer. + # To access it, do the following. + # However, it won't work on TPU, AMP, etc... + optimizer = optimizer.optimizer + ... + +----- + Automatic optimization ====================== With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` @@ -583,7 +604,7 @@ support accelerators and precision for you. # `optimizer` is a `LightningOptimizer` wrapping the optimizer. # To access it, do the following. - # However, It won't work on TPU, AMP, etc... + # However, it won't work on TPU, AMP, etc... def optimizer_step( self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False, From 23803e35c1ef90d8d3e9edfa7aac1d36318ffacc Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 10:12:10 +0900 Subject: [PATCH 14/14] Remove manopt + grad clipping (by @flukeskywalker) --- pytorch_lightning/core/lightning.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 46543a12a435f..54ea9d1bdb77e 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1208,9 +1208,6 @@ def manual_backward(self, loss: Tensor, optimizer: Optional[Optimizer] = None, * See :ref:`manual optimization` for more examples. - .. 
tip:: - In manual mode, we still automatically clip grads if ``Trainer(gradient_clip_val=x)`` is set. - Example:: def training_step(...):
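
If gradient clipping should happen explicitly in manual optimization instead of relying on ``Trainer(gradient_clip_val=x)``, it can be applied between ``manual_backward`` and ``step``. A minimal sketch; ``compute_loss`` is the same placeholder used throughout these docs and the clipping threshold is arbitrary:

.. code-block:: python

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        loss = self.compute_loss(batch)

        opt.zero_grad()
        self.manual_backward(loss)
        # clip gradients by hand before stepping the optimizer
        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=0.5)
        opt.step()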