From 8058a81404bbc46e51e070088f3468287c36ca9f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 9 Apr 2021 07:32:22 +0900 Subject: [PATCH 01/14] . --- docs/source/common/optimizers.rst | 506 ++++++++++++++++-------------- 1 file changed, 277 insertions(+), 229 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 422302ea8987e..3aa843daea1b8 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -3,14 +3,12 @@ ************ Optimization ************ - Lightning offers two modes for managing the optimization process: -- automatic optimization (AutoOpt) +- automatic optimization - manual optimization -For the majority of research cases, **automatic optimization** will do the right thing for you and it is what -most users should use. +For the majority of research cases, **automatic optimization** will do the right thing for you and it is what most users should use. For advanced/expert users who want to do esoteric optimization schedules or techniques, use **manual optimization**. @@ -18,223 +16,309 @@ For advanced/expert users who want to do esoteric optimization schedules or tech Manual optimization =================== -For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable -to manually manage the optimization process. To do so, do the following: +For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable to manually manage the optimization process. + +This is only recommended for experts who need ultimate flexibility. +Lightning will handle only precision and accelerators logic. +The users are left with ``optimizer.zero_grad()``, gradient accumulation, model toggling, etc.. -* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule`` ``__init__`` function +To manually optimize, do the following: + +* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule``'s ``__init__`` function. * Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. +Here is a basic example of manual optimization. + .. testcode:: python - from pytorch_lightning import LightningModule + from pytorch_lightning import LightningModule + + class MyModel(LightningModule): - class MyModel(LightningModule): + def __init__(self): + super().__init__() + # Important: This property activates manual optimization. + self.automatic_optimization = False - def __init__(self): - super().__init__() - # Important: This property activate ``manual optimization`` for your model - self.automatic_optimization = False + def training_step(batch, batch_idx): + opt = self.optimizers() - def training_step(batch, batch_idx): - opt = self.optimizers() - loss = self.compute_loss(batch) - self.manual_backward(loss) + loss = self.compute_loss(batch) -.. note:: This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. The users are left with ``optimizer.zero_grad()``, gradient accumulation, model toggling, etc.. + opt.zero_grad() + self.manual_backward(loss) + opt.step() -.. warning:: Before 1.2, ``optimzer.step`` was calling ``optimizer.zero_grad()`` internally. From 1.2, it is left to the users expertise. -.. tip:: To perform ``accumulate_grad_batches`` with one optimizer, you can do as such. +.. warning:: Before 1.2, ``optimzer.step()`` was calling ``optimizer.zero_grad()`` internally. 
From 1.2, it is left to the users expertise. .. tip:: ``self.optimizers()`` will return ``LightningOptimizer`` objects. You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to support accelerators and precision for you. -.. code-block:: python +.. tip:: Be careful where you call ``zero_grad`` or your model won't converge. It is good pratice to call ``zero_grad`` before ``manual_backward``. - def __init__(self): - self.automatic_optimization = False - def training_step(self, batch, batch_idx): - opt = self.optimizers() +Learning rate scheduling +------------------------ +From 1.3, Lightning supports learning rate scheduling in manual optimization. +``lr_scheduler.step()`` is disabled in manual optimization so that you can call it at arbitrary intervals. Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined in ``LightningModule.configure_optimizers()``. - loss = self.compute_loss(batch) - self.manual_backward(loss) +.. deprecated:: Before 1.3, ``lr_scheduler.step`` was called automatically in both manual and automatic optimization. - # accumulate gradient batches - if batch_idx % 2 == 0: - opt.step() - opt.zero_grad() +.. warning:: The learning rate scheduler keys are ignored in manual optimization. -.. tip:: It is a good practice to provide the optimizer with a ``closure`` function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure. See also `the PyTorch docs `_. +.. testcode:: python + + # step every batch + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx): + # do foward, backward, and optimization + ... -Here is the same example as above using a ``closure``. + # single scheduler + sch = self.lr_schedulers() + sch.step() + + # multiple schedulers + sch1, sch2 = self.lr_schedulers() + sch1.step() + sch2.step() .. testcode:: python - def __init__(self): - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - opt = self.optimizers() + def training_step(self, batch, batch_idx): + # do forward, backward, and optimization + ... - def closure(): - # Only zero_grad on the first batch to accumulate gradients - is_first_batch_to_accumulate = batch_idx % 2 == 0 - if is_first_batch_to_accumulate: - opt.zero_grad() + sch = self.lr_schedulers() - loss = self.compute_loss(batch) - self.manual_backward(loss) - return loss + # step every `n` batches + if (batch_idx + 1) % n == 0: + sch.step() - opt.step(closure=closure) + # step every `n` epochs + if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: + sch.step() -.. tip:: Be careful where you call ``zero_grad`` or your model won't converge. It is good pratice to call ``zero_grad`` before ``manual_backward``. + +Gradient accumulation +--------------------- +You can accumulate gradients over batches similarly to :attr:`~pytorch_lightning.trainer.Trainer.accumulate_grad_batches` of automatic optimization. +To perform gradient accumulation with one optimizer, you can do as such. + +.. 
testcode:: python + + # accumulate gradients over 2 batches + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + + loss = self.compute_loss(batch) + self.manual_backward(loss) + + # accumulate gradients of 2 batches + if (batch_idx + 1) % 2 == 0: + opt.step() + opt.zero_grad() + + +Use closure for LBFGS-like optimizers +------------------------------------- +It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. +It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure such as :class:`torch.optim.LBFGS`. +See `the PyTorch docs `_ for more details about the closure. + +Here is an example using a closure function. + +.. testcode:: python + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def configure_optimizers(self): + return torch.optim.LBFGS(...) + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + + def closure(): + loss = self.compute_loss(batch) + opt.zero_grad() + self.manual_backward(loss) + return loss + + opt.step(closure=closure) + + +Multiple optimizers +------------------- .. testcode:: python - import torch - from torch import Tensor - from pytorch_lightning import LightningModule + import torch + from torch import Tensor + from pytorch_lightning import LightningModule + + class SimpleGAN(LightningModule): - class SimpleGAN(LightningModule): + def __init__(self): + super().__init__() + self.G = Generator() + self.D = Discriminator() - def __init__(self): - super().__init__() - self.G = Generator() - self.D = Discriminator() + # Important: This property activates manual optimization. 
+ self.automatic_optimization = False - # Important: This property activate ``manual optimization`` for this model - self.automatic_optimization = False + def sample_z(self, n) -> Tensor: + sample = self._Z.sample((n,)) + return sample - def sample_z(self, n) -> Tensor: - sample = self._Z.sample((n,)) - return sample + def sample_G(self, n) -> Tensor: + z = self.sample_z(n) + return self.G(z) - def sample_G(self, n) -> Tensor: - z = self.sample_z(n) - return self.G(z) + def training_step(self, batch, batch_idx): + # Implementation follows the PyTorch tutorial: + # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + g_opt, d_opt = self.optimizers() - def training_step(self, batch, batch_idx): - # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() + X, _ = batch + batch_size = X.shape[0] - X, _ = batch - batch_size = X.shape[0] + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) + g_X = self.sample_G(batch_size) - g_X = self.sample_G(batch_size) + ########################### + # Optimize Discriminator # + ########################### + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - ########################### - # Optimize Discriminator # - ########################### - d_opt.zero_grad() - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + errD = (errD_real + errD_fake) - errD = (errD_real + errD_fake) + d_opt.zero_grad() + self.manual_backward(errD) + d_opt.step() - self.manual_backward(errD) - d_opt.step() + ####################### + # Optimize Generator # + ####################### + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - ####################### - # Optimize Generator # - ####################### - g_opt.zero_grad() + g_opt.zero_grad() + self.manual_backward(errG) + g_opt.step() - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - self.manual_backward(errG) - g_opt.step() + def configure_optimizers(self): + g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) + d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) + return g_opt, d_opt - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - def configure_optimizers(self): - g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) - d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) - return g_opt, d_opt +Toggle models for faster training +--------------------------------- +If you're trying to: -.. note:: ``LightningOptimizer`` provides a ``toggle_model`` function as a ``@context_manager`` for advanced users. It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. +* use multiple optimizers +* accumulate gradients over batches + +:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a :func:`contextlib.contextmanager` for advanced users. +It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. 
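+
+As a minimal sketch (assuming two optimizers ``opt_a`` and ``opt_b`` returned from ``configure_optimizers``, and a hypothetical ``compute_loss_a`` helper), the context manager wraps the backward pass and step of the optimizer it belongs to:
+
+.. testcode:: python
+
+    def training_step(self, batch, batch_idx):
+        opt_a, opt_b = self.optimizers()
+
+        # Parameters owned only by `opt_b` get `requires_grad=False` inside this block,
+        # so no gradients are computed for them during this backward pass.
+        # Their `requires_grad` state is restored when the block exits.
+        with opt_a.toggle_model():
+            loss_a = self.compute_loss_a(batch)
+            self.manual_backward(loss_a)
+            opt_a.step()
+            opt_a.zero_grad()
+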
Here is an explanation of what it does: -Considering the current optimizer as A and all other optimizers as B. -Toggling means that all parameters from B exclusive to A will have their ``requires_grad`` attribute set to ``False``. Their original state will be restored when exiting the context manager. +* Considering the current optimizer as A and all other optimizers as B. +* Toggling means that all parameters from B exclusive to A will have their ``requires_grad`` attribute set to ``False``. +* Their original state will be restored when exiting the context manager. When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase. Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. - Here is an example for advanced use-case. .. testcode:: python - # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. + # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. - class SimpleGAN(LightningModule): + class SimpleGAN(LightningModule): - ... + def __init__(self): + super().__init__() + self.automatic_optimization = False - def __init__(self): - self.automatic_optimization = False + def training_step(self, batch, batch_idx): + # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + g_opt, d_opt = self.optimizers() - def training_step(self, batch, batch_idx): - # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() + X, _ = batch + X.requires_grad = True + batch_size = X.shape[0] - X, _ = batch - X.requires_grad = True - batch_size = X.shape[0] + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) + # Sync and clear gradients only at the end of accumulation. 
+ is_last_batch_to_accumulate = (batch_idx + 1) % 2 == 0 - accumulated_grad_batches = batch_idx % 2 == 0 + g_X = self.sample_G(batch_size) - g_X = self.sample_G(batch_size) + ########################### + # Optimize Discriminator # + ########################### + with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - ########################### - # Optimize Discriminator # - ########################### - with d_opt.toggle_model(sync_grad=accumulated_grad_batches): - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + errD = (errD_real + errD_fake) - errD = (errD_real + errD_fake) + self.manual_backward(errD) + if is_last_batch_to_accumulate: + d_opt.step() + d_opt.zero_grad() - self.manual_backward(errD) - if accumulated_grad_batches: - d_opt.step() - d_opt.zero_grad() + ####################### + # Optimize Generator # + ####################### + with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - ####################### - # Optimize Generator # - ####################### - with g_opt.toggle_model(sync_grad=accumulated_grad_batches): - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + self.manual_backward(errG) + if is_last_batch_to_accumulate: + g_opt.step() + g_opt.zero_grad() - self.manual_backward(errG) - if accumulated_grad_batches: - g_opt.step() - g_opt.zero_grad() + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) ------ Automatic optimization ====================== -With Lightning most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` -since Lightning automates that for you. +With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` since Lightning automates that for you. .. warning:: Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. @@ -273,27 +357,27 @@ In the case of multiple optimizers, Lightning does the following: Learning rate scheduling ------------------------ Every optimizer you use can be paired with any `Learning Rate Scheduler `_. -In the basic use-case, the scheduler (or multiple schedulers) should be returned as the second output from the ``.configure_optimizers`` method: +In the basic use-case, the scheduler(s) should be returned as the second output from the :meth:`~pytorch_lightning.core.LightningModule.configure_optimizers` method: .. testcode:: # no LR scheduler def configure_optimizers(self): - return Adam(...) + return Adam(...) # Adam + LR scheduler def configure_optimizers(self): - optimizer = Adam(...) - scheduler = LambdaLR(optimizer, ...) - return [optimizer], [scheduler] + optimizer = Adam(...) + scheduler = LambdaLR(optimizer, ...) + return [optimizer], [scheduler] # Two optimizers each with a scheduler def configure_optimizers(self): - optimizer1 = Adam(...) - optimizer2 = SGD(...) - scheduler1 = LambdaLR(optimizer1, ...) - scheduler2 = LambdaLR(optimizer2, ...) - return [optimizer1, optimizer2], [scheduler1, scheduler2] + optimizer1 = Adam(...) + optimizer2 = SGD(...) + scheduler1 = LambdaLR(optimizer1, ...) + scheduler2 = LambdaLR(optimizer2, ...) 
+ return [optimizer1, optimizer2], [scheduler1, scheduler2] When there are schedulers in which the ``.step()`` method is conditioned on a metric value (for example the :class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler), Lightning requires that the output @@ -304,11 +388,12 @@ set to metric that the scheduler should be conditioned on. # The ReduceLROnPlateau scheduler requires a monitor def configure_optimizers(self): - return { - 'optimizer': Adam(...), - 'lr_scheduler': ReduceLROnPlateau(optimizer, ...), - 'monitor': 'metric_to_track' - } + optimizer = Adam(...) + return { + 'optimizer': optimizer, + 'lr_scheduler': ReduceLROnPlateau(optimizer, ...), + 'monitor': 'metric_to_track', + } # In the case of two optimizers, only one using the ReduceLROnPlateau scheduler def configure_optimizers(self): @@ -321,12 +406,10 @@ set to metric that the scheduler should be conditioned on. {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, ) -.. note:: - Metrics can be made availble to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` - in your lightning module. +.. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your lightning module. -By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration should be -returned as a dict which can contain the following keywords: +By default, all schedulers will be called after each epoch ends. +To change this behaviour, a scheduler configuration should be returned as a dict which can contain the following keywords: * ``scheduler`` (required): the actual scheduler object * ``monitor`` (optional): metric to condition @@ -362,21 +445,21 @@ returned as a dict which can contain the following keywords: Use multiple optimizers (like GANs) ----------------------------------- -To use multiple optimizers return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers` +To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. .. testcode:: # one optimizer def configure_optimizers(self): - return Adam(...) + return Adam(...) # two optimizers, no schedulers def configure_optimizers(self): - return Adam(...), SGD(...) + return Adam(...), SGD(...) # Two optimizers, one scheduler for adam only def configure_optimizers(self): - return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} + return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} Lightning will call each optimizer sequentially: @@ -390,8 +473,8 @@ Lightning will call each optimizer sequentially: loss.backward() opt.step() - for lr_scheduler in lr_schedulers: - lr_scheduler.step() + for lr_scheduler in lr_schedulers: + lr_scheduler.step() ---------- @@ -404,86 +487,51 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch .. 
testcode:: - def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): - optimizer.zero_grad() - - # Alternating schedule for optimizer steps (ie: GANs) - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # update generator opt every 2 steps - if optimizer_idx == 0: - if batch_nb % 2 == 0 : - optimizer.step(closure=closure) + # Alternating schedule for optimizer steps (ie: GANs) + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # update generator opt every 2 steps + if optimizer_idx == 0: + if batch_nb % 2 == 0: + optimizer.step(closure=closure) - # update discriminator opt every 4 steps - if optimizer_idx == 1: - if batch_nb % 4 == 0 : + # update discriminator opt every 4 steps + if optimizer_idx == 1: + if batch_nb % 4 == 0: optimizer.step(closure=closure) Here we add a learning-rate warm up .. testcode:: - # learning rate warm-up - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # warm up lr - if self.trainer.global_step < 500: - lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) - for pg in optimizer.param_groups: - pg['lr'] = lr_scale * self.hparams.learning_rate + # learning rate warm-up + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # warm up lr + if self.trainer.global_step < 500: + lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) + for pg in optimizer.param_groups: + pg['lr'] = lr_scale * self.hparams.learning_rate - # update params - optimizer.step(closure=closure) + # update params + optimizer.step(closure=optimizer_closure) .. note:: The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches and much more ... .. testcode:: - # function hook in LightningModule - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - optimizer.step(closure=closure) + # function hook in LightningModule + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + optimizer.step(closure=optimizer_closure) .. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follow. .. testcode:: - # function hook in LightningModule - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - - # `optimizer is a ``LightningOptimizer`` wrapping the optimizer. - # To access it, do as follow: - optimizer = optimizer.optimizer - - # run step. However, it won't work on TPU, AMP, etc... - optimizer.step(closure=closure) - - ----------- - -Using the closure functions for optimization --------------------------------------------- - -When using optimization schemes such as LBFGS, the `second_order_closure` needs to be enabled. By default, this function is defined by wrapping the `training_step` and the backward steps as follows - -.. warning:: - Before 1.2.2, ``.zero_grad()`` was called outside the closure internally. 
- From 1.2.2, the closure calls ``.zero_grad()`` inside, so there is no need to define your own closure - when using similar optimizers to :class:`torch.optim.LBFGS` which requires reevaluation of the loss with the closure in ``optimizer.step()``. - -.. testcode:: - - def second_order_closure(pl_module, split_batch, batch_idx, opt_idx, optimizer, hidden): - # Model training step on a given batch - result = pl_module.training_step(split_batch, batch_idx, opt_idx, hidden) - - # Model backward pass - pl_module.backward(result, optimizer, opt_idx) - - # on_after_backward callback - pl_module.on_after_backward(result.training_step_output, batch_idx, result.loss) + # function hook in LightningModule + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - return result + # `optimizer` is a `LightningOptimizer` wrapping the optimizer. + # To access it, do as follow: + optimizer = optimizer.optimizer - # This default `second_order_closure` function can be enabled by passing it directly into the `optimizer.step` - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, second_order_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # update params - optimizer.step(second_order_closure) + # run step. However, it won't work on TPU, AMP, etc... + optimizer.step(closure=optimizer_closure) From 4477f46ee9fe002339d9a05f18470ebffb752d09 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Apr 2021 09:04:00 +0900 Subject: [PATCH 02/14] . --- docs/source/common/optimizers.rst | 234 ++++++++++++++++-------------- 1 file changed, 124 insertions(+), 110 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 3aa843daea1b8..c94bd33e81d89 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -12,7 +12,7 @@ For the majority of research cases, **automatic optimization** will do the right For advanced/expert users who want to do esoteric optimization schedules or techniques, use **manual optimization**. ------- +----- Manual optimization =================== @@ -27,7 +27,7 @@ To manually optimize, do the following: * Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule``'s ``__init__`` function. * Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. -Here is a basic example of manual optimization. +Here is a minimal example of manual optimization. .. testcode:: python @@ -49,22 +49,30 @@ Here is a basic example of manual optimization. self.manual_backward(loss) opt.step() +.. warning:: + Before 1.2, ``optimizer.step()`` was calling ``optimizer.zero_grad()`` internally. + From 1.2, it is left to the users expertise. -.. warning:: Before 1.2, ``optimzer.step()`` was calling ``optimizer.zero_grad()`` internally. From 1.2, it is left to the users expertise. - -.. tip:: ``self.optimizers()`` will return ``LightningOptimizer`` objects. You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to support accelerators and precision for you. - -.. tip:: Be careful where you call ``zero_grad`` or your model won't converge. It is good pratice to call ``zero_grad`` before ``manual_backward``. +.. tip:: + * ``self.optimizers()`` will return :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` objects. You can + access your own optimizer with ``optimizer.optimizer``. 
However, if you use your own optimizer to perform a step, + Lightning won't be able to support accelerators and precision for you. + * Be careful where you call ``zero_grad``, or your model won't converge. + It is good practice to call ``zero_grad`` before ``manual_backward``. +----- -Learning rate scheduling ------------------------- -From 1.3, Lightning supports learning rate scheduling in manual optimization. -``lr_scheduler.step()`` is disabled in manual optimization so that you can call it at arbitrary intervals. Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined in ``LightningModule.configure_optimizers()``. +Learning rate scheduling [manual] +--------------------------------- +You can obtain learning schedulers to call ``lr_scheduler.step()`` at arbitrary intervals. +Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined in ``LightningModule.configure_optimizers()``. -.. deprecated:: Before 1.3, ``lr_scheduler.step`` was called automatically in both manual and automatic optimization. +.. warning:: + * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored. + * Before 1.3, ``lr_scheduler.step()`` was automatically called in manual optimization. + From 1.3, ``lr_scheduler.step()`` is disabled so that you can call it at arbitrary intervals. -.. warning:: The learning rate scheduler keys are ignored in manual optimization. +Here is a example calling ``step()`` every step. .. testcode:: python @@ -87,6 +95,8 @@ From 1.3, Lightning supports learning rate scheduling in manual optimization. sch1.step() sch2.step() +If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the following. + .. testcode:: python def __init__(self): @@ -107,6 +117,7 @@ From 1.3, Lightning supports learning rate scheduling in manual optimization. if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: sch.step() +----- Gradient accumulation --------------------- @@ -132,38 +143,10 @@ To perform gradient accumulation with one optimizer, you can do as such. opt.step() opt.zero_grad() +----- -Use closure for LBFGS-like optimizers -------------------------------------- -It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. -It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure such as :class:`torch.optim.LBFGS`. -See `the PyTorch docs `_ for more details about the closure. - -Here is an example using a closure function. - -.. testcode:: python - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def configure_optimizers(self): - return torch.optim.LBFGS(...) - - def training_step(self, batch, batch_idx): - opt = self.optimizers() - - def closure(): - loss = self.compute_loss(batch) - opt.zero_grad() - self.manual_backward(loss) - return loss - - opt.step(closure=closure) - - -Multiple optimizers -------------------- +Use multiple optimizers [manual] +-------------------------------- .. 
testcode:: python @@ -202,9 +185,9 @@ Multiple optimizers g_X = self.sample_G(batch_size) - ########################### - # Optimize Discriminator # - ########################### + ########################## + # Optimize Discriminator # + ########################## d_x = self.D(X) errD_real = self.criterion(d_x, real_label) @@ -217,9 +200,9 @@ Multiple optimizers self.manual_backward(errD) d_opt.step() - ####################### - # Optimize Generator # - ####################### + ###################### + # Optimize Generator # + ###################### d_z = self.D(g_X) errG = self.criterion(d_z, real_label) @@ -234,16 +217,12 @@ Multiple optimizers d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) return g_opt, d_opt +----- -Toggle models for faster training ---------------------------------- -If you're trying to: - -* use multiple optimizers -* accumulate gradients over batches - -:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a :func:`contextlib.contextmanager` for advanced users. -It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. +Improve training time with model toggling +----------------------------------------- +Toggling models can improve your training speed when performing gradient accumulation with multiple optimizers +in a distributed setting. Here is an explanation of what it does: @@ -254,12 +233,13 @@ Here is an explanation of what it does: When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase. Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. +:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a :func:`contextlib.contextmanager` for advanced users. + Here is an example for advanced use-case. .. testcode:: python # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. - class SimpleGAN(LightningModule): def __init__(self): @@ -267,7 +247,8 @@ Here is an example for advanced use-case. self.automatic_optimization = False def training_step(self, batch, batch_idx): - # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + # Implementation follows the PyTorch tutorial: + # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html g_opt, d_opt = self.optimizers() X, _ = batch @@ -282,9 +263,9 @@ Here is an example for advanced use-case. g_X = self.sample_G(batch_size) - ########################### - # Optimize Discriminator # - ########################### + ########################## + # Optimize Discriminator # + ########################## with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): d_x = self.D(X) errD_real = self.criterion(d_x, real_label) @@ -299,9 +280,9 @@ Here is an example for advanced use-case. d_opt.step() d_opt.zero_grad() - ####################### - # Optimize Generator # - ####################### + ###################### + # Optimize Generator # + ###################### with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): d_z = self.D(g_X) errG = self.criterion(d_z, real_label) @@ -313,6 +294,35 @@ Here is an example for advanced use-case. 
self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) +----- + +Use closure for LBFGS-like optimizers +------------------------------------- +It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. +It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure such as :class:`torch.optim.LBFGS`. +See `the PyTorch docs `_ for more details about the closure. + +Here is an example using a closure function. + +.. testcode:: python + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def configure_optimizers(self): + return torch.optim.LBFGS(...) + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + + def closure(): + loss = self.compute_loss(batch) + opt.zero_grad() + self.manual_backward(loss) + return loss + + opt.step(closure=closure) ------ @@ -320,11 +330,7 @@ Automatic optimization ====================== With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` since Lightning automates that for you. -.. warning:: - Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. - From 1.2.2, Lightning calls ``.zero_grad()`` before ``.backward()``. - -Under the hood Lightning does the following: +Under the hood, Lightning does the following: .. code-block:: python @@ -353,11 +359,16 @@ In the case of multiple optimizers, Lightning does the following: for lr_scheduler in lr_schedulers: lr_scheduler.step() +.. warning:: + Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. + From 1.2.2, Lightning calls ``.zero_grad()`` before ``.backward()``. + +----- Learning rate scheduling ------------------------ Every optimizer you use can be paired with any `Learning Rate Scheduler `_. -In the basic use-case, the scheduler(s) should be returned as the second output from the :meth:`~pytorch_lightning.core.LightningModule.configure_optimizers` method: +In the basic use-case, the scheduler(s) should be returned as the second output from the :meth:`~pytorch_lightning.LightningModule.configure_optimizers` method: .. testcode:: @@ -381,8 +392,8 @@ In the basic use-case, the scheduler(s) should be returned as the second output When there are schedulers in which the ``.step()`` method is conditioned on a metric value (for example the :class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler), Lightning requires that the output -from ``configure_optimizers`` should be dicts, one for each optimizer, with the keyword ``monitor`` -set to metric that the scheduler should be conditioned on. +from :meth:`~pytorch_lightning.LightningModule.configure_optimizers` should be dicts, one for each optimizer, +with the keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. .. testcode:: @@ -406,24 +417,25 @@ set to metric that the scheduler should be conditioned on. {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, ) -.. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your lightning module. +.. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.LightningModule`. By default, all schedulers will be called after each epoch ends. 
-To change this behaviour, a scheduler configuration should be returned as a dict which can contain the following keywords: +To change this behaviour, +a scheduler configuration should be returned as a dict which can contain the following keywords: -* ``scheduler`` (required): the actual scheduler object -* ``monitor`` (optional): metric to condition -* ``interval`` (optional): either ``epoch`` (default) for stepping after each epoch ends or ``step`` for stepping +* ``"scheduler"`` (required): the actual scheduler object +* ``"monitor"`` (optional): metric to condition +* ``"interval"`` (optional): either ``epoch`` (default) for stepping after each epoch ends or ``step`` for stepping after each optimization step -* ``frequency`` (optional): how many epochs/steps should pass between calls to ``scheduler.step()``. Default is 1, +* ``"frequency"`` (optional): how many epochs/steps should pass between calls to ``scheduler.step()``. Default is 1, corresponding to updating the learning rate after every epoch/step. -* ``strict`` (optional): if set to ``True`` will enforce that value specified in ``monitor`` is available while trying - to call ``scheduler.step()``, and stop training if not found. If ``False`` will only give a warning and continue training - (without calling the scheduler). -* ``name`` (optional): if using the :class:`~pytorch_lightning.callbacks.LearningRateMonitor` callback to monitor the - learning rate progress, this keyword can be used to specify a specific name the learning rate should be logged as. +* ``"strict"`` (optional): if set to ``True`` will enforce that value specified in ``monitor`` is available while trying + to call ``scheduler.step()``, and stop training if not found. If ``False`` will only give a warning and continue + training without calling the scheduler. +* ``"name"`` (optional): if using the :class:`~pytorch_lightning.callbacks.LearningRateMonitor` callback to monitor the + learning rate progress, this keyword can be used to specify a name the learning rate should be logged as. -.. testcode:: +.. testcode:: python # Same as the above example with additional params passed to the first scheduler # In this case the ReduceLROnPlateau will step after every 10 processed batches @@ -441,27 +453,29 @@ To change this behaviour, a scheduler configuration should be returned as a dict ] return optimizers, schedulers ----------- +----- -Use multiple optimizers (like GANs) ------------------------------------ +Multiple optimizers (e.g. GANs) +------------------------------- To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. -.. testcode:: - - # one optimizer - def configure_optimizers(self): - return Adam(...) +.. testcode:: python # two optimizers, no schedulers def configure_optimizers(self): return Adam(...), SGD(...) - # Two optimizers, one scheduler for adam only + # two optimizers, one scheduler for adam only def configure_optimizers(self): return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} -Lightning will call each optimizer sequentially: + # two optimizers, two schedulers + def configure_optimizers(self): + opt1 = Adam(...) + opt2 = SGD(...) + return [opt1, opt2], [StepLR(opt1, ...), OneCycleLR(opt2, ...)] + +Under the hood, Lightning will call each optimizer sequentially: .. 
code-block:: python @@ -476,36 +490,36 @@ Lightning will call each optimizer sequentially: for lr_scheduler in lr_schedulers: lr_scheduler.step() ----------- +----- Step optimizers at arbitrary intervals -------------------------------------- To do more interesting things with your optimizers such as learning rate warm-up or odd scheduling, -override the :meth:`optimizer_step` function. +override the :meth:`~pytorch_lightning.LightningModule.optimizer_step` function. -For example, here step optimizer A every 2 batches and optimizer B every 4 batches +For example, here step optimizer A every 2 batches and optimizer B every 4 batches. .. testcode:: - # Alternating schedule for optimizer steps (ie: GANs) + # Alternating schedule for optimizer steps (e.g. GANs) def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): # update generator opt every 2 steps if optimizer_idx == 0: - if batch_nb % 2 == 0: - optimizer.step(closure=closure) + if batch_idx % 2 == 0: + optimizer.step(closure=optimizer_closure) # update discriminator opt every 4 steps if optimizer_idx == 1: - if batch_nb % 4 == 0: - optimizer.step(closure=closure) + if batch_idx % 4 == 0: + optimizer.step(closure=optimizer_closure) -Here we add a learning-rate warm up +Here we add a learning rate warm-up. .. testcode:: # learning rate warm-up def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # warm up lr + # skip the first 500 steps if self.trainer.global_step < 500: lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) for pg in optimizer.param_groups: @@ -514,9 +528,9 @@ Here we add a learning-rate warm up # update params optimizer.step(closure=optimizer_closure) -.. note:: The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches and much more ... +.. note:: The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, gradient accumulation and much more ... -.. testcode:: +.. testcode:: python # function hook in LightningModule def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): @@ -524,7 +538,7 @@ Here we add a learning-rate warm up .. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follow. -.. testcode:: +.. testcode:: python # function hook in LightningModule def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): From 05b6304ab898bd72e537e916c1a8658c7efb38a1 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Apr 2021 09:26:01 +0900 Subject: [PATCH 03/14] Fix link to the section --- docs/source/common/optimizers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index c94bd33e81d89..ee3c405cffa1c 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -455,7 +455,7 @@ a scheduler configuration should be returned as a dict which can contain the fol ----- -Multiple optimizers (e.g. 
GANs) +Use multiple optimizers (like GANs) ------------------------------- To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. From cf3074161647fe6f91b39664dba48fae9d07f4ff Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Apr 2021 09:29:11 +0900 Subject: [PATCH 04/14] Fix link to the section --- docs/source/common/optimizers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index ee3c405cffa1c..94f77837c1e08 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -456,7 +456,7 @@ a scheduler configuration should be returned as a dict which can contain the fol ----- Use multiple optimizers (like GANs) -------------------------------- +----------------------------------- To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. .. testcode:: python From 1627da4d6ab4e7202203d42716d805c4fd3dcf3b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Apr 2021 09:42:09 +0900 Subject: [PATCH 05/14] Consistent indent --- docs/source/common/optimizers.rst | 473 +++++++++++++++--------------- 1 file changed, 236 insertions(+), 237 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 94f77837c1e08..39b56666900f4 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -31,23 +31,23 @@ Here is a minimal example of manual optimization. .. testcode:: python - from pytorch_lightning import LightningModule + from pytorch_lightning import LightningModule - class MyModel(LightningModule): + class MyModel(LightningModule): - def __init__(self): - super().__init__() - # Important: This property activates manual optimization. - self.automatic_optimization = False + def __init__(self): + super().__init__() + # Important: This property activates manual optimization. + self.automatic_optimization = False - def training_step(batch, batch_idx): - opt = self.optimizers() + def training_step(batch, batch_idx): + opt = self.optimizers() - loss = self.compute_loss(batch) + loss = self.compute_loss(batch) - opt.zero_grad() - self.manual_backward(loss) - opt.step() + opt.zero_grad() + self.manual_backward(loss) + opt.step() .. warning:: Before 1.2, ``optimizer.step()`` was calling ``optimizer.zero_grad()`` internally. @@ -76,46 +76,46 @@ Here is a example calling ``step()`` every step. .. testcode:: python - # step every batch + # step every batch - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - # do foward, backward, and optimization - ... + def training_step(self, batch, batch_idx): + # do foward, backward, and optimization + ... - # single scheduler - sch = self.lr_schedulers() - sch.step() + # single scheduler + sch = self.lr_schedulers() + sch.step() - # multiple schedulers - sch1, sch2 = self.lr_schedulers() - sch1.step() - sch2.step() + # multiple schedulers + sch1, sch2 = self.lr_schedulers() + sch1.step() + sch2.step() If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the following. .. 
testcode:: python - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - # do forward, backward, and optimization - ... + def training_step(self, batch, batch_idx): + # do forward, backward, and optimization + ... - sch = self.lr_schedulers() + sch = self.lr_schedulers() - # step every `n` batches - if (batch_idx + 1) % n == 0: - sch.step() + # step every `n` batches + if (batch_idx + 1) % n == 0: + sch.step() - # step every `n` epochs - if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: - sch.step() + # step every `n` epochs + if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: + sch.step() ----- @@ -126,22 +126,22 @@ To perform gradient accumulation with one optimizer, you can do as such. .. testcode:: python - # accumulate gradients over 2 batches + # accumulate gradients over 2 batches - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - opt = self.optimizers() + def training_step(self, batch, batch_idx): + opt = self.optimizers() - loss = self.compute_loss(batch) - self.manual_backward(loss) + loss = self.compute_loss(batch) + self.manual_backward(loss) - # accumulate gradients of 2 batches - if (batch_idx + 1) % 2 == 0: - opt.step() - opt.zero_grad() + # accumulate gradients of 2 batches + if (batch_idx + 1) % 2 == 0: + opt.step() + opt.zero_grad() ----- @@ -150,72 +150,71 @@ Use multiple optimizers [manual] .. testcode:: python - import torch - from torch import Tensor - from pytorch_lightning import LightningModule + import torch + from torch import Tensor + from pytorch_lightning import LightningModule - class SimpleGAN(LightningModule): + class SimpleGAN(LightningModule): + def __init__(self): + super().__init__() + self.G = Generator() + self.D = Discriminator() - def __init__(self): - super().__init__() - self.G = Generator() - self.D = Discriminator() + # Important: This property activates manual optimization. + self.automatic_optimization = False - # Important: This property activates manual optimization. 
- self.automatic_optimization = False + def sample_z(self, n) -> Tensor: + sample = self._Z.sample((n,)) + return sample - def sample_z(self, n) -> Tensor: - sample = self._Z.sample((n,)) - return sample + def sample_G(self, n) -> Tensor: + z = self.sample_z(n) + return self.G(z) - def sample_G(self, n) -> Tensor: - z = self.sample_z(n) - return self.G(z) + def training_step(self, batch, batch_idx): + # Implementation follows the PyTorch tutorial: + # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + g_opt, d_opt = self.optimizers() - def training_step(self, batch, batch_idx): - # Implementation follows the PyTorch tutorial: - # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() + X, _ = batch + batch_size = X.shape[0] - X, _ = batch - batch_size = X.shape[0] + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) + g_X = self.sample_G(batch_size) - g_X = self.sample_G(batch_size) + ########################## + # Optimize Discriminator # + ########################## + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - ########################## - # Optimize Discriminator # - ########################## - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + errD = (errD_real + errD_fake) - errD = (errD_real + errD_fake) + d_opt.zero_grad() + self.manual_backward(errD) + d_opt.step() - d_opt.zero_grad() - self.manual_backward(errD) - d_opt.step() + ###################### + # Optimize Generator # + ###################### + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - ###################### - # Optimize Generator # - ###################### - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + g_opt.zero_grad() + self.manual_backward(errG) + g_opt.step() - g_opt.zero_grad() - self.manual_backward(errG) - g_opt.step() + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) - - def configure_optimizers(self): - g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) - d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) - return g_opt, d_opt + def configure_optimizers(self): + g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) + d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) + return g_opt, d_opt ----- @@ -239,60 +238,60 @@ Here is an example for advanced use-case. .. testcode:: python - # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. - class SimpleGAN(LightningModule): + # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. 
+ class SimpleGAN(LightningModule): - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx): - # Implementation follows the PyTorch tutorial: - # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html - g_opt, d_opt = self.optimizers() + def training_step(self, batch, batch_idx): + # Implementation follows the PyTorch tutorial: + # https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html + g_opt, d_opt = self.optimizers() - X, _ = batch - X.requires_grad = True - batch_size = X.shape[0] + X, _ = batch + X.requires_grad = True + batch_size = X.shape[0] - real_label = torch.ones((batch_size, 1), device=self.device) - fake_label = torch.zeros((batch_size, 1), device=self.device) + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - # Sync and clear gradients only at the end of accumulation. - is_last_batch_to_accumulate = (batch_idx + 1) % 2 == 0 + # Sync and clear gradients only at the end of accumulation. + is_last_batch_to_accumulate = (batch_idx + 1) % 2 == 0 - g_X = self.sample_G(batch_size) + g_X = self.sample_G(batch_size) - ########################## - # Optimize Discriminator # - ########################## - with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + ########################## + # Optimize Discriminator # + ########################## + with d_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - errD = (errD_real + errD_fake) + errD = (errD_real + errD_fake) - self.manual_backward(errD) - if is_last_batch_to_accumulate: - d_opt.step() - d_opt.zero_grad() + self.manual_backward(errD) + if is_last_batch_to_accumulate: + d_opt.step() + d_opt.zero_grad() - ###################### - # Optimize Generator # - ###################### - with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + ###################### + # Optimize Generator # + ###################### + with g_opt.toggle_model(sync_grad=is_last_batch_to_accumulate): + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - self.manual_backward(errG) - if is_last_batch_to_accumulate: - g_opt.step() - g_opt.zero_grad() + self.manual_backward(errG) + if is_last_batch_to_accumulate: + g_opt.step() + g_opt.zero_grad() - self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) ----- @@ -306,23 +305,23 @@ Here is an example using a closure function. .. testcode:: python - def __init__(self): - super().__init__() - self.automatic_optimization = False + def __init__(self): + super().__init__() + self.automatic_optimization = False - def configure_optimizers(self): - return torch.optim.LBFGS(...) + def configure_optimizers(self): + return torch.optim.LBFGS(...) 
- def training_step(self, batch, batch_idx): - opt = self.optimizers() + def training_step(self, batch, batch_idx): + opt = self.optimizers() - def closure(): - loss = self.compute_loss(batch) - opt.zero_grad() - self.manual_backward(loss) - return loss + def closure(): + loss = self.compute_loss(batch) + opt.zero_grad() + self.manual_backward(loss) + return loss - opt.step(closure=closure) + opt.step(closure=closure) ------ @@ -397,25 +396,25 @@ with the keyword ``"monitor"`` set to metric that the scheduler should be condit .. testcode:: - # The ReduceLROnPlateau scheduler requires a monitor - def configure_optimizers(self): - optimizer = Adam(...) - return { - 'optimizer': optimizer, - 'lr_scheduler': ReduceLROnPlateau(optimizer, ...), - 'monitor': 'metric_to_track', - } + # The ReduceLROnPlateau scheduler requires a monitor + def configure_optimizers(self): + optimizer = Adam(...) + return { + 'optimizer': optimizer, + 'lr_scheduler': ReduceLROnPlateau(optimizer, ...), + 'monitor': 'metric_to_track', + } - # In the case of two optimizers, only one using the ReduceLROnPlateau scheduler - def configure_optimizers(self): - optimizer1 = Adam(...) - optimizer2 = SGD(...) - scheduler1 = ReduceLROnPlateau(optimizer1, ...) - scheduler2 = LambdaLR(optimizer2, ...) - return ( - {'optimizer': optimizer1, 'lr_scheduler': scheduler1, 'monitor': 'metric_to_track'}, - {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, - ) + # In the case of two optimizers, only one using the ReduceLROnPlateau scheduler + def configure_optimizers(self): + optimizer1 = Adam(...) + optimizer2 = SGD(...) + scheduler1 = ReduceLROnPlateau(optimizer1, ...) + scheduler2 = LambdaLR(optimizer2, ...) + return ( + {'optimizer': optimizer1, 'lr_scheduler': scheduler1, 'monitor': 'metric_to_track'}, + {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, + ) .. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.LightningModule`. @@ -437,21 +436,21 @@ a scheduler configuration should be returned as a dict which can contain the fol .. testcode:: python - # Same as the above example with additional params passed to the first scheduler - # In this case the ReduceLROnPlateau will step after every 10 processed batches - def configure_optimizers(self): - optimizers = [Adam(...), SGD(...)] - schedulers = [ - { - 'scheduler': ReduceLROnPlateau(optimizers[0], ...), - 'monitor': 'metric_to_track', - 'interval': 'step', - 'frequency': 10, - 'strict': True, - }, - LambdaLR(optimizers[1], ...) - ] - return optimizers, schedulers + # Same as the above example with additional params passed to the first scheduler + # In this case the ReduceLROnPlateau will step after every 10 processed batches + def configure_optimizers(self): + optimizers = [Adam(...), SGD(...)] + schedulers = [ + { + 'scheduler': ReduceLROnPlateau(optimizers[0], ...), + 'monitor': 'metric_to_track', + 'interval': 'step', + 'frequency': 10, + 'strict': True, + }, + LambdaLR(optimizers[1], ...) + ] + return optimizers, schedulers ----- @@ -461,34 +460,34 @@ To use multiple optimizers, return two or more optimizers from :meth:`pytorch_li .. testcode:: python - # two optimizers, no schedulers - def configure_optimizers(self): - return Adam(...), SGD(...) + # two optimizers, no schedulers + def configure_optimizers(self): + return Adam(...), SGD(...) 
- # two optimizers, one scheduler for adam only - def configure_optimizers(self): - return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} + # two optimizers, one scheduler for adam only + def configure_optimizers(self): + return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} - # two optimizers, two schedulers - def configure_optimizers(self): - opt1 = Adam(...) - opt2 = SGD(...) - return [opt1, opt2], [StepLR(opt1, ...), OneCycleLR(opt2, ...)] + # two optimizers, two schedulers + def configure_optimizers(self): + opt1 = Adam(...) + opt2 = SGD(...) + return [opt1, opt2], [StepLR(opt1, ...), OneCycleLR(opt2, ...)] Under the hood, Lightning will call each optimizer sequentially: .. code-block:: python - for epoch in epochs: - for batch in data: - for opt in optimizers: - loss = train_step(batch, batch_idx, optimizer_idx) - opt.zero_grad() - loss.backward() - opt.step() + for epoch in epochs: + for batch in data: + for opt in optimizers: + loss = train_step(batch, batch_idx, optimizer_idx) + opt.zero_grad() + loss.backward() + opt.step() - for lr_scheduler in lr_schedulers: - lr_scheduler.step() + for lr_scheduler in lr_schedulers: + lr_scheduler.step() ----- @@ -501,51 +500,51 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch .. testcode:: - # Alternating schedule for optimizer steps (e.g. GANs) - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # update generator opt every 2 steps - if optimizer_idx == 0: - if batch_idx % 2 == 0: - optimizer.step(closure=optimizer_closure) - - # update discriminator opt every 4 steps - if optimizer_idx == 1: - if batch_idx % 4 == 0: + # Alternating schedule for optimizer steps (e.g. GANs) + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # update generator opt every 2 steps + if optimizer_idx == 0: + if batch_idx % 2 == 0: optimizer.step(closure=optimizer_closure) + # update discriminator opt every 4 steps + if optimizer_idx == 1: + if batch_idx % 4 == 0: + optimizer.step(closure=optimizer_closure) + Here we add a learning rate warm-up. .. testcode:: - # learning rate warm-up - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # skip the first 500 steps - if self.trainer.global_step < 500: - lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) - for pg in optimizer.param_groups: - pg['lr'] = lr_scale * self.hparams.learning_rate + # learning rate warm-up + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # skip the first 500 steps + if self.trainer.global_step < 500: + lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) + for pg in optimizer.param_groups: + pg['lr'] = lr_scale * self.hparams.learning_rate - # update params - optimizer.step(closure=optimizer_closure) + # update params + optimizer.step(closure=optimizer_closure) .. note:: The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, gradient accumulation and much more ... .. 
testcode:: python - # function hook in LightningModule - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - optimizer.step(closure=optimizer_closure) + # function hook in LightningModule + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + optimizer.step(closure=optimizer_closure) .. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follow. .. testcode:: python - # function hook in LightningModule - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + # function hook in LightningModule + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - # `optimizer` is a `LightningOptimizer` wrapping the optimizer. - # To access it, do as follow: - optimizer = optimizer.optimizer + # `optimizer` is a `LightningOptimizer` wrapping the optimizer. + # To access it, do as follow: + optimizer = optimizer.optimizer - # run step. However, it won't work on TPU, AMP, etc... - optimizer.step(closure=optimizer_closure) + # run step. However, it won't work on TPU, AMP, etc... + optimizer.step(closure=optimizer_closure) From 43e8ee692be9cada7966ac3021d667201dc90a9c Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 14 Apr 2021 06:01:32 +0900 Subject: [PATCH 06/14] Update docs --- docs/source/common/optimizers.rst | 250 +++++++++++++++++------------- 1 file changed, 140 insertions(+), 110 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 39b56666900f4..57098738b02e6 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -8,7 +8,8 @@ Lightning offers two modes for managing the optimization process: - automatic optimization - manual optimization -For the majority of research cases, **automatic optimization** will do the right thing for you and it is what most users should use. +For the majority of research cases, **automatic optimization** will do the right thing for you and it is what most +users should use. For advanced/expert users who want to do esoteric optimization schedules or techniques, use **manual optimization**. @@ -16,7 +17,8 @@ For advanced/expert users who want to do esoteric optimization schedules or tech Manual optimization =================== -For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable to manually manage the optimization process. +For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable to +manually manage the optimization process. This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. @@ -24,8 +26,12 @@ The users are left with ``optimizer.zero_grad()``, gradient accumulation, model To manually optimize, do the following: -* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule``'s ``__init__`` function. -* Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. +* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule`` ``__init__`` function. 
+* Use the following functions and call them manually: + + * ``optimizer.zero_grad()`` to clear the gradients from the previous training step + * ``self.manual_backward(loss)`` instead of ``loss.backward()`` + * ``optimizer.step()`` to update your model parameters Here is a minimal example of manual optimization. @@ -42,10 +48,8 @@ Here is a minimal example of manual optimization. def training_step(batch, batch_idx): opt = self.optimizers() - - loss = self.compute_loss(batch) - opt.zero_grad() + loss = self.compute_loss(batch) self.manual_backward(loss) opt.step() @@ -57,77 +61,20 @@ Here is a minimal example of manual optimization. * ``self.optimizers()`` will return :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` objects. You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to support accelerators and precision for you. - * Be careful where you call ``zero_grad``, or your model won't converge. - It is good practice to call ``zero_grad`` before ``manual_backward``. - ------ - -Learning rate scheduling [manual] ---------------------------------- -You can obtain learning schedulers to call ``lr_scheduler.step()`` at arbitrary intervals. -Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined in ``LightningModule.configure_optimizers()``. - -.. warning:: - * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored. - * Before 1.3, ``lr_scheduler.step()`` was automatically called in manual optimization. - From 1.3, ``lr_scheduler.step()`` is disabled so that you can call it at arbitrary intervals. - -Here is a example calling ``step()`` every step. - -.. testcode:: python - - # step every batch - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def training_step(self, batch, batch_idx): - # do foward, backward, and optimization - ... - - # single scheduler - sch = self.lr_schedulers() - sch.step() - - # multiple schedulers - sch1, sch2 = self.lr_schedulers() - sch1.step() - sch2.step() - -If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the following. - -.. testcode:: python - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def training_step(self, batch, batch_idx): - # do forward, backward, and optimization - ... - - sch = self.lr_schedulers() - - # step every `n` batches - if (batch_idx + 1) % n == 0: - sch.step() - - # step every `n` epochs - if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: - sch.step() + * Be careful where you call ``optimizer.zero_grad()``, or your model won't converge. + It is good practice to call ``optimizer.zero_grad()`` before ``self.manual_backward(loss)``. ----- Gradient accumulation --------------------- -You can accumulate gradients over batches similarly to :attr:`~pytorch_lightning.trainer.Trainer.accumulate_grad_batches` of automatic optimization. +You can accumulate gradients over batches similarly to +:attr:`~pytorch_lightning.trainer.Trainer.accumulate_grad_batches` of automatic optimization. To perform gradient accumulation with one optimizer, you can do as such. .. testcode:: python - # accumulate gradients over 2 batches - + # accumulate gradients over `n` batches def __init__(self): super().__init__() self.automatic_optimization = False @@ -138,15 +85,16 @@ To perform gradient accumulation with one optimizer, you can do as such. 
loss = self.compute_loss(batch) self.manual_backward(loss) - # accumulate gradients of 2 batches - if (batch_idx + 1) % 2 == 0: + # accumulate gradients of `n` batches + if (batch_idx + 1) % n == 0: opt.step() opt.zero_grad() ----- -Use multiple optimizers [manual] --------------------------------- +Use multiple optimizers (like GANs) [manual] +-------------------------------------------- +Here is an example training a simple GAN with multiple optimizers. .. testcode:: python @@ -218,10 +166,68 @@ Use multiple optimizers [manual] ----- -Improve training time with model toggling ------------------------------------------ -Toggling models can improve your training speed when performing gradient accumulation with multiple optimizers -in a distributed setting. +Learning rate scheduling [manual] +--------------------------------- +You can call ``lr_scheduler.step()`` at arbitrary intervals. +Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined +in your ``LightningModule.configure_optimizers()``. + +.. warning:: + * Before 1.3, Lightning automatically calls ``lr_scheduler.step()`` in both automatic and manual optimization. From + 1.3, ``lr_scheduler.step()`` is disabled in manual optimization so that you can call it at arbitrary intervals. + * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored even if they are provided in + your ``configure_optimizers()`` in manual optimization. + +Here is an example calling ``lr_scheduler.step()`` every step. + +.. testcode:: python + + # step every batch + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx): + # do forward, backward, and optimization + ... + + # single scheduler + sch = self.lr_schedulers() + sch.step() + + # multiple schedulers + sch1, sch2 = self.lr_schedulers() + sch1.step() + sch2.step() + +If you want to call ``lr_scheduler.step()`` every ``n`` steps/epochs, do the following. + +.. testcode:: python + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx): + # do forward, backward, and optimization + ... + + sch = self.lr_schedulers() + + # step every `n` batches + if (batch_idx + 1) % n == 0: + sch.step() + + # step every `n` epochs + if self.trainer.is_last_batch and (self.trainer.current_epoch + 1) % n == 0: + sch.step() + +----- + +Improve training speed with model toggling +------------------------------------------ +Toggling models can improve your training speed when performing gradient accumulation with multiple optimizers in a +distributed setting. Here is an explanation of what it does: @@ -232,7 +238,9 @@ Here is an explanation of what it does: When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase. Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. -:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a :meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a :func:`contextlib.contextmanager` for advanced users. +:class:`~pytorch_lightning.core.optimizer.LightningOptimizer` provides a +:meth:`~pytorch_lightning.core.optimizer.LightningOptimizer.toggle_model` function as a +:func:`contextlib.contextmanager` for advanced users. Here is an example for advanced use-case. @@ -257,8 +265,11 @@ Here is an example for advanced use-case. 
real_label = torch.ones((batch_size, 1), device=self.device) fake_label = torch.zeros((batch_size, 1), device=self.device) - # Sync and clear gradients only at the end of accumulation. - is_last_batch_to_accumulate = (batch_idx + 1) % 2 == 0 + # Sync and clear gradients + # at the end of accumulation or + # at the end of an epoch. + is_last_batch_to_accumulate = \ + (batch_idx + 1) % 2 == 0 or self.trainer.is_last_batch g_X = self.sample_G(batch_size) @@ -297,9 +308,11 @@ Here is an example for advanced use-case. Use closure for LBFGS-like optimizers ------------------------------------- -It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and ``backward`` of your model. -It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure such as :class:`torch.optim.LBFGS`. -See `the PyTorch docs `_ for more details about the closure. +It is a good practice to provide the optimizer with a closure function that performs a ``forward``, ``zero_grad`` and +``backward`` of your model. It is optional for most optimizers, but makes your code compatible if you switch to an +optimizer which requires a closure, such as :class:`torch.optim.LBFGS`. + +See `the PyTorch docs `_ for more about the closure. Here is an example using a closure function. @@ -327,7 +340,8 @@ Here is an example using a closure function. Automatic optimization ====================== -With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` since Lightning automates that for you. +With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` +since Lightning automates that for you. Under the hood, Lightning does the following: @@ -359,17 +373,19 @@ In the case of multiple optimizers, Lightning does the following: lr_scheduler.step() .. warning:: - Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. - From 1.2.2, Lightning calls ``.zero_grad()`` before ``.backward()``. + Before 1.2.2, Lightning internally calls ``backward``, ``step`` and ``zero_grad`` in the order. + From 1.2.2, the order is changed to ``zero_grad``, ``backward`` and ``step``. ----- Learning rate scheduling ------------------------ -Every optimizer you use can be paired with any `Learning Rate Scheduler `_. -In the basic use-case, the scheduler(s) should be returned as the second output from the :meth:`~pytorch_lightning.LightningModule.configure_optimizers` method: +Every optimizer you use can be paired with any +`Learning Rate Scheduler `_. In the basic +use-case, the scheduler(s) should be returned as the second output from the +:meth:`~pytorch_lightning.LightningModule.configure_optimizers` method: -.. testcode:: +.. testcode:: python # no LR scheduler def configure_optimizers(self): @@ -389,10 +405,10 @@ In the basic use-case, the scheduler(s) should be returned as the second output scheduler2 = LambdaLR(optimizer2, ...) return [optimizer1, optimizer2], [scheduler1, scheduler2] -When there are schedulers in which the ``.step()`` method is conditioned on a metric value (for example the -:class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler), Lightning requires that the output -from :meth:`~pytorch_lightning.LightningModule.configure_optimizers` should be dicts, one for each optimizer, -with the keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. 
+When there are schedulers in which the ``.step()`` method is conditioned on a metric value, such as the +:class:`~torch.optim.lr_scheduler.ReduceLROnPlateau` scheduler, Lightning requires that the output from +:meth:`~pytorch_lightning.LightningModule.configure_optimizers` should be dicts, one for each optimizer, with the +keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. .. testcode:: @@ -416,21 +432,22 @@ with the keyword ``"monitor"`` set to metric that the scheduler should be condit {'optimizer': optimizer2, 'lr_scheduler': scheduler2}, ) -.. note:: Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.LightningModule`. +.. note:: + Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` + in your :class:`~pytorch_lightning.LightningModule`. -By default, all schedulers will be called after each epoch ends. -To change this behaviour, -a scheduler configuration should be returned as a dict which can contain the following keywords: +By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration +should be returned as a dict which can contain the following keywords: * ``"scheduler"`` (required): the actual scheduler object * ``"monitor"`` (optional): metric to condition -* ``"interval"`` (optional): either ``epoch`` (default) for stepping after each epoch ends or ``step`` for stepping +* ``"interval"`` (optional): either ``"epoch"`` (default) for stepping after each epoch ends or ``"step"`` for stepping after each optimization step * ``"frequency"`` (optional): how many epochs/steps should pass between calls to ``scheduler.step()``. Default is 1, corresponding to updating the learning rate after every epoch/step. -* ``"strict"`` (optional): if set to ``True`` will enforce that value specified in ``monitor`` is available while trying - to call ``scheduler.step()``, and stop training if not found. If ``False`` will only give a warning and continue - training without calling the scheduler. +* ``"strict"`` (optional): if set to ``True``, will enforce that value specified in ``"monitor"`` is available while + trying to call ``scheduler.step()``, and stop training if not found. If ``False``, it will only give a warning and + continue training without calling the scheduler. * ``"name"`` (optional): if using the :class:`~pytorch_lightning.callbacks.LearningRateMonitor` callback to monitor the learning rate progress, this keyword can be used to specify a name the learning rate should be logged as. @@ -456,7 +473,8 @@ a scheduler configuration should be returned as a dict which can contain the fol Use multiple optimizers (like GANs) ----------------------------------- -To use multiple optimizers, return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. +To use multiple optimizers, return two or more optimizers from +:meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. .. testcode:: python @@ -498,10 +516,13 @@ override the :meth:`~pytorch_lightning.LightningModule.optimizer_step` function. For example, here step optimizer A every 2 batches and optimizer B every 4 batches. -.. testcode:: +.. testcode:: python # Alternating schedule for optimizer steps (e.g. 
GANs) - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False, + ): # update generator opt every 2 steps if optimizer_idx == 0: if batch_idx % 2 == 0: @@ -514,10 +535,13 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch Here we add a learning rate warm-up. -.. testcode:: +.. testcode:: python # learning rate warm-up - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False, + ): # skip the first 500 steps if self.trainer.global_step < 500: lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) @@ -527,7 +551,10 @@ Here we add a learning rate warm-up. # update params optimizer.step(closure=optimizer_closure) -.. note:: The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, gradient accumulation and much more ... +.. note:: + The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal + :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, + gradient accumulation and much more ... .. testcode:: python @@ -535,13 +562,16 @@ Here we add a learning rate warm-up. def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): optimizer.step(closure=optimizer_closure) -.. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follow. +.. note:: + To access your wrapped Optimizer from :class:`~pytorch_lightning.core.optimizer.LightningOptimizer`, do as follow. .. testcode:: python # function hook in LightningModule - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): - + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False, + ): # `optimizer` is a `LightningOptimizer` wrapping the optimizer. # To access it, do as follow: optimizer = optimizer.optimizer From 6e9bd4e85d08549832eecf8e594f0977e9ec6cf6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 15 Apr 2021 11:31:15 +0900 Subject: [PATCH 07/14] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Adrian Wälchli --- docs/source/common/optimizers.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 57098738b02e6..75ae5118ab848 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -26,9 +26,10 @@ The users are left with ``optimizer.zero_grad()``, gradient accumulation, model To manually optimize, do the following: -* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule`` ``__init__`` function. 
+* Set the ``self.automatic_optimization = False`` in your ``LightningModule``'s ``__init__``. * Use the following functions and call them manually: + * ``self.optimizers()`` to access your optimizers (one or multiple) * ``optimizer.zero_grad()`` to clear the gradients from the previous training step * ``self.manual_backward(loss)`` instead of ``loss.backward()`` * ``optimizer.step()`` to update your model parameters @@ -55,7 +56,7 @@ Here is a minimal example of manual optimization. .. warning:: Before 1.2, ``optimizer.step()`` was calling ``optimizer.zero_grad()`` internally. - From 1.2, it is left to the users expertise. + From 1.2, it is left to the user's expertise. .. tip:: * ``self.optimizers()`` will return :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` objects. You can @@ -169,14 +170,14 @@ Here is an example training a simple GAN with multiple optimizers. Learning rate scheduling [manual] --------------------------------- You can call ``lr_scheduler.step()`` at arbitrary intervals. -Use ``self.lr_schedulers()`` in LightningModule to access your learning rate schedulers defined -in your ``LightningModule.configure_optimizers()``. +Use ``self.lr_schedulers()`` in your :class:`~pytorch_lightning.LightningModule` to access any learning rate schedulers defined +in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. .. warning:: - * Before 1.3, Lightning automatically calls ``lr_scheduler.step()`` in both automatic and manual optimization. From - 1.3, ``lr_scheduler.step()`` is disabled in manual optimization so that you can call it at arbitrary intervals. + * Before 1.3, Lightning automatically called ``lr_scheduler.step()`` in both automatic and manual optimization. From + 1.3, ``lr_scheduler.step()`` is now for the user to call at arbitrary intervals. * Note that the lr_dict keys, such as ``"step"`` and ``""interval"``, will be ignored even if they are provided in - your ``configure_optimizers()`` in manual optimization. + your ``configure_optimizers()`` during manual optimization. Here is an example calling ``lr_scheduler.step()`` every step. @@ -433,7 +434,7 @@ keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. ) .. note:: - Metrics can be made available to condition on by simply logging it using ``self.log('metric_to_track', metric_val)`` + Metrics can be made available to monitor by simply logging it using ``self.log('metric_to_track', metric_val)`` in your :class:`~pytorch_lightning.LightningModule`. By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration From 29f70c01da70fe61de128e8267148f1ef3c32a15 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 15 Apr 2021 11:46:07 +0900 Subject: [PATCH 08/14] Add note for optimizer.optimizer --- docs/source/common/optimizers.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 75ae5118ab848..a73b80791d1e2 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -564,7 +564,10 @@ Here we add a learning rate warm-up. optimizer.step(closure=optimizer_closure) .. note:: - To access your wrapped Optimizer from :class:`~pytorch_lightning.core.optimizer.LightningOptimizer`, do as follow. + ``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer + configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. 
You can access your own + optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be + able to support accelerators and precision for you. .. testcode:: python From 5d6bb275b14a3aefd0507f66fbfb22cee1a84cb5 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:03:17 +0900 Subject: [PATCH 09/14] . --- docs/source/common/lightning_module.rst | 9 +++- docs/source/common/optimizers.rst | 72 ++++++++++++++----------- 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 95f7c0a6dcb7e..97d9fc6192710 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -916,7 +916,10 @@ True if using Automatic Mixed Precision (AMP) automatic_optimization ~~~~~~~~~~~~~~~~~~~~~~ -When set to ``False``, Lightning does not automate the optimization process. This means you are responsible for handling your optimizers. However, we do take care of precision and any accelerators used. +When set to ``False``, Lightning does not automate the optimization process. This means you are responsible for handling +your optimizers. However, we do take care of precision and any accelerators used. + +See :ref:`manual optimization` for details. .. code-block:: python @@ -931,7 +934,9 @@ When set to ``False``, Lightning does not automate the optimization process. Thi self.manual_backward(loss) opt.step() -This is recommended only if using 2+ optimizers AND if you know how to perform the optimization procedure properly. Note that automatic optimization can still be used with multiple optimizers by relying on the ``optimizer_idx`` parameter. Manual optimization is most useful for research topics like reinforcement learning, sparse coding, and GAN research. +This is recommended only if using 2+ optimizers AND if you know how to perform the optimization procedure properly. Note +that automatic optimization can still be used with multiple optimizers by relying on the ``optimizer_idx`` parameter. +Manual optimization is most useful for research topics like reinforcement learning, sparse coding, and GAN research. .. code-block:: python diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index a73b80791d1e2..98eb26187b967 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -26,7 +26,7 @@ The users are left with ``optimizer.zero_grad()``, gradient accumulation, model To manually optimize, do the following: -* Set the ``self.automatic_optimization = False`` in your ``LightningModule``'s ``__init__``. +* Set ``self.automatic_optimization=False`` in your ``LightningModule``'s ``__init__``. * Use the following functions and call them manually: * ``self.optimizers()`` to access your optimizers (one or multiple) @@ -170,8 +170,8 @@ Here is an example training a simple GAN with multiple optimizers. Learning rate scheduling [manual] --------------------------------- You can call ``lr_scheduler.step()`` at arbitrary intervals. -Use ``self.lr_schedulers()`` in your :class:`~pytorch_lightning.LightningModule` to access any learning rate schedulers defined -in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. +Use ``self.lr_schedulers()`` in your :class:`~pytorch_lightning.LightningModule` to access any learning rate schedulers +defined in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. .. 
warning:: * Before 1.3, Lightning automatically called ``lr_scheduler.step()`` in both automatic and manual optimization. From @@ -434,8 +434,8 @@ keyword ``"monitor"`` set to metric that the scheduler should be conditioned on. ) .. note:: - Metrics can be made available to monitor by simply logging it using ``self.log('metric_to_track', metric_val)`` - in your :class:`~pytorch_lightning.LightningModule`. + Metrics can be made available to monitor by simply logging it using ``self.log('metric_to_track', metric_val)`` in + your :class:`~pytorch_lightning.LightningModule`. By default, all schedulers will be called after each epoch ends. To change this behaviour, a scheduler configuration should be returned as a dict which can contain the following keywords: @@ -474,8 +474,8 @@ should be returned as a dict which can contain the following keywords: Use multiple optimizers (like GANs) ----------------------------------- -To use multiple optimizers, return two or more optimizers from -:meth:`pytorch_lightning.core.LightningModule.configure_optimizers`. +To use multiple optimizers (optionally with learning rate schedulers), return two or more optimizers from +:meth:`~pytorch_lightning.core.LightningModule.configure_optimizers`. .. testcode:: python @@ -485,7 +485,11 @@ To use multiple optimizers, return two or more optimizers from # two optimizers, one scheduler for adam only def configure_optimizers(self): - return [Adam(...), SGD(...)], {'scheduler': ReduceLROnPlateau(), 'monitor': 'metric_to_track'} + opt1 = Adam(...) + opt2 = SGD(...) + optimizers = [opt1, opt2] + lr_schedulers = {'scheduler': ReduceLROnPlateau(opt1, ...), 'monitor': 'metric_to_track'} + return optimizers, lr_schedulers # two optimizers, two schedulers def configure_optimizers(self): @@ -515,7 +519,12 @@ Step optimizers at arbitrary intervals To do more interesting things with your optimizers such as learning rate warm-up or odd scheduling, override the :meth:`~pytorch_lightning.LightningModule.optimizer_step` function. -For example, here step optimizer A every 2 batches and optimizer B every 4 batches. +.. warning:: + If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter to + ``optimizer.step()`` function as shown in the examples because ``training_step()``, ``optimizer.zero_grad()``, + ``backward()`` are called in the closure function. + +For example, here step optimizer A every batch and optimizer B every 2 batches. .. testcode:: python @@ -524,16 +533,18 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False, ): - # update generator opt every 2 steps + # update generator every step if optimizer_idx == 0: - if batch_idx % 2 == 0: - optimizer.step(closure=optimizer_closure) + optimizer.step(closure=optimizer_closure) - # update discriminator opt every 4 steps + # update discriminator every 2 steps if optimizer_idx == 1: - if batch_idx % 4 == 0: + if (batch_idx + 1) % 2 == 0: optimizer.step(closure=optimizer_closure) + # ... + # add as many optimizers as you want + Here we add a learning rate warm-up. .. testcode:: python @@ -552,33 +563,30 @@ Here we add a learning rate warm-up. # update params optimizer.step(closure=optimizer_closure) -.. 
note:: - The default :meth:`~pytorch_lightning.LightningModule.optimizer_step` is relying on the internal - :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` to properly perform a step. It handles TPUs, AMP, - gradient accumulation and much more ... +----- + +Access your own optimizer +------------------------- +``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer +configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. You can access your own optimizer +with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to +support accelerators and precision for you. .. testcode:: python # function hook in LightningModule - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False, + ): optimizer.step(closure=optimizer_closure) -.. note:: - ``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer - configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. You can access your own - optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be - able to support accelerators and precision for you. - -.. testcode:: python - - # function hook in LightningModule + # `optimizer` is a `LightningOptimizer` wrapping the optimizer. + # To access it, do the following. + # However, It won't work on TPU, AMP, etc... def optimizer_step( self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False, ): - # `optimizer` is a `LightningOptimizer` wrapping the optimizer. - # To access it, do as follow: optimizer = optimizer.optimizer - - # run step. However, it won't work on TPU, AMP, etc... optimizer.step(closure=optimizer_closure) From e0b3633e8222f9703e635d639f881a025d76c0ab Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:10:57 +0900 Subject: [PATCH 10/14] Update hooks --- pytorch_lightning/core/hooks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index b320a9b223840..9830e6ca38fa6 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -260,7 +260,7 @@ def on_predict_end(self) -> None: def on_before_zero_grad(self, optimizer: Optimizer) -> None: """ - Called after optimizer.step() and before optimizer.zero_grad(). + Called after ``training_step()`` and before ``optimizer.zero_grad()``. Called in the training loop after taking an optimizer step and before zeroing grads. Good place to inspect weight information with weights updated. @@ -268,10 +268,13 @@ def on_before_zero_grad(self, optimizer: Optimizer) -> None: This is where it is called:: for optimizer in optimizers: - optimizer.step() + out = training_step(...) + model.on_before_zero_grad(optimizer) # < ---- called here optimizer.zero_grad() + backward() + Args: optimizer: The optimizer for which grads should be zeroed. 
""" From 451233486c9bac0374dfc79992f4d5252985b1bd Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:11:23 +0900 Subject: [PATCH 11/14] Update closure docstring --- pytorch_lightning/trainer/training_loop.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 71d9407062001..8dbc41821b24a 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -651,9 +651,7 @@ def _process_closure_result(self, batch_outputs: list, opt_idx: int) -> list: return batch_outputs def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens): - """ - wrap the forward step in a closure so second order methods work - """ + """Wrap forward, zero_grad and backward in a closure so second order methods work""" with self.trainer.profiler.profile("training_step_and_backward"): # lightning module hook result = self.training_step(split_batch, batch_idx, opt_idx, hiddens) From eb9ecc3447927c3650549f603a73fae1db8f5921 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:25:11 +0900 Subject: [PATCH 12/14] Update optimizer methods --- docs/source/common/lightning_module.rst | 13 +- pytorch_lightning/core/lightning.py | 156 +++++++++++++----------- 2 files changed, 88 insertions(+), 81 deletions(-) diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 97d9fc6192710..2f61fdc47397f 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -698,6 +698,12 @@ log_dict .. automethod:: pytorch_lightning.core.lightning.LightningModule.log_dict :noindex: +manual_backward +~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward + :noindex: + print ~~~~~ @@ -1091,13 +1097,6 @@ get_progress_bar_dict .. automethod:: pytorch_lightning.core.lightning.LightningModule.get_progress_bar_dict :noindex: -manual_backward -~~~~~~~~~~~~~~~ - -.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward - :noindex: - - on_after_backward ~~~~~~~~~~~~~~~~~ diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 7efe88515b37e..46543a12a435f 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1097,28 +1097,22 @@ def configure_optimizers(self): Return: Any of these 6 options. - - Single optimizer. - - List or Tuple - List of optimizers. - - Two lists - The first list has multiple optimizers, the second a list of LR schedulers (or lr_dict). - - Dictionary, with an 'optimizer' key, and (optionally) a 'lr_scheduler' + - **Single optimizer**. + - **List or Tuple** of optimizers. + - **Two lists** - The first list has multiple optimizers, and the second has multiple LR schedulers (or + multiple lr_dict). + - **Dictionary**, with an ``"optimizer"`` key, and (optionally) a ``"lr_scheduler"`` key whose value is a single LR scheduler or lr_dict. - - Tuple of dictionaries as described, with an optional 'frequency' key. - - None - Fit will run without any optimizer. + - **Tuple of dictionaries** as described above, with an optional ``"frequency"`` key. + - **None** - Fit will run without any optimizer. Note: - The 'frequency' value is an int corresponding to the number of sequential batches - optimized with the specific optimizer. It should be given to none or to all of the optimizers. 
- There is a difference between passing multiple optimizers in a list, - and passing multiple optimizers in dictionaries with a frequency of 1: - In the former case, all optimizers will operate on the given batch in each optimization step. - In the latter, only one optimizer will operate on the given batch at every step. - - The lr_dict is a dictionary which contains the scheduler and its associated configuration. - The default configuration is shown below. + The lr_dict is a dictionary which contains the scheduler and its associated configuration. The default + configuration is shown below. .. code-block:: python - { + lr_dict = { 'scheduler': lr_scheduler, # The LR scheduler instance (required) 'interval': 'epoch', # The unit of the scheduler's step size 'frequency': 1, # The frequency of the scheduler @@ -1128,43 +1122,51 @@ def configure_optimizers(self): 'name': None, # Custom name for LearningRateMonitor to use } - Only the ``scheduler`` key is required, the rest will be set to the defaults above. + Only the ``"scheduler"`` key is required, the rest will be set to the defaults above. + + Note: + The ``"frequency"`` value is an ``int`` corresponding to the number of sequential batches optimized with the + specific optimizer. It should be given to none or to all of the optimizers. + + There is a difference between passing multiple optimizers in a list and passing multiple optimizers in + dictionaries with a frequency of 1: + In the former case, all optimizers will operate on the given batch in each optimization step. + In the latter, only one optimizer will operate on the given batch at every step. Examples:: # most cases def configure_optimizers(self): - opt = Adam(self.parameters(), lr=1e-3) - return opt + return Adam(self.parameters(), lr=1e-3) # multiple optimizer case (e.g.: GAN) def configure_optimizers(self): - generator_opt = Adam(self.model_gen.parameters(), lr=0.01) - disriminator_opt = Adam(self.model_disc.parameters(), lr=0.02) - return generator_opt, disriminator_opt + gen_opt = Adam(self.model_gen.parameters(), lr=0.01) + dis_opt = Adam(self.model_dis.parameters(), lr=0.02) + return gen_opt, dis_opt # example with learning rate schedulers def configure_optimizers(self): - generator_opt = Adam(self.model_gen.parameters(), lr=0.01) - disriminator_opt = Adam(self.model_disc.parameters(), lr=0.02) - discriminator_sched = CosineAnnealing(discriminator_opt, T_max=10) - return [generator_opt, disriminator_opt], [discriminator_sched] + gen_opt = Adam(self.model_gen.parameters(), lr=0.01) + dis_opt = Adam(self.model_dis.parameters(), lr=0.02) + dis_sch = CosineAnnealing(dis_opt, T_max=10) + return [gen_opt, dis_opt], [dis_sch] # example with step-based learning rate schedulers def configure_optimizers(self): gen_opt = Adam(self.model_gen.parameters(), lr=0.01) - dis_opt = Adam(self.model_disc.parameters(), lr=0.02) - gen_sched = {'scheduler': ExponentialLR(gen_opt, 0.99), - 'interval': 'step'} # called after each training step - dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called every epoch - return [gen_opt, dis_opt], [gen_sched, dis_sched] + dis_opt = Adam(self.model_dis.parameters(), lr=0.02) + gen_sch = {'scheduler': ExponentialLR(gen_opt, 0.99), + 'interval': 'step'} # called after each training step + dis_sch = CosineAnnealing(dis_opt, T_max=10) # called every epoch + return [gen_opt, dis_opt], [gen_sch, dis_sch] # example with optimizer frequencies # see training procedure in `Improved Training of Wasserstein GANs`, Algorithm 1 # 
https://arxiv.org/abs/1704.00028 def configure_optimizers(self): gen_opt = Adam(self.model_gen.parameters(), lr=0.01) - dis_opt = Adam(self.model_disc.parameters(), lr=0.02) + dis_opt = Adam(self.model_dis.parameters(), lr=0.02) n_critic = 5 return ( {'optimizer': dis_opt, 'frequency': n_critic}, @@ -1172,32 +1174,22 @@ def configure_optimizers(self): ) Note: - Some things to know: - - Lightning calls ``.backward()`` and ``.step()`` on each optimizer - and learning rate scheduler as needed. - - - If you use 16-bit precision (``precision=16``), Lightning will automatically - handle the optimizers for you. - - - If you use multiple optimizers, :meth:`training_step` will have an additional - ``optimizer_idx`` parameter. - - - If you use LBFGS Lightning handles the closure function automatically for you. - - - If you use multiple optimizers, gradients will be calculated only - for the parameters of current optimizer at each training step. - - - If you need to control how often those optimizers step or override the - default ``.step()`` schedule, override the :meth:`optimizer_step` hook. - - - If you only want to call a learning rate scheduler every ``x`` step or epoch, - or want to monitor a custom metric, you can specify these in a lr_dict: + - Lightning calls ``.backward()`` and ``.step()`` on each optimizer and learning rate scheduler as needed. + - If you use 16-bit precision (``precision=16``), Lightning will automatically handle the optimizers. + - If you use multiple optimizers, :meth:`training_step` will have an additional ``optimizer_idx`` parameter. + - If you use :class:`torch.optim.LBFGS`, Lightning handles the closure function automatically for you. + - If you use multiple optimizers, gradients will be calculated only for the parameters of current optimizer + at each training step. + - If you need to control how often those optimizers step or override the default ``.step()`` schedule, + override the :meth:`optimizer_step` hook. + - If you only want to call a learning rate scheduler every ``x`` step or epoch, or want to monitor a custom + metric, you can specify these in a lr_dict: .. code-block:: python - { + lr_dict = { 'scheduler': lr_scheduler, 'interval': 'step', # or 'epoch' 'monitor': 'val_f1', @@ -1210,23 +1202,24 @@ def configure_optimizers(self): def manual_backward(self, loss: Tensor, optimizer: Optional[Optimizer] = None, *args, **kwargs) -> None: """ Call this directly from your training_step when doing optimizations manually. - By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you + By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you. This function forwards all args to the .backward() call as well. - .. tip:: In manual mode we still automatically clip grads if Trainer(gradient_clip_val=x) is set + See :ref:`manual optimization` for more examples. - .. tip:: In manual mode we still automatically accumulate grad over batches if - Trainer(accumulate_grad_batches=x) is set and you use `optimizer.step()` + .. tip:: + In manual mode, we still automatically clip grads if ``Trainer(gradient_clip_val=x)`` is set. Example:: def training_step(...): - opt_a, opt_b = self.optimizers() + opt = self.optimizers() loss = ... + opt.zero_grad() # automatically applies scaling, etc... 
self.manual_backward(loss) - opt_a.step() + opt.step() """ if optimizer is not None: rank_zero_deprecation( @@ -1336,18 +1329,18 @@ def optimizer_step( Warning: If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter to ``optimizer.step()`` function as shown in the examples. This ensures that - ``train_step_and_backward_closure`` is called within + ``training_step()``, ``optimizer.zero_grad()``, ``backward()`` are called within :meth:`~pytorch_lightning.trainer.training_loop.TrainLoop.run_training_batch`. Args: epoch: Current epoch batch_idx: Index of current batch optimizer: A PyTorch optimizer - optimizer_idx: If you used multiple optimizers this indexes into that list. - optimizer_closure: closure for all optimizers - on_tpu: true if TPU backward is required - using_native_amp: True if using native amp - using_lbfgs: True if the matching optimizer is lbfgs + optimizer_idx: If you used multiple optimizers, this indexes into that list. + optimizer_closure: Closure for all optimizers + on_tpu: ``True`` if TPU backward is required + using_native_amp: ``True`` if using native amp + using_lbfgs: True if the matching optimizer is :class:`torch.optim.LBFGS` Examples:: @@ -1359,22 +1352,18 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, # Alternating schedule for optimizer steps (i.e.: GANs) def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): - # update generator opt every 2 steps + # update generator opt every step if optimizer_idx == 0: - if batch_idx % 2 == 0 : - optimizer.step(closure=optimizer_closure) - optimizer.zero_grad() + optimizer.step(closure=optimizer_closure) - # update discriminator opt every 4 steps + # update discriminator opt every 2 steps if optimizer_idx == 1: - if batch_idx % 4 == 0 : + if (batch_idx + 1) % 2 == 0 : optimizer.step(closure=optimizer_closure) - optimizer.zero_grad() # ... # add as many optimizers as you want - Here's another example showing how to use this for more advanced things such as learning rate warm-up: @@ -1391,7 +1380,6 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, # update params optimizer.step(closure=optimizer_closure) - optimizer.zero_grad() """ if not isinstance(optimizer, LightningOptimizer): @@ -1400,6 +1388,26 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer.step(closure=optimizer_closure) def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int): + """Override this method to change the default behaviour of ``optimizer.zero_grad()``. + + Args: + epoch: Current epoch + batch_idx: Index of current batch + optimizer: A PyTorch optimizer + optimizer_idx: If you used multiple optimizers this indexes into that list. + + Examples:: + + # DEFAULT + def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx): + optimizer.zero_grad() + + # Set gradients to `None` instead of zero to improve performance. + def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx): + optimizer.zero_grad(set_to_none=True) + + See :meth:`torch.optim.Optimizer.zero_grad` for the explanation of the above example. 
+ """ optimizer.zero_grad() def tbptt_split_batch(self, batch: Tensor, split_size: int) -> list: From 715bb2c6f84376d8d99a00743b075a10f7760194 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 03:33:20 +0900 Subject: [PATCH 13/14] Update optimizer --- docs/source/common/optimizers.rst | 33 +++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 98eb26187b967..d9b8d25911009 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -59,11 +59,8 @@ Here is a minimal example of manual optimization. From 1.2, it is left to the user's expertise. .. tip:: - * ``self.optimizers()`` will return :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` objects. You can - access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, - Lightning won't be able to support accelerators and precision for you. - * Be careful where you call ``optimizer.zero_grad()``, or your model won't converge. - It is good practice to call ``optimizer.zero_grad()`` before ``self.manual_backward(loss)``. + Be careful where you call ``optimizer.zero_grad()``, or your model won't converge. + It is good practice to call ``optimizer.zero_grad()`` before ``self.manual_backward(loss)``. ----- @@ -339,6 +336,30 @@ Here is an example using a closure function. ------ +Access your own optimizer [manual] +---------------------------------- +``optimizer`` is a :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` object wrapping your own optimizer +configured in your :meth:`~pytorch_lightning.LightningModule.configure_optimizers`. You can access your own optimizer +with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to +support accelerators and precision for you. + +.. testcode:: python + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(batch, batch_idx): + optimizer = self.optimizers() + + # `optimizer` is a `LightningOptimizer` wrapping the optimizer. + # To access it, do the following. + # However, it won't work on TPU, AMP, etc... + optimizer = optimizer.optimizer + ... + +----- + Automatic optimization ====================== With Lightning, most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` @@ -583,7 +604,7 @@ support accelerators and precision for you. # `optimizer` is a `LightningOptimizer` wrapping the optimizer. # To access it, do the following. - # However, It won't work on TPU, AMP, etc... + # However, it won't work on TPU, AMP, etc... def optimizer_step( self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False, From 23803e35c1ef90d8d3e9edfa7aac1d36318ffacc Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 17 Apr 2021 10:12:10 +0900 Subject: [PATCH 14/14] Remove manopt + grad clipping (by @flukeskywalker) --- pytorch_lightning/core/lightning.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 46543a12a435f..54ea9d1bdb77e 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1208,9 +1208,6 @@ def manual_backward(self, loss: Tensor, optimizer: Optional[Optimizer] = None, * See :ref:`manual optimization` for more examples. - .. 
tip:: - In manual mode, we still automatically clip grads if ``Trainer(gradient_clip_val=x)`` is set. - Example:: def training_step(...):
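
If gradient clipping should happen explicitly in manual optimization instead of relying on ``Trainer(gradient_clip_val=x)``, it can be applied between ``manual_backward`` and ``step``. A minimal sketch; ``compute_loss`` is the same placeholder used throughout these docs and the clipping threshold is arbitrary:

.. code-block:: python

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        loss = self.compute_loss(batch)

        opt.zero_grad()
        self.manual_backward(loss)
        # clip gradients by hand before stepping the optimizer
        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=0.5)
        opt.step()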