diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 2478f698e659b..6697181bb98f3 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""nn.Module with additional great features.""" +"""The LightningModule - an nn.Module with many additional features.""" import collections import copy @@ -93,16 +93,16 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._loaded_optimizer_states_dict = {} - #: Pointer to the trainer object + # pointer to the trainer object self.trainer = None self._distrib_type = None self._device_type = None - #: True if using amp + # true if using amp self.use_amp: bool = False - #: The precision used + # the precision used self.precision: int = 32 # optionally can be set by user @@ -117,6 +117,17 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._metric_attributes: Optional[Dict[int, str]] = None def optimizers(self, use_pl_optimizer: bool = True) -> Union[Optimizer, List[Optimizer], List[LightningOptimizer]]: + """ + Returns the optimizer(s) that are being used during training. Useful for manual optimization. + + Args: + use_pl_optimizer: If ``True``, will wrap the optimizer(s) in a + :class:`~pytorch_lightning.core.optimizer.LightningOptimizer` for automatic handling of precision and + profiling. + + Returns: + A single optimizer, or a list of optimizers in case multiple ones are present. + """ if use_pl_optimizer: opts = list(self.trainer.lightning_optimizers.values()) else: @@ -129,6 +140,13 @@ def optimizers(self, use_pl_optimizer: bool = True) -> Union[Optimizer, List[Opt return opts def lr_schedulers(self) -> Optional[Union[Any, List[Any]]]: + """ + Returns the learning rate scheduler(s) that are being used during training. Useful for manual optimization. + + Returns: + A single scheduler, or a list of schedulers in case multiple ones are present, or ``None`` if no + schedulers were returned in :meth:`configure_optimizers`. + """ if not self.trainer.lr_schedulers: return None @@ -144,32 +162,43 @@ def lr_schedulers(self) -> Optional[Union[Any, List[Any]]]: @property def example_input_array(self) -> Any: + """ + The example input array is a specification of what the module can consume in the :meth:`forward` method. + The return type is interpreted as follows: + + - Single tensor: It is assumed the model takes a single argument, i.e., + ``model.forward(model.example_input_array)`` + - Tuple: The input array should be interpreted as a sequence of positional arguments, i.e., + ``model.forward(*model.example_input_array)`` + - Dict: The input array represents named keyword arguments, i.e., + ``model.forward(**model.example_input_array)`` + """ return self._example_input_array + @example_input_array.setter + def example_input_array(self, example: Any) -> None: + self._example_input_array = example + @property def current_epoch(self) -> int: - """The current epoch""" + """The current epoch in the Trainer. If no Trainer is attached, this property is 0.""" return self.trainer.current_epoch if self.trainer else 0 @property def global_step(self) -> int: - """Total training batches seen across all epochs""" + """Total training batches seen across all epochs. 
If no Trainer is attached, this property is 0.""" return self.trainer.global_step if self.trainer else 0 @property def global_rank(self) -> int: - """ The index of the current process across all nodes and devices. """ + """The index of the current process across all nodes and devices.""" return self.trainer.global_rank if self.trainer else 0 @property def local_rank(self) -> int: - """ The index of the current process within a single node. """ + """The index of the current process within a single node.""" return self.trainer.local_rank if self.trainer else 0 - @example_input_array.setter - def example_input_array(self, example: Any) -> None: - self._example_input_array = example - @property def datamodule(self) -> Any: warning_cache.deprecation( @@ -204,7 +233,7 @@ def datamodule(self, datamodule: Any) -> None: @property def on_gpu(self): """ - True if your model is currently running on GPUs. + Returns ``True`` if this model is currently located on a GPU. Useful to set flags around the LightningModule for different CPU vs GPU behavior. """ return self.device.type == "cuda" @@ -212,7 +241,7 @@ def on_gpu(self): @property def automatic_optimization(self) -> bool: """ - If False you are responsible for calling .backward, .step, zero_grad. + If set to ``False`` you are responsible for calling ``.backward()``, ``.step()``, ``.zero_grad()``. """ return self._automatic_optimization @@ -223,8 +252,9 @@ def automatic_optimization(self, automatic_optimization: bool) -> None: @property def truncated_bptt_steps(self) -> int: """ - truncated_bptt_steps: Truncated back prop breaks performs backprop every k steps of much a longer sequence. - If this is > 0, the training step is passed ``hiddens``. + Enables `Truncated Backpropagation Through Time` in the Trainer when set to a positive integer. It represents + the number of times :meth:`training_step` gets called before backpropagation. If this is > 0, the + :meth:`training_step` receives an additional argument ``hiddens`` and is expected to return a hidden state. """ return self._truncated_bptt_steps @@ -298,13 +328,13 @@ def log( rank_zero_only: Optional[bool] = None, ) -> None: """ - Log a key, value + Log a key, value pair. Example:: self.log('train_loss', loss) - The default behavior per hook is as follows + The default behavior per hook is as follows: .. csv-table:: ``*`` also applies to the test loop :header: "LightningModule Hook", "on_step", "on_epoch", "prog_bar", "logger" @@ -448,7 +478,7 @@ def log_dict( add_dataloader_idx: bool = True, ) -> None: """ - Log a dictionary of values at once + Log a dictionary of values at once. Example:: @@ -592,16 +622,14 @@ def all_gather( sync_grads: bool = False, ): r""" - Allows users to call ``self.all_gather()`` from the LightningModule, thus making - the ```all_gather``` operation accelerator agnostic. - - ```all_gather``` is a function provided by accelerators to gather a tensor from several - distributed processes + Allows users to call ``self.all_gather()`` from the LightningModule, thus making the ``all_gather`` operation + accelerator agnostic. ``all_gather`` is a function provided by accelerators to gather a tensor from several + distributed processes. Args: - tensor: int, float, tensor of shape (batch, ...), or a (possibly nested) collection thereof. + data: int, float, tensor of shape (batch, ...), or a (possibly nested) collection thereof. group: the process group to gather results from. 
Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op + sync_grads: flag that allows users to synchronize gradients for the all_gather operation Return: A tensor of shape (world_size, batch, ...), or if the input was a collection @@ -756,7 +784,7 @@ def training_step_end(self, training_step_outputs): def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: """ Called at the end of the training epoch with the outputs of all training steps. - Use this in case you need to do something with all the outputs for every training_step. + Use this in case you need to do something with all the outputs returned by :meth:`training_step`. .. code-block:: python @@ -816,8 +844,6 @@ def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: (only if multiple val dataloaders used) Return: - Any of. - - Any object or value - ``None`` - Validation will skip to the next batch @@ -1237,9 +1263,9 @@ def configure_optimizers(self): - **Single optimizer**. - **List or Tuple** of optimizers. - **Two lists** - The first list has multiple optimizers, and the second has multiple LR schedulers - (or multiple ``lr_dict``). + (or multiple ``lr_dict``). - **Dictionary**, with an ``"optimizer"`` key, and (optionally) a ``"lr_scheduler"`` - key whose value is a single LR scheduler or ``lr_dict``. + key whose value is a single LR scheduler or ``lr_dict``. - **Tuple of dictionaries** as described above, with an optional ``"frequency"`` key. - **None** - Fit will run without any optimizer. @@ -1394,10 +1420,8 @@ def configure_optimizers(self): def manual_backward(self, loss: Tensor, optimizer: Optional[Optimizer] = None, *args, **kwargs) -> None: """ - Call this directly from your training_step when doing optimizations manually. - By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you. - - This function forwards all args to the .backward() call as well. + Call this directly from your :meth:`training_step` when doing optimizations manually. + By using this, Lightning can ensure that all the proper scaling gets applied when using mixed precision. See :ref:`manual optimization` for more examples. @@ -1410,6 +1434,12 @@ def training_step(...): # automatically applies scaling, etc... self.manual_backward(loss) opt.step() + + Args: + loss: The tensor on which to compute gradients. Must have a graph attached. + optimizer: This argument is unused and deprecated. It will be removed in v1.4. + *args: Additional positional arguments to be forwarded to :meth:`~torch.Tensor.backward` + **kwargs: Additional keyword arguments to be forwarded to :meth:`~torch.Tensor.backward` """ if optimizer is not None: rank_zero_deprecation( @@ -1426,22 +1456,19 @@ def training_step(...): def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ - Override backward with your own implementation if you need to. + Called to perform backward on the loss returned in :meth:`training_step`. + Override this hook with your own implementation if you need to. Args: - loss: Loss is already scaled by accumulated grads + loss: The loss tensor returned by :meth:`training_step`. If gradient accumulation is used, the loss here + holds the normalized value (scaled by 1 / accumulation steps). optimizer: Current optimizer being used optimizer_idx: Index of the current optimizer being used - Called to perform backward step. - Feel free to override as needed. 
- The loss passed in has already been scaled for accumulated gradients if requested. - Example:: def backward(self, loss, optimizer, optimizer_idx): loss.backward() - """ if self.automatic_optimization or self._running_manual_backward: loss.backward(*args, **kwargs) @@ -1450,18 +1477,16 @@ def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): """ Makes sure only the gradients of the current optimizer's parameters are calculated in the training step to prevent dangling gradients in multiple-optimizer setup. - - .. note:: Only called when using multiple optimizers - - Override for your own behavior - - It works with ``untoggle_optimizer`` to make sure param_requires_grad_state is properly reset. + It works with :meth:`untoggle_optimizer` to make sure ``param_requires_grad_state`` is properly reset. + Override for your own behavior. Args: - optimizer: Current optimizer used in training_loop - optimizer_idx: Current optimizer idx in training_loop - """ + optimizer: Current optimizer used in the training loop + optimizer_idx: Current optimizer idx in the training loop + Note: + Only called when using multiple optimizers + """ # Iterate over all optimizer parameters to preserve their `requires_grad` information # in case these are pre-defined during `configure_optimizers` param_requires_grad_state = {} @@ -1483,12 +1508,14 @@ def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): def untoggle_optimizer(self, optimizer_idx: int): """ - .. note:: Only called when using multiple optimizers - - Override for your own behavior + Resets the state of required gradients that were toggled with :meth:`toggle_optimizer`. + Override for your own behavior. Args: - optimizer_idx: Current optimizer idx in training_loop + optimizer_idx: Current optimizer idx in the training loop + + Note: + Only called when using multiple optimizers """ for opt_idx, opt in enumerate(self.optimizers(use_pl_optimizer=False)): if optimizer_idx != opt_idx: @@ -1520,8 +1547,7 @@ def optimizer_step( Warning: If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter to ``optimizer.step()`` function as shown in the examples. This ensures that - ``training_step()``, ``optimizer.zero_grad()``, ``backward()`` are called within - :meth:`~pytorch_lightning.loops.training_batch_loop.TrainingBatchLoop.advance`. + ``training_step()``, ``optimizer.zero_grad()``, ``backward()`` are called within the training loop. Args: epoch: Current epoch @@ -1662,6 +1688,21 @@ def tbptt_split_batch(self, batch, split_size): return splits def summarize(self, mode: Optional[str] = "top", max_depth: Optional[int] = None) -> Optional[ModelSummary]: + """ + Summarize this LightningModule. + + Args: + mode: Can be either ``'top'`` (summarize only direct submodules) or ``'full'`` (summarize all layers). + + .. deprecated:: v1.4 + This parameter was deprecated in v1.4 in favor of `max_depth` and will be removed in v1.6. + + max_depth: The maximum depth of layer nesting that the summary will include. A value of 0 turns the + layer summary off. Default: 1. + + Return: + The model summary object + """ model_summary = None # temporary mapping from mode to max_depth @@ -1801,16 +1842,16 @@ def save_hyperparameters( ignore: Optional[Union[Sequence[str], str]] = None, frame: Optional[types.FrameType] = None ) -> None: - """Save model arguments to ``hparams`` attribute. + """Save model arguments to the ``hparams`` attribute. 
Args: - args: single object of `dict`, `NameSpace` or `OmegaConf` - or string names or arguments from class ``__init__`` - ignore: an argument name or a list of argument names from - class ``__init__`` to be ignored - frame: a frame object. Default is None + args: single object of type :class:`dict`, :class:`~argparse.Namespace`, `OmegaConf` + or strings representing the argument names in ``__init__``. + ignore: an argument name or a list of argument names in ``__init__`` to be ignored + frame: a frame object. Default is ``None``. Example:: + >>> class ManuallyArgsModel(LightningModule): ... def __init__(self, arg1, arg2, arg3): ... super().__init__() @@ -1889,7 +1930,7 @@ def to_onnx( **kwargs, ): """ - Saves the model in ONNX format + Saves the model in ONNX format. Args: file_path: The path of the file the onnx model should be saved to. @@ -1941,8 +1982,8 @@ def to_torchscript( ) -> Union[ScriptModule, Dict[str, ScriptModule]]: """ By default compiles the whole model to a :class:`~torch.jit.ScriptModule`. - If you want to use tracing, please provided the argument `method='trace'` and make sure that either the - example_inputs argument is provided, or the model has self.example_input_array set. + If you want to use tracing, please provide the argument ``method='trace'`` and make sure that either the + `example_inputs` argument is provided, or the model has :attr:`example_input_array` set. If you would like to customize the modules that are scripted you should override this method. In case you want to return multiple modules, we recommend using a dictionary. @@ -1950,7 +1991,7 @@ def to_torchscript( file_path: Path where to save the torchscript. Default: None (no file saved). method: Whether to use TorchScript's script or trace method. Default: 'script' example_inputs: An input to be used to do tracing when method is set to 'trace'. - Default: None (Use self.example_input_array) + Default: None (uses :attr:`example_input_array`) **kwargs: Additional arguments that will be passed to the :func:`torch.jit.script` or :func:`torch.jit.trace` function. @@ -1980,7 +2021,7 @@ def to_torchscript( True Return: - This LightningModule as a torchscript, regardless of whether file_path is + This LightningModule as a torchscript, regardless of whether `file_path` is defined or not. """ mode = self.training @@ -2014,12 +2055,20 @@ def to_torchscript( @property def hparams(self) -> Union[AttributeDict, dict, Namespace]: + """ + The collection of hyperparameters saved with :meth:`save_hyperparameters`. It is mutable by the user. + For the frozen set of initial hyperparameters, use :attr:`hparams_initial`. + """ if not hasattr(self, "_hparams"): self._hparams = AttributeDict() return self._hparams @property def hparams_initial(self) -> AttributeDict: + """ + The collection of hyperparameters saved with :meth:`save_hyperparameters`. These contents are read-only. + Manual updates to the saved hyperparameters can instead be performed through :attr:`hparams`. + """ if not hasattr(self, "_hparams_initial"): return AttributeDict() # prevent any change @@ -2027,6 +2076,10 @@ def hparams_initial(self) -> AttributeDict: @property def model_size(self) -> float: + """ + The model's size in megabytes. The computation includes everything in the + :meth:`~torch.nn.Module.state_dict`, i.e., by default the parameters and buffers. 
+ """ # todo: think about better way without need to dump model to drive tmp_name = f"{uuid.uuid4().hex}.pt" torch.save(self.state_dict(), tmp_name) @@ -2035,8 +2088,8 @@ def model_size(self) -> float: return size_mb def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: - """Appends the :attr:`trainer.callback_metrics` dictionary to the given queue. - + """ + Appends the :attr:`trainer.callback_metrics` dictionary to the given queue. To avoid issues with memory sharing, we cast the data to numpy. Args: @@ -2048,8 +2101,8 @@ def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: queue.put(callback_metrics) def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: - """Retrieve the :attr:`trainer.callback_metrics` dictionary from the given queue. - + """ + Retrieve the :attr:`trainer.callback_metrics` dictionary from the given queue. To preserve consistency, we cast back the data to ``torch.Tensor``. Args: