Commit 4928dc5

Improve SWA docs (#8717)
1 parent 299e289 commit 4928dc5

File tree

4 files changed: +21 -15 lines changed


docs/source/advanced/training_tricks.rst

Lines changed: 8 additions & 4 deletions
@@ -1,6 +1,7 @@
 .. testsetup:: *
 
-    from pytorch_lightning.trainer.trainer import Trainer
+    from pytorch_lightning import Trainer
+    from pytorch_lightning.callbacks import StochasticWeightAveraging
 
 .. _training_tricks:
 
@@ -57,15 +58,18 @@ This can be used with both non-trained and trained models. The SWA procedure smo
 it harder to end up in a local minimum during optimization.
 
 For a more detailed explanation of SWA and how it works,
-read `this <https://pytorch.org/blog/pytorch-1.6-now-includes-stochastic-weight-averaging>`__ post by the PyTorch team.
+read `this post <https://pytorch.org/blog/pytorch-1.6-now-includes-stochastic-weight-averaging>`__ by the PyTorch team.
 
-.. seealso:: :class:`~pytorch_lightning.callbacks.StochasticWeightAveraging` (Callback)
+.. seealso:: The :class:`~pytorch_lightning.callbacks.StochasticWeightAveraging` callback
 
 .. testcode::
 
-    # Enable Stochastic Weight Averaging
+    # Enable Stochastic Weight Averaging - uses the class defaults
     trainer = Trainer(stochastic_weight_avg=True)
 
+    # alternatively, if you need to pass custom arguments
+    trainer = Trainer(callbacks=[StochasticWeightAveraging(...)])
+
 ----------
 
 Auto scaling of batch size
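For context, the two ways of enabling SWA that the updated docs describe amount to the following in user code; a minimal sketch assuming the 1.4-era API, with ``max_epochs`` and the callback arguments chosen purely for illustration:

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import StochasticWeightAveraging

    # the Trainer flag constructs the callback with its class defaults
    trainer = Trainer(stochastic_weight_avg=True, max_epochs=20)

    # instantiating the callback yourself lets you pass custom arguments,
    # e.g. start averaging at 75% of training with a fixed SWA learning rate
    swa = StochasticWeightAveraging(swa_epoch_start=0.75, swa_lrs=0.05)
    trainer = Trainer(callbacks=[swa], max_epochs=20)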

pytorch_lightning/callbacks/stochastic_weight_avg.py

Lines changed: 11 additions & 9 deletions
@@ -16,7 +16,7 @@
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 """
 from copy import deepcopy
-from typing import Callable, Optional, Union
+from typing import Callable, List, Optional, Union
 
 import torch
 from torch import nn
@@ -35,7 +35,7 @@ class StochasticWeightAveraging(Callback):
     def __init__(
         self,
         swa_epoch_start: Union[int, float] = 0.8,
-        swa_lrs: Optional[Union[float, list]] = None,
+        swa_lrs: Optional[Union[float, List[float]]] = None,
         annealing_epochs: int = 10,
         annealing_strategy: str = "cos",
         avg_fn: Optional[_AVG_FN] = None,
@@ -62,19 +62,19 @@ def __init__(
 
         .. warning:: ``StochasticWeightAveraging`` is currently only supported on every epoch.
 
-        SWA can easily be activated directly from the Trainer as follow:
-
-        .. code-block:: python
-
-            Trainer(stochastic_weight_avg=True)
+        See also how to :ref:`enable it directly on the Trainer <advanced/training_tricks:Stochastic Weight Averaging>`
 
         Arguments:
 
             swa_epoch_start: If provided as int, the procedure will start from
                 the ``swa_epoch_start``-th epoch. If provided as float between 0 and 1,
                 the procedure will start from ``int(swa_epoch_start * max_epochs)`` epoch
 
-            swa_lrs: the learning rate value for all param groups together or separately for each group.
+            swa_lrs: The SWA learning rate to use:
+
+                - ``None``. Use the current learning rate of the optimizer at the time the SWA procedure starts.
+                - ``float``. Use this value for all parameter groups of the optimizer.
+                - ``List[float]``. A list values for each parameter group of the optimizer.
 
            annealing_epochs: number of epochs in the annealing phase (default: 10)
@@ -105,7 +105,9 @@ def __init__(
         wrong_float = isinstance(swa_lrs, float) and swa_lrs <= 0
         wrong_list = isinstance(swa_lrs, list) and not all(lr > 0 and isinstance(lr, float) for lr in swa_lrs)
         if swa_lrs is not None and (wrong_type or wrong_float or wrong_list):
-            raise MisconfigurationException("The `swa_lrs` should be a positive float or a list of positive float.")
+            raise MisconfigurationException(
+                "The `swa_lrs` should be `None`, a positive float, or a list of positive floats"
+            )
 
         if avg_fn is not None and not isinstance(avg_fn, Callable):
             raise MisconfigurationException("The `avg_fn` should be callable.")

pytorch_lightning/utilities/distributed.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 import os
 from functools import wraps
 from platform import python_version
-from typing import Any, Callable, List, Optional, Tuple, Type, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 
 import torch
 from torch.nn.parallel.distributed import DistributedDataParallel

tests/callbacks/test_stochastic_weight_avg.py

Lines changed: 1 addition & 1 deletion
@@ -185,7 +185,7 @@ def test_swa_raises():
         StochasticWeightAveraging(swa_epoch_start=1.5, swa_lrs=0.1)
     with pytest.raises(MisconfigurationException, match=">0 integer or a float between 0 and 1"):
         StochasticWeightAveraging(swa_epoch_start=-1, swa_lrs=0.1)
-    with pytest.raises(MisconfigurationException, match="positive float or a list of positive float"):
+    with pytest.raises(MisconfigurationException, match="positive float, or a list of positive floats"):
         StochasticWeightAveraging(swa_epoch_start=5, swa_lrs=[0.2, 1])
 
 
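Note on the updated test: ``swa_lrs=[0.2, 1]`` trips this message because the ``wrong_list`` check requires every entry to be a positive ``float``, and ``1`` is an ``int`` (``isinstance(1, float)`` is ``False``); a list such as ``[0.2, 1.0]`` would pass that particular check.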