
Commit 6ad1b4e

Merge branch 'master' into feature/5311-flatten-dict
2 parents 4fb5040 + 062800a

File tree: 9 files changed (+168, -50 lines)


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added a check for optimizer attached to lr_scheduler ([#5338](https://github.com/PyTorchLightning/pytorch-lightning/pull/5338))
+
 - Added `resume_from_checkpoint` accept non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402))
 
 
@@ -23,6 +25,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/PyTorchLightning/pytorch-lightning/pull/5333))
+
 - Disabled checkpointing, earlystopping and logger with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277))
 

pytorch_lightning/callbacks/lr_monitor.py

Lines changed: 38 additions & 26 deletions
@@ -33,11 +33,11 @@ class LearningRateMonitor(Callback):
     Automatically monitor and logs learning rate for learning rate schedulers during training.
 
     Args:
-        logging_interval: set to `epoch` or `step` to log `lr` of all optimizers
-            at the same interval, set to `None` to log at individual interval
-            according to the `interval` key of each scheduler. Defaults to ``None``.
+        logging_interval: set to ``'epoch'`` or ``'step'`` to log ``lr`` of all optimizers
+            at the same interval, set to ``None`` to log at individual interval
+            according to the ``interval`` key of each scheduler. Defaults to ``None``.
         log_momentum: option to also log the momentum values of the optimizer, if the optimizer
-            has the `momentum` attribute. Defaults to ``False``.
+            has the ``momentum`` or ``betas`` attribute. Defaults to ``False``.
 
     Example::
 
@@ -47,17 +47,19 @@ class LearningRateMonitor(Callback):
     >>> trainer = Trainer(callbacks=[lr_monitor])
 
     Logging names are automatically determined based on optimizer class name.
-    In case of multiple optimizers of same type, they will be named `Adam`,
-    `Adam-1` etc. If a optimizer has multiple parameter groups they will
-    be named `Adam/pg1`, `Adam/pg2` etc. To control naming, pass in a
-    `name` keyword in the construction of the learning rate schdulers
+    In case of multiple optimizers of same type, they will be named ``Adam``,
+    ``Adam-1`` etc. If a optimizer has multiple parameter groups they will
+    be named ``Adam/pg1``, ``Adam/pg2`` etc. To control naming, pass in a
+    ``name`` keyword in the construction of the learning rate schdulers
 
     Example::
 
         def configure_optimizer(self):
            optimizer = torch.optim.Adam(...)
-           lr_scheduler = {'scheduler': torch.optim.lr_scheduler.LambdaLR(optimizer, ...)
-                           'name': 'my_logging_name'}
+           lr_scheduler = {
+               'scheduler': torch.optim.lr_scheduler.LambdaLR(optimizer, ...)
+               'name': 'my_logging_name'
+           }
            return [optimizer], [lr_scheduler]
 
     """
@@ -80,16 +82,28 @@ def on_train_start(self, trainer, *args, **kwargs):
         """
         if not trainer.logger:
             raise MisconfigurationException(
-                'Cannot use LearningRateMonitor callback with Trainer that has no logger.'
+                'Cannot use `LearningRateMonitor` callback with `Trainer` that has no logger.'
             )
 
         if not trainer.lr_schedulers:
             rank_zero_warn(
-                'You are using LearningRateMonitor callback with models that'
+                'You are using `LearningRateMonitor` callback with models that'
                 ' have no learning rate schedulers. Please see documentation'
                 ' for `configure_optimizers` method.', RuntimeWarning
             )
 
+        if self.log_momentum:
+            def _check_no_key(key):
+                return any(
+                    key not in sch['scheduler'].optimizer.defaults for sch in trainer.lr_schedulers
+                )
+
+            if _check_no_key('momentum') and _check_no_key('betas'):
+                rank_zero_warn(
+                    "You have set log_momentum=True, but some optimizers do not"
+                    " have momentum. This will log a value 0 for the momentum.", RuntimeWarning
+                )
+
         # Find names for schedulers
         names = self._find_names(trainer.lr_schedulers)
 
@@ -121,19 +135,17 @@ def _extract_stats(self, trainer, interval: str) -> Dict[str, float]:
 
         for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers):
             if scheduler['interval'] == interval or interval == 'any':
-                param_groups = scheduler['scheduler'].optimizer.param_groups
-                if len(param_groups) != 1:
-                    for i, pg in enumerate(param_groups):
-                        lr = self._extract_lr(param_group=pg, name=f'{name}/pg{i + 1}')
-                        latest_stat.update(lr)
-                        momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum/pg{i + 1}')
-                        latest_stat.update(momentum)
-
-                else:
-                    pg = param_groups[0]
-                    lr = self._extract_lr(param_group=pg, name=name)
+                opt = scheduler['scheduler'].optimizer
+                param_groups = opt.param_groups
+                use_betas = 'betas' in opt.defaults
+
+                for i, pg in enumerate(param_groups):
+                    suffix = f'/pg{i + 1}' if len(param_groups) > 1 else ''
+                    lr = self._extract_lr(param_group=pg, name=f'{name}{suffix}')
                     latest_stat.update(lr)
-                    momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum')
+                    momentum = self._extract_momentum(
+                        param_group=pg, name=f'{name}-momentum{suffix}', use_betas=use_betas
+                    )
                     latest_stat.update(momentum)
 
         return latest_stat
@@ -143,11 +155,11 @@ def _extract_lr(self, param_group, name: str) -> Dict[str, float]:
         self.lrs[name].append(lr)
         return {name: lr}
 
-    def _extract_momentum(self, param_group, name: str) -> Dict[str, float]:
+    def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str, float]:
        if not self.log_momentum:
            return {}
 
-        momentum = param_group.get('momentum')
+        momentum = param_group.get('betas')[0] if use_betas else param_group.get('momentum', 0)
        self.last_momentum_values[name] = momentum
        return {name: momentum}
 
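The substantive change in this file is that momentum can now be read from adaptive optimizers, which expose a `betas` tuple instead of a `momentum` key. A minimal standalone sketch of that lookup (not the library code itself, just the same `defaults`/`param_groups` logic on plain torch objects):

import torch
from torch import nn, optim

params = [nn.Parameter(torch.zeros(1))]
adam = optim.Adam(params, lr=1e-2, betas=(0.9, 0.999))
sgd = optim.SGD(params, lr=1e-2, momentum=0.8)

def extract_momentum(optimizer, param_group):
    # Adaptive optimizers (Adam, AdamW, ...) carry 'betas' in their defaults,
    # so beta1 stands in for momentum; plain SGD carries 'momentum';
    # anything with neither falls back to 0.
    use_betas = 'betas' in optimizer.defaults
    return param_group.get('betas')[0] if use_betas else param_group.get('momentum', 0)

print(extract_momentum(adam, adam.param_groups[0]))  # 0.9 (beta1)
print(extract_momentum(sgd, sgd.param_groups[0]))    # 0.8

With this in place, `LearningRateMonitor(log_momentum=True)` logs keys such as `lr-Adam-momentum`, as exercised by the tests further down.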

pytorch_lightning/core/lightning.py

Lines changed: 14 additions & 6 deletions
@@ -14,15 +14,15 @@
 
 """nn.Module with additional great features."""
 
-from abc import ABC
-from argparse import Namespace
 import collections
 import copy
 import inspect
 import os
-from pathlib import Path
 import re
 import tempfile
+from abc import ABC
+from argparse import Namespace
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
 
 import torch
@@ -1327,9 +1327,17 @@ def tbptt_split_batch(self, batch, split_size):
 
         return splits
 
-    def summarize(self, mode: str = ModelSummary.MODE_DEFAULT) -> ModelSummary:
-        model_summary = ModelSummary(self, mode=mode)
-        log.info("\n" + str(model_summary))
+    def summarize(self, mode: Optional[str] = ModelSummary.MODE_DEFAULT) -> Optional[ModelSummary]:
+        model_summary = None
+
+        if mode in ModelSummary.MODES:
+            model_summary = ModelSummary(self, mode=mode)
+            log.info("\n" + str(model_summary))
+        elif mode is not None:
+            raise MisconfigurationException(
+                f"`mode` can be None, {', '.join(ModelSummary.MODES)}, got {mode}"
+            )
+
         return model_summary
 
     def freeze(self) -> None:
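For context, the reworked `summarize` treats `None` as "skip the summary" and raises for any string outside `ModelSummary.MODES`. A rough usage sketch of the post-commit behavior, built around a throwaway `TinyModel` (hypothetical, not part of the diff):

from torch import nn
from pytorch_lightning import LightningModule
from pytorch_lightning.core.memory import ModelSummary
from pytorch_lightning.utilities.exceptions import MisconfigurationException

class TinyModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

model = TinyModel()

summary = model.summarize(mode='top')      # logs the summary and returns a ModelSummary
assert isinstance(summary, ModelSummary)

assert model.summarize(mode=None) is None  # summary silently skipped, nothing logged

try:
    model.summarize(mode='typo')           # any other value is rejected
except MisconfigurationException as err:
    print(err)  # "`mode` can be None, <modes>, got typo"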

pytorch_lightning/trainer/optimizers.py

Lines changed: 9 additions & 0 deletions
@@ -75,7 +75,9 @@ def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]:
                 ' * {"optimizer": `torch.optim.Optimizer`, (optional) "lr_scheduler": `torch.optim.lr_scheduler`}\n'
                 ' * A list of the previously described dict format, with an optional "frequency" key (int)'
             )
+
         lr_schedulers = self.configure_schedulers(lr_schedulers, monitor=monitor)
+        _validate_scheduler_optimizer(optimizers, lr_schedulers)
 
         return optimizers, lr_schedulers, optimizer_frequencies
 
@@ -183,3 +185,10 @@ def zero_grad(self):
 
     def __repr__(self):
         return 'No Optimizer'
+
+
+def _validate_scheduler_optimizer(optimizers, lr_schedulers):
+    if any(sch['scheduler'].optimizer not in optimizers for sch in lr_schedulers):
+        raise MisconfigurationException(
+            "Some schedulers are attatched with an optimizer that wasn't returned from `configure_optimizers`."
+        )
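The new `_validate_scheduler_optimizer` guard means every scheduler must wrap one of the optimizers actually returned from `configure_optimizers`. A hedged sketch of a configuration this check is meant to reject (the model and names are illustrative, not taken from the diff):

from torch import nn, optim
from pytorch_lightning import LightningModule

class MismatchedSchedulerModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def configure_optimizers(self):
        used = optim.SGD(self.parameters(), lr=1e-2)
        forgotten = optim.Adam(self.parameters(), lr=1e-3)
        # The scheduler wraps `forgotten`, which is never returned below, so
        # init_optimizers should now raise a MisconfigurationException at fit time.
        scheduler = optim.lr_scheduler.StepLR(forgotten, step_size=1)
        return [used], [scheduler]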

pytorch_lightning/trainer/trainer.py

Lines changed: 2 additions & 2 deletions
@@ -311,7 +311,6 @@ def __init__(
         self.plugin_connector = PluginConnector(self)
 
         # training state
-        self.weights_summary = weights_summary
         self.model = None
         self.shown_warnings = set()
 
@@ -374,7 +373,8 @@ def __init__(
             max_steps,
             min_steps,
             num_sanity_val_steps,
-            automatic_optimization
+            automatic_optimization,
+            weights_summary,
         )
         self.evaluation_loop.on_trainer_init()
 

pytorch_lightning/trainer/training_loop.py

Lines changed: 16 additions & 6 deletions
@@ -49,7 +49,14 @@ def __init__(self, trainer):
         self._cur_grad_norm_dict = None
 
     def on_trainer_init(
-        self, max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps, automatic_optimization
+        self,
+        max_epochs,
+        min_epochs,
+        max_steps,
+        min_steps,
+        num_sanity_val_steps,
+        automatic_optimization,
+        weights_summary,
     ):
         self.trainer.global_step = 0
         self.trainer.current_epoch = 0
@@ -73,6 +80,12 @@ def on_trainer_init(
         else:
             self.trainer.num_sanity_val_steps = num_sanity_val_steps
 
+        self.trainer.weights_summary = weights_summary
+        if weights_summary is not None and weights_summary not in ModelSummary.MODES:
+            raise MisconfigurationException(
+                f"`weights_summary` can be None, {', '.join(ModelSummary.MODES)}, got {weights_summary}"
+            )
+
     @property
     def num_optimizers(self):
         num_optimizers = len(self.get_optimizers_iterable())
@@ -161,11 +174,8 @@ def setup_training(self, model: LightningModule):
         ref_model.on_pretrain_routine_start()
 
         # print model summary
-        if self.trainer.is_global_zero and self.trainer.weights_summary is not None and not self.trainer.testing:
-            if self.trainer.weights_summary in ModelSummary.MODES:
-                ref_model.summarize(mode=self.trainer.weights_summary)
-            else:
-                raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES))
+        if self.trainer.is_global_zero and not self.trainer.testing:
+            ref_model.summarize(mode=self.trainer.weights_summary)
 
         # track model now.
         # if cluster resets state, the model will update with the saved weights
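Net effect of the `trainer.py` and `training_loop.py` changes: `weights_summary` is validated once in `on_trainer_init`, so a bad value fails at `Trainer` construction instead of partway into `setup_training`. A small sketch of the expected behavior after this commit (assuming the release's default `ModelSummary.MODES` and a standard local environment):

import pytest
from pytorch_lightning import Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException

Trainer(weights_summary='top')   # fine: one of ModelSummary.MODES
Trainer(weights_summary=None)    # fine: the summary is simply skipped

# Invalid values are now rejected before fit() is ever called.
with pytest.raises(MisconfigurationException, match='`weights_summary` can be None'):
    Trainer(weights_summary='typo')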

tests/callbacks/test_lr_monitor.py

Lines changed: 57 additions & 9 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pytest
+from torch import optim
 
 import tests.base.develop_utils as tutils
 from pytorch_lightning import Trainer
@@ -47,19 +48,34 @@ def test_lr_monitor_single_lr(tmpdir):
         'Names of learning rates not set correctly'
 
 
-def test_lr_monitor_single_lr_with_momentum(tmpdir):
-    """ Test that learning rates and momentum are extracted and logged for single lr scheduler. """
-    tutils.reset_seed()
+@pytest.mark.parametrize('opt', ['SGD', 'Adam'])
+def test_lr_monitor_single_lr_with_momentum(tmpdir, opt):
+    """
+    Test that learning rates and momentum are extracted and logged for single lr scheduler.
+    """
+    class LogMomentumModel(BoringModel):
+        def __init__(self, opt):
+            super().__init__()
+            self.opt = opt
 
-    model = EvalModelTemplate()
-    model.configure_optimizers = model.configure_optimizers__onecycle_scheduler
+        def configure_optimizers(self):
+            if self.opt == 'SGD':
+                opt_kwargs = {'momentum': 0.9}
+            elif self.opt == 'Adam':
+                opt_kwargs = {'betas': (0.9, 0.999)}
 
+            optimizer = getattr(optim, self.opt)(self.parameters(), lr=1e-2, **opt_kwargs)
+            lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-2, total_steps=10_000)
+            return [optimizer], [lr_scheduler]
+
+    model = LogMomentumModel(opt=opt)
     lr_monitor = LearningRateMonitor(log_momentum=True)
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=2,
-        limit_val_batches=0.1,
-        limit_train_batches=0.5,
+        limit_val_batches=2,
+        limit_train_batches=5,
+        log_every_n_steps=1,
         callbacks=[lr_monitor],
     )
     result = trainer.fit(model)
@@ -69,7 +85,39 @@ def test_lr_monitor_single_lr_with_momentum(tmpdir):
         'Expected momentum to be logged'
     assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \
         'Number of momentum values logged does not match number of lr schedulers'
-    assert all([k in ['lr-SGD-momentum'] for k in lr_monitor.last_momentum_values.keys()]), \
+    assert all(k == f'lr-{opt}-momentum' for k in lr_monitor.last_momentum_values.keys()), \
+        'Names of momentum values not set correctly'
+
+
+def test_log_momentum_no_momentum_optimizer(tmpdir):
+    """
+    Test that if optimizer doesn't have momentum then a warning is raised with log_momentum=True.
+    """
+    class LogMomentumModel(BoringModel):
+        def configure_optimizers(self):
+            optimizer = optim.ASGD(self.parameters(), lr=1e-2)
+            lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1)
+            return [optimizer], [lr_scheduler]
+
+    model = LogMomentumModel()
+    lr_monitor = LearningRateMonitor(log_momentum=True)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_val_batches=2,
+        limit_train_batches=5,
+        log_every_n_steps=1,
+        callbacks=[lr_monitor],
+    )
+    with pytest.warns(RuntimeWarning, match="optimizers do not have momentum."):
+        result = trainer.fit(model)
+        assert result
+
+    assert all(v == 0 for v in lr_monitor.last_momentum_values.values()), \
+        'Expected momentum to be logged'
+    assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \
+        'Number of momentum values logged does not match number of lr schedulers'
+    assert all(k == 'lr-ASGD-momentum' for k in lr_monitor.last_momentum_values.keys()), \
         'Names of momentum values not set correctly'
 
 
@@ -105,7 +153,7 @@ def test_lr_monitor_no_logger(tmpdir):
         logger=False
     )
 
-    with pytest.raises(MisconfigurationException, match='Trainer that has no logger'):
+    with pytest.raises(MisconfigurationException, match='`Trainer` that has no logger'):
         trainer.fit(model)
 

tests/core/test_memory.py

Lines changed: 11 additions & 1 deletion
@@ -15,8 +15,9 @@
 import torch
 import torch.nn as nn
 
-from pytorch_lightning import LightningModule
+from pytorch_lightning import LightningModule, Trainer
 from pytorch_lightning.core.memory import UNKNOWN_SIZE, ModelSummary
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base.models import ParityModuleRNN
 
 
@@ -68,6 +69,15 @@ def forward(self, x):
         return self.reduce(self.embed(x))
 
 
+def test_invalid_weights_summmary():
+    """ Test that invalid value for weights_summary raises an error. """
+    with pytest.raises(MisconfigurationException, match='`mode` can be None, .* got temp'):
+        UnorderedModel().summarize(mode='temp')
+
+    with pytest.raises(MisconfigurationException, match='`weights_summary` can be None, .* got temp'):
+        Trainer(weights_summary='temp')
+
+
 @pytest.mark.parametrize(['mode'], [
     pytest.param(ModelSummary.MODE_FULL),
     pytest.param(ModelSummary.MODE_TOP),
