From b7e7c6f405c015a964ed2a3600622a37ee83d11b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 5 Feb 2021 15:10:28 +0000 Subject: [PATCH 1/7] resolve manual_backward --- pytorch_lightning/accelerators/accelerator.py | 9 ++-- pytorch_lightning/core/optimizer.py | 4 +- pytorch_lightning/overrides/base.py | 7 ++- .../plugins/training_type/ddp.py | 23 +++++++++- .../plugins/training_type/horovod.py | 5 ++- .../plugins/training_type/parallel.py | 5 ++- .../training_type/training_type_plugin.py | 11 ++++- pytorch_lightning/trainer/training_loop.py | 12 +++-- pytorch_lightning/utilities/__init__.py | 1 + pytorch_lightning/utilities/imports.py | 2 + tests/core/test_lightning_optimizer.py | 2 +- .../optimization/test_manual_optimization.py | 44 +++++++++++-------- 12 files changed, 91 insertions(+), 34 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 5588828853746..7377b89d7b5c4 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -145,6 +145,9 @@ def training_step(self, args): with self.training_type_plugin.train_step_context(): return self.training_type_plugin.training_step(*args) + def post_training_step(self): + self.training_type_plugin.post_training_step() + def validation_step(self, args): """The actual validation step. @@ -251,13 +254,13 @@ def backward( opt_idx: the index of the optimizer should_accumulate: whether to accumulate gradients """ + self.training_type_plugin.pre_backward(closure_loss, optimizer, opt_idx) + output = self.precision_plugin.backward( self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs ) - # TODO: this is a hack, find a better solution for this (hook?) - if isinstance(self.training_type_plugin, HorovodPlugin): - optimizer.synchronize() + self.training_type_plugin.post_backward(closure_loss, optimizer, opt_idx) return output diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index e5c91354dda1a..201659e08d3ae 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -62,6 +62,7 @@ def __init__(self, self._trainer = None self._accumulate_grad_batches = accumulate_grad_batches self._optimizer_idx = None + self._total_optimizer_step_calls = 0 @property def optimizer(self): @@ -265,10 +266,11 @@ def dis_closure(): if make_optimizer_step: self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs) + self._total_optimizer_step_calls += 1 else: # make sure to call optimizer_closure when accumulating with self._trainer.profiler.profile(f"closure_{self._optimizer_idx}"): - with self._trainer.train_loop.block_ddp_sync_behaviour(): + with self._trainer.train_loop.block_ddp_sync_behaviour(True): closure() def __repr__(self): diff --git a/pytorch_lightning/overrides/base.py b/pytorch_lightning/overrides/base.py index 3dd20f6d4303b..65a21c91e6052 100644 --- a/pytorch_lightning/overrides/base.py +++ b/pytorch_lightning/overrides/base.py @@ -46,6 +46,12 @@ def forward(self, *inputs, **kwargs): if running_stage == RunningStage.TRAINING: output = self.module.training_step(*inputs, **kwargs) + # In manual_optimization, we need to prevent DDP reducer as + # it is done manually in ``LightningModule.manual_backward``. 
+ # `require_backward_grad_sync` will be reset + # ddp_plugin ``post_training_step`` hook + if not self.module.automatic_optimization: + self.module.trainer.model.require_backward_grad_sync = False warn_if_output_is_none(output, "training_step") elif running_stage == RunningStage.TESTING: output = self.module.test_step(*inputs, **kwargs) @@ -55,7 +61,6 @@ def forward(self, *inputs, **kwargs): warn_if_output_is_none(output, "validation_step") else: output = self.module.predict(*inputs, **kwargs) - return output diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index a8ad0708db9bf..45768894cf46a 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from pytorch_lightning.overrides.distributed import prepare_for_backward import subprocess import sys from time import sleep @@ -21,12 +22,14 @@ import torch import torch.distributed as torch_distrib from torch.nn.parallel.distributed import DistributedDataParallel - +from torch.optim import Optimizer from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.utilities import _PYTORCH_GREATER_EQUAL_THAN_1_7_0 +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import ( find_free_network_port, @@ -178,6 +181,15 @@ def set_world_ranks(self): self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): + + # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()``` breaking manual_optimization + if _PYTORCH_GREATER_EQUAL_THAN_1_7_0 and not self.lightning_module.automatic_optimization: + rank_zero_warn( + "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " + "to properly work with DDP." 
+ ) + self._ddp_kwargs["find_unused_parameters"] = True + self._model = DistributedDataParallel( LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), @@ -253,6 +265,11 @@ def barrier(self, *args, **kwargs): def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) + def pre_backward(self, closure_loss: torch.Tensor, optimizer: Optimizer, opt_idx: int): + """Run before precision plugin executes backward""" + if not self.lightning_module.automatic_optimization and self.model.require_backward_grad_sync: + prepare_for_backward(self.model, closure_loss) + def model_to_device(self): if self.root_device.type == "cuda": torch.cuda.set_device(self.root_device) @@ -274,3 +291,7 @@ def test_step(self, *args, **kwargs): def predict(self, *args, **kwargs): return self.model(*args, **kwargs) + + def post_training_step(self): + if not self.lightning_module.automatic_optimization: + self.model.require_backward_grad_sync = True diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 335f65b3e3fbb..3deff8befde26 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -15,7 +15,7 @@ from typing import Any, List, Optional, Union import torch -from torch.optim.lr_scheduler import _LRScheduler +from torch.optim.lr_scheduler import _LRScheduler, Optimizer from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -116,6 +116,9 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = hvd.broadcast_object(obj, src) return obj + def post_backward(self, closure_loss: torch.Tensor, optimizer: Optimizer, opt_idx: int): + optimizer.synchronize() + def model_to_device(self): if self.on_gpu: torch.cuda.set_device(self.root_device) diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 758e1a2e77d05..6c7ccd6f2e0aa 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -100,8 +100,9 @@ def block_backward_sync(self): This is useful for skipping sync when accumulating gradients, reducing communication overhead Returns: context manager with sync behaviour off """ - if isinstance(self.model, (LightningDistributedDataParallel, DistributedDataParallel)): - yield self.model.no_sync() + if isinstance(self.model, DistributedDataParallel): + with self.model.no_sync(): + yield None else: yield None diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 4c6a61f7daca0..738bcc9347d94 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -16,7 +16,7 @@ from typing import Any, Optional, Sequence, TYPE_CHECKING, Union import torch - +from torch.optim import Optimizer from pytorch_lightning import _logger as log from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import unwrap_lightning_module @@ -69,6 +69,12 @@ def reduce_early_stopping_decision(self, should_stop: bool) -> bool: """Reduce the early stopping decision across all possibly spawned processes""" return should_stop + def pre_backward(self, closure_loss: torch.Tensor, optimizer: Optimizer, opt_idx: int): + """Run before 
precision plugin executes backward""" + + def post_backward(self, closure_loss: torch.Tensor, optimizer: Optimizer, opt_idx: int): + """Run after precision plugin executes backward""" + @property def model(self) -> torch.nn.Module: """Returns the potentially wrapped LightningModule""" @@ -107,6 +113,9 @@ def start_testing(self, trainer: 'Trainer') -> None: def training_step(self, *args, **kwargs): return self.lightning_module.training_step(*args, **kwargs) + def post_training_step(self): + pass + def validation_step(self, *args, **kwargs): return self.lightning_module.validation_step(*args, **kwargs) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index bc1b5a51b25f6..51472a70588f1 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -288,6 +288,8 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): model_ref._results = Result() with self.trainer.profiler.profile("training_step"): training_step_output = self.trainer.accelerator_backend.training_step(args) + self.trainer.accelerator_backend.post_training_step() + self.trainer.logger_connector.cache_logged_metrics() self._check_training_step_output(training_step_output) @@ -695,7 +697,7 @@ def train_step_and_backward_closure(): return result @contextmanager - def block_ddp_sync_behaviour(self): + def block_ddp_sync_behaviour(self, should_block_sync: bool = False): """ automatic_optimization = True Blocks ddp sync gradients behaviour on backwards pass. @@ -709,8 +711,9 @@ def block_ddp_sync_behaviour(self): context manager with sync behaviour off """ - if isinstance(self.trainer.training_type_plugin, ParallelPlugin) and self.automatic_optimization: - yield self.trainer.training_type_plugin.block_backward_sync() + if isinstance(self.trainer.training_type_plugin, ParallelPlugin) and (self.automatic_optimization or should_block_sync): + with self.trainer.training_type_plugin.block_backward_sync(): + yield None else: yield None @@ -751,7 +754,8 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, self._curr_step_result = result if result is None: - self.warning_cache.warn("training_step returned None if it was on purpose, ignore this warning...") + if self.automatic_optimization: + self.warning_cache.warn("training_step returned None if it was on purpose, ignore this warning...") return None if self.trainer.train_loop.automatic_optimization: diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index a8f3e134936ff..5526f588d1c89 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -39,6 +39,7 @@ _RPC_AVAILABLE, _TORCHTEXT_AVAILABLE, _XLA_AVAILABLE, + _PYTORCH_GREATER_EQUAL_THAN_1_7_0 ) from pytorch_lightning.utilities.parsing import AttributeDict, flatten_dict, is_picklable # noqa: F401 from pytorch_lightning.utilities.xla_device import XLADeviceUtils # noqa: F401 diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index c0a4d15411dc4..33c44eee67719 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -55,3 +55,5 @@ def _module_available(module_path: str) -> bool: _FAIRSCALE_PIPE_AVAILABLE = _FAIRSCALE_AVAILABLE and LooseVersion(torch.__version__) >= LooseVersion("1.6.0") _BOLTS_AVAILABLE = _module_available('pl_bolts') _PYTORCH_PRUNE_AVAILABLE = _module_available('torch.nn.utils.prune') 
+_PYTORCH_GREATER_EQUAL_THAN_1_7_0 = LooseVersion(torch.__version__) >= LooseVersion("1.7.0") + diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index c6f476764ba42..8043f9acac081 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -205,7 +205,7 @@ def test_state(tmpdir): lightning_dict = {} special_attrs = ["_accumulate_grad_batches", "_optimizer", "_optimizer_idx", "_support_closure", "_trainer", "__getstate__", "__setstate__", "state_dict", "load_state_dict", - "zero_grad", "__setstate__", "add_param_group"] + "zero_grad", "__setstate__", "add_param_group", "_total_optimizer_step_calls"] for k, v in lightning_optimizer.__dict__.items(): if k not in special_attrs: diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 1a7d99564bab1..bf7f660f030b5 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -15,6 +15,7 @@ import os from unittest import mock from unittest.mock import ANY, call, patch +from copy import deepcopy import pytest import torch @@ -935,12 +936,10 @@ def configure_optimizers(self): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@patch("torch.optim.Adam.step") -@patch("torch.optim.SGD.step") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest") -def test_step_with_optimizer_closure_with_different_frequencies_ddp(mock_sgd_step, mock_adam_step, tmpdir): +def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir): """ Tests that `step` works with optimizer_closure and different accumulated_gradient frequency """ @@ -973,6 +972,9 @@ def training_step(self, batch, batch_idx, optimizer_idx): world_size = torch_distrib.get_world_size(torch_distrib.group.WORLD) assert world_size == 2 + make_gen_optimizer_step = batch_idx % 2 == 1 + make_dis_optimizer_step = batch_idx % 4 == 0 + def compute_loss(): x = batch[0] x = F.dropout(x, 0.1) @@ -982,32 +984,33 @@ def compute_loss(): loss_zeros = self.loss_zeros(None, predictions) return loss_ones, loss_zeros - def make_manual_backward(loss, opt, retain_graph=False): + def make_manual_backward(loss, opt, retain_graph=False, make_optimizer_step=True): self.manual_backward(loss, opt, retain_graph=retain_graph) - grad_clone = self.layer.weight.grad.clone() - assert self.manual_sync_grad() - self.layer.weight.grad /= world_size - assert torch.equal(self.layer.weight.grad, grad_clone) + if make_optimizer_step: + grad_clone = self.layer.weight.grad.clone() + assert self.manual_sync_grad() + self.layer.weight.grad /= world_size + assert torch.equal(self.layer.weight.grad, grad_clone) def gen_closure(): loss_ones_gen, loss_zeros = compute_loss() - make_manual_backward(loss_ones_gen, opt_gen, retain_graph=True) - make_manual_backward(loss_ones_gen, opt_gen) + make_manual_backward(loss_ones_gen, opt_gen, retain_graph=True, make_optimizer_step=make_gen_optimizer_step) + make_manual_backward(loss_ones_gen, opt_gen, make_optimizer_step=make_gen_optimizer_step) def dis_closure(): loss_ones_gen, loss_zeros = compute_loss() - make_manual_backward(loss_ones_gen, opt_dis, retain_graph=True) - make_manual_backward(loss_ones_gen, opt_dis) + make_manual_backward(loss_ones_gen, opt_dis, retain_graph=True, 
make_optimizer_step=make_dis_optimizer_step) + make_manual_backward(loss_ones_gen, opt_dis, make_optimizer_step=make_dis_optimizer_step) # this will accumulate gradients for 2 batches and then call opt_gen.step() - opt_gen.step(closure=gen_closure, make_optimizer_step=batch_idx % 2 == 0, optim='sgd') + opt_gen.step(closure=gen_closure, make_optimizer_step=make_gen_optimizer_step) # update discriminator every 4 baches # therefore, no gradient accumulation for discriminator - if batch_idx % 4 == 0 : + if make_dis_optimizer_step: # Note: Set make_optimizer_step to True or it will use by default # Trainer(accumulate_grad_batches=x) - opt_dis.step(closure=dis_closure, make_optimizer_step=True, optim='adam') + opt_dis.step(closure=dis_closure, make_optimizer_step=True) def training_epoch_end(self, outputs) -> None: # outputs should be an array with an entry per optimizer @@ -1021,6 +1024,7 @@ def configure_optimizers(self): seed_everything(42) model = TestModel() + model_copy = deepcopy(model) model.val_dataloader = None model.training_epoch_end = None @@ -1037,8 +1041,10 @@ def configure_optimizers(self): ) trainer.fit(model) - expected_calls = [call(closure=ANY, optim='sgd')] * 4 - mock_sgd_step.assert_has_calls(expected_calls) - expected_calls = [call(closure=ANY, optim='adam')] * 2 - mock_adam_step.assert_has_calls(expected_calls) + for param, param_copy in zip(model.parameters(), model_copy.parameters()): + assert not torch.equal(param, param_copy) + + opt_a, opt_b = model.optimizers() + assert opt_a._total_optimizer_step_calls == 4 + assert opt_b._total_optimizer_step_calls == 2 From 02d31699af698e5102cc678c4e11828ce59eafbd Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Feb 2021 15:19:34 +0000 Subject: [PATCH 2/7] resolve flake8 --- pytorch_lightning/accelerators/__init__.py | 8 ++++---- pytorch_lightning/core/optimizer.py | 5 +---- pytorch_lightning/overrides/base.py | 7 ++++--- pytorch_lightning/overrides/fairscale.py | 1 - pytorch_lightning/trainer/properties.py | 3 +-- pytorch_lightning/trainer/trainer.py | 1 - pytorch_lightning/trainer/training_loop.py | 6 ++++-- pytorch_lightning/utilities/imports.py | 1 - tests/accelerators/legacy/test_accelerator_connector.py | 1 - tests/core/test_datamodules.py | 2 +- tests/models/test_amp.py | 5 +++-- tests/models/test_hooks.py | 2 +- tests/models/test_tpu.py | 2 +- tests/trainer/optimization/test_manual_optimization.py | 8 +++++--- 14 files changed, 25 insertions(+), 27 deletions(-) diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index 2ec118303d153..66faa8154b467 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.accelerators.cpu import CPUAccelerator -from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.tpu import TPUAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator # noqa F401 +from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa F401 +from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa F401 +from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa F401 diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index 201659e08d3ae..ce9b0960b7055 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -17,12 +17,9 @@ from torch.optim.optimizer 
import Optimizer -from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType +from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TPU_AVAILABLE: - import torch_xla.core.xla_model as xm - def is_lightning_optimizer(optimizer): return isinstance(optimizer, LightningOptimizer) diff --git a/pytorch_lightning/overrides/base.py b/pytorch_lightning/overrides/base.py index 65a21c91e6052..d7376e9bcdad9 100644 --- a/pytorch_lightning/overrides/base.py +++ b/pytorch_lightning/overrides/base.py @@ -46,9 +46,10 @@ def forward(self, *inputs, **kwargs): if running_stage == RunningStage.TRAINING: output = self.module.training_step(*inputs, **kwargs) - # In manual_optimization, we need to prevent DDP reducer as - # it is done manually in ``LightningModule.manual_backward``. - # `require_backward_grad_sync` will be reset + + # In manual_optimization, we need to prevent DDP reducer as + # it is done manually in ``LightningModule.manual_backward`` + # `require_backward_grad_sync` will be reset # ddp_plugin ``post_training_step`` hook if not self.module.automatic_optimization: self.module.trainer.model.require_backward_grad_sync = False diff --git a/pytorch_lightning/overrides/fairscale.py b/pytorch_lightning/overrides/fairscale.py index 2404beb8832f9..f7c3b8d5fd575 100644 --- a/pytorch_lightning/overrides/fairscale.py +++ b/pytorch_lightning/overrides/fairscale.py @@ -13,7 +13,6 @@ # limitations under the License. from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module -from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE LightningShardedDataParallel = None diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index f625c4f994286..e59864df19029 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -17,10 +17,9 @@ from argparse import ArgumentParser, Namespace from typing import Any, cast, List, Optional, Type, TypeVar, Union -from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.accelerators.legacy.accelerator import Accelerator -from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.trainer.states import TrainerState diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 35ae1af66e16c..8e833c33cbbcf 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Trainer to automate the training.""" -import os import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 51472a70588f1..eb5a06e880401 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -18,7 +18,6 @@ import numpy as np import torch -from pytorch_lightning import LightningModule from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.optimizer import LightningOptimizer @@ -711,7 +710,10 @@ def block_ddp_sync_behaviour(self, should_block_sync: bool = False): context manager with sync behaviour off """ - if isinstance(self.trainer.training_type_plugin, ParallelPlugin) and (self.automatic_optimization or should_block_sync): + if ( + isinstance(self.trainer.training_type_plugin, ParallelPlugin) + and (self.automatic_optimization or should_block_sync) + ): with self.trainer.training_type_plugin.block_backward_sync(): yield None else: diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 33c44eee67719..7dfa3d9b1c71d 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -56,4 +56,3 @@ def _module_available(module_path: str) -> bool: _BOLTS_AVAILABLE = _module_available('pl_bolts') _PYTORCH_PRUNE_AVAILABLE = _module_available('torch.nn.utils.prune') _PYTORCH_GREATER_EQUAL_THAN_1_7_0 = LooseVersion(torch.__version__) >= LooseVersion("1.7.0") - diff --git a/tests/accelerators/legacy/test_accelerator_connector.py b/tests/accelerators/legacy/test_accelerator_connector.py index 20a4ef6424cc6..625b231b84179 100644 --- a/tests/accelerators/legacy/test_accelerator_connector.py +++ b/tests/accelerators/legacy/test_accelerator_connector.py @@ -25,7 +25,6 @@ from pytorch_lightning.callbacks import Callback from pytorch_lightning.plugins import DDP2Plugin, DDPPlugin, DDPSpawnPlugin, PrecisionPlugin, SingleDevicePlugin from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment -from pytorch_lightning.utilities import DistributedType from tests.base.boring_model import BoringModel diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 4ecdf768070a4..802c897b4ee2f 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -15,7 +15,7 @@ from argparse import ArgumentParser from typing import Any, Dict from unittest import mock -from unittest.mock import MagicMock, PropertyMock +from unittest.mock import PropertyMock import pytest import torch diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 0b038d47e6032..96f357467cb06 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -139,7 +139,7 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): callbacks=[checkpoint], logger=logger, ) - result = trainer.fit(model) + _ = trainer.fit(model) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, 'amp + ddp model failed to complete' @@ -149,7 +149,8 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc') == 'abc' assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24]') 
== 'abc23' - assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' + generated = trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') + assert generated == 'abc23' def test_cpu_model_with_amp(tmpdir): diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 5decd9993cb73..ab3f3f7ee88e7 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -14,7 +14,7 @@ import inspect import os from unittest import mock -from unittest.mock import MagicMock, PropertyMock +from unittest.mock import PropertyMock import pytest import torch diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 4bad926375a0a..73da0622cbeba 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -19,7 +19,7 @@ from torch.utils.data import DataLoader import tests.base.develop_pipelines as tpipes -from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning import Trainer from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index bf7f660f030b5..d7a71ce9e7d87 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -13,9 +13,9 @@ # limitations under the License. import collections import os +from copy import deepcopy from unittest import mock from unittest.mock import ANY, call, patch -from copy import deepcopy import pytest import torch @@ -994,12 +994,14 @@ def make_manual_backward(loss, opt, retain_graph=False, make_optimizer_step=True def gen_closure(): loss_ones_gen, loss_zeros = compute_loss() - make_manual_backward(loss_ones_gen, opt_gen, retain_graph=True, make_optimizer_step=make_gen_optimizer_step) + make_manual_backward( + loss_ones_gen, opt_gen, retain_graph=True, make_optimizer_step=make_gen_optimizer_step) make_manual_backward(loss_ones_gen, opt_gen, make_optimizer_step=make_gen_optimizer_step) def dis_closure(): loss_ones_gen, loss_zeros = compute_loss() - make_manual_backward(loss_ones_gen, opt_dis, retain_graph=True, make_optimizer_step=make_dis_optimizer_step) + make_manual_backward( + loss_ones_gen, opt_dis, retain_graph=True, make_optimizer_step=make_dis_optimizer_step) make_manual_backward(loss_ones_gen, opt_dis, make_optimizer_step=make_dis_optimizer_step) # this will accumulate gradients for 2 batches and then call opt_gen.step() From a2d6632c44b91bdc3ffe2616eba59a4fa9ee02a4 Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Feb 2021 15:24:31 +0000 Subject: [PATCH 3/7] update --- tests/trainer/optimization/test_manual_optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index d7a71ce9e7d87..e47cad79001fe 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -1045,7 +1045,7 @@ def configure_optimizers(self): trainer.fit(model) for param, param_copy in zip(model.parameters(), model_copy.parameters()): - assert not torch.equal(param, param_copy) + assert not torch.equal(param.cpu().data, param_copy.data) opt_a, opt_b = model.optimizers() assert opt_a._total_optimizer_step_calls == 4 From 
cbb5ccf2addfd0f8c0558f9041526abc0c360cfd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 5 Feb 2021 17:39:32 +0000 Subject: [PATCH 4/7] resolve for ddp_spawn --- benchmarks/test_sharded_parity.py | 2 +- .../plugins/training_type/ddp.py | 9 +- .../plugins/training_type/ddp_spawn.py | 23 +++ .../optimization/test_manual_optimization.py | 173 ++++++++++-------- 4 files changed, 127 insertions(+), 80 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 7568a82b3058e..c021e3b89da54 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -15,7 +15,7 @@ import os import platform import time -from typing import Type +from typing import Type, Union import pytest import torch diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 45768894cf46a..29b35ef1ec0b2 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -180,15 +180,18 @@ def set_world_ranks(self): self.global_rank = self.node_rank * self.num_processes + self.local_rank self.world_size = self.num_nodes * self.num_processes - def configure_ddp(self): - + def pre_configure_ddp(self): # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()``` breaking manual_optimization if _PYTORCH_GREATER_EQUAL_THAN_1_7_0 and not self.lightning_module.automatic_optimization: rank_zero_warn( "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " "to properly work with DDP." ) - self._ddp_kwargs["find_unused_parameters"] = True + self._ddp_kwargs["find_unused_parameters"] = True + + def configure_ddp(self): + + self.pre_configure_ddp() self._model = DistributedDataParallel( LightningDistributedModule(self.model), diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 1115e6ea285fc..34f64eee5cc36 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -13,12 +13,14 @@ # limitations under the License. import os import re +from pytorch_lightning.overrides.distributed import prepare_for_backward from typing import Any, Dict, Optional, Union import torch import torch.distributed as torch_distrib import torch.multiprocessing as mp from torch.nn.parallel.distributed import DistributedDataParallel +from torch.optim import Optimizer from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed @@ -27,6 +29,7 @@ from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load +from pytorch_lightning.utilities import _PYTORCH_GREATER_EQUAL_THAN_1_7_0 from pytorch_lightning.utilities.distributed import ( find_free_network_port, rank_zero_only, @@ -159,7 +162,18 @@ def post_training(self): # recover the weights of the processes trained in the children self.__recover_child_process_weights(best_path, last_path) + def pre_configure_ddp(self): + # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()``` breaking manual_optimization + if _PYTORCH_GREATER_EQUAL_THAN_1_7_0 and not self.lightning_module.automatic_optimization: + rank_zero_warn( + "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " + "to properly work with DDP." 
+ ) + self._ddp_kwargs["find_unused_parameters"] = True + def configure_ddp(self): + + self.pre_configure_ddp() self._model = DistributedDataParallel( LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), @@ -225,6 +239,11 @@ def model_to_device(self): torch.cuda.set_device(self.root_device) self.model.to(self.root_device) + def pre_backward(self, closure_loss: torch.Tensor, optimizer: Optimizer, opt_idx: int): + """Run before precision plugin executes backward""" + if not self.lightning_module.automatic_optimization and self.model.require_backward_grad_sync: + prepare_for_backward(self.model, closure_loss) + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) @@ -241,3 +260,7 @@ def test_step(self, *args, **kwargs): def predict(self, *args, **kwargs): return self.model(*args, **kwargs) + + def post_training_step(self): + if not self.lightning_module.automatic_optimization: + self.model.require_backward_grad_sync = True diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index bf7f660f030b5..f9c85df843145 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -21,6 +21,7 @@ import torch import torch.distributed as torch_distrib import torch.nn.functional as F +from pytorch_lightning.callbacks import Callback from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.utilities import _APEX_AVAILABLE @@ -935,95 +936,99 @@ def configure_optimizers(self): mock_adam_step.assert_has_calls(expected_calls) -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', - reason="test should be run outside of pytest") -def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir): - """ - Tests that `step` works with optimizer_closure and different accumulated_gradient frequency - """ +class TestManualOptimizationDDPCallack(Callback): - class TestModel(BoringModel): - def __init__(self): - super().__init__() - self.automatic_optimization = False + def on_train_end(self, trainer, pl_module): - def loss_ones(self, batch, prediction): - # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls - return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) + opt_a, opt_b = pl_module.optimizers() + assert opt_a._total_optimizer_step_calls == 4 + assert opt_b._total_optimizer_step_calls == 2 - def loss_zeros(self, batch, prediction): - # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls - return torch.nn.functional.mse_loss(prediction, torch.zeros_like(prediction)) - def manual_sync_grad(self) -> bool: - torch_distrib.all_reduce(self.layer.weight.grad.data, async_op=False) - return True - def training_step(self, batch, batch_idx, optimizer_idx): - # emulate gans training - opt_gen, opt_dis = self.optimizers() - - # Note: Be careful, don't log on the same key in self.log in both closure - # as they will be aggregated together on epoch_end +class TesManualOptimizationDDPModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False - world_size = 
torch_distrib.get_world_size(torch_distrib.group.WORLD) - assert world_size == 2 + def loss_ones(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - make_gen_optimizer_step = batch_idx % 2 == 1 - make_dis_optimizer_step = batch_idx % 4 == 0 + def loss_zeros(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.zeros_like(prediction)) - def compute_loss(): - x = batch[0] - x = F.dropout(x, 0.1) - predictions = self(x) - predictions = F.dropout(predictions, 0.1) - loss_ones = self.loss_ones(None, predictions) - loss_zeros = self.loss_zeros(None, predictions) - return loss_ones, loss_zeros - - def make_manual_backward(loss, opt, retain_graph=False, make_optimizer_step=True): - self.manual_backward(loss, opt, retain_graph=retain_graph) - if make_optimizer_step: - grad_clone = self.layer.weight.grad.clone() - assert self.manual_sync_grad() - self.layer.weight.grad /= world_size - assert torch.equal(self.layer.weight.grad, grad_clone) + def manual_sync_grad(self) -> bool: + torch_distrib.all_reduce(self.layer.weight.grad.data, async_op=False) + return True - def gen_closure(): - loss_ones_gen, loss_zeros = compute_loss() - make_manual_backward(loss_ones_gen, opt_gen, retain_graph=True, make_optimizer_step=make_gen_optimizer_step) - make_manual_backward(loss_ones_gen, opt_gen, make_optimizer_step=make_gen_optimizer_step) + def training_step(self, batch, batch_idx, optimizer_idx): - def dis_closure(): - loss_ones_gen, loss_zeros = compute_loss() - make_manual_backward(loss_ones_gen, opt_dis, retain_graph=True, make_optimizer_step=make_dis_optimizer_step) - make_manual_backward(loss_ones_gen, opt_dis, make_optimizer_step=make_dis_optimizer_step) + # emulate gans training + opt_gen, opt_dis = self.optimizers() - # this will accumulate gradients for 2 batches and then call opt_gen.step() - opt_gen.step(closure=gen_closure, make_optimizer_step=make_gen_optimizer_step) + # Note: Be careful, don't log on the same key in self.log in both closure + # as they will be aggregated together on epoch_end - # update discriminator every 4 baches - # therefore, no gradient accumulation for discriminator - if make_dis_optimizer_step: - # Note: Set make_optimizer_step to True or it will use by default - # Trainer(accumulate_grad_batches=x) - opt_dis.step(closure=dis_closure, make_optimizer_step=True) + world_size = torch_distrib.get_world_size(torch_distrib.group.WORLD) + assert world_size == 2 - def training_epoch_end(self, outputs) -> None: - # outputs should be an array with an entry per optimizer - assert len(outputs) == 2 + make_gen_optimizer_step = batch_idx % 2 == 1 + make_dis_optimizer_step = batch_idx % 4 == 0 - def configure_optimizers(self): - optimizer_gen = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001) - return [optimizer_gen, optimizer_dis] + def compute_loss(): + x = batch[0] + x = F.dropout(x, 0.1) + predictions = self(x) + predictions = F.dropout(predictions, 0.1) + loss_ones = self.loss_ones(None, predictions) + loss_zeros = self.loss_zeros(None, predictions) + return loss_ones, loss_zeros + + def make_manual_backward(loss, opt, retain_graph=False, make_optimizer_step=True): + self.manual_backward(loss, opt, retain_graph=retain_graph) + if 
make_optimizer_step: + grad_clone = self.layer.weight.grad.clone() + assert self.manual_sync_grad() + self.layer.weight.grad /= world_size + assert torch.equal(self.layer.weight.grad, grad_clone) + + def gen_closure(): + loss_ones_gen, loss_zeros = compute_loss() + make_manual_backward(loss_ones_gen, opt_gen, retain_graph=True, make_optimizer_step=make_gen_optimizer_step) + make_manual_backward(loss_ones_gen, opt_gen, make_optimizer_step=make_gen_optimizer_step) + + def dis_closure(): + loss_ones_gen, loss_zeros = compute_loss() + make_manual_backward(loss_ones_gen, opt_dis, retain_graph=True, make_optimizer_step=make_dis_optimizer_step) + make_manual_backward(loss_ones_gen, opt_dis, make_optimizer_step=make_dis_optimizer_step) + + # this will accumulate gradients for 2 batches and then call opt_gen.step() + opt_gen.step(closure=gen_closure, make_optimizer_step=make_gen_optimizer_step) + + # update discriminator every 4 baches + # therefore, no gradient accumulation for discriminator + if make_dis_optimizer_step: + # Note: Set make_optimizer_step to True or it will use by default + # Trainer(accumulate_grad_batches=x) + opt_dis.step(closure=dis_closure, make_optimizer_step=True) + + def training_epoch_end(self, outputs) -> None: + # outputs should be an array with an entry per optimizer + assert len(outputs) == 2 + + def configure_optimizers(self): + optimizer_gen = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001) + return [optimizer_gen, optimizer_dis] + +def train_manual_optimization(tmpdir, accelerator): seed_everything(42) - model = TestModel() + model = TesManualOptimizationDDPModel() model_copy = deepcopy(model) model.val_dataloader = None model.training_epoch_end = None @@ -1037,14 +1042,30 @@ def configure_optimizers(self): log_every_n_steps=1, accumulate_grad_batches=2, gpus=2, - accelerator="ddp", + accelerator=accelerator, + callbacks=[TestManualOptimizationDDPCallack()] ) trainer.fit(model) for param, param_copy in zip(model.parameters(), model_copy.parameters()): - assert not torch.equal(param, param_copy) + assert not torch.equal(param.cpu().data, param_copy.data) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', + reason="test should be run outside of pytest") +def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir): + """ + Tests that `step` works with optimizer_closure and different accumulated_gradient frequency + """ + + train_manual_optimization(tmpdir, "ddp") + + +def test_step_with_optimizer_closure_with_different_frequencies_ddp_spawn(tmpdir): + """ + Tests that `step` works with optimizer_closure and different accumulated_gradient frequency + """ - opt_a, opt_b = model.optimizers() - assert opt_a._total_optimizer_step_calls == 4 - assert opt_b._total_optimizer_step_calls == 2 + train_manual_optimization(tmpdir, "ddp_spawn") \ No newline at end of file From 0a585d278d00bf9220118ca2544b25383a544799 Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Feb 2021 18:40:04 +0000 Subject: [PATCH 5/7] resolve flake8 --- tests/trainer/optimization/test_manual_optimization.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index f9c85df843145..1d1bf8290fe16 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ 
b/tests/trainer/optimization/test_manual_optimization.py @@ -13,17 +13,17 @@ # limitations under the License. import collections import os +from copy import deepcopy from unittest import mock from unittest.mock import ANY, call, patch -from copy import deepcopy import pytest import torch import torch.distributed as torch_distrib import torch.nn.functional as F -from pytorch_lightning.callbacks import Callback from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import Callback from pytorch_lightning.utilities import _APEX_AVAILABLE from tests.base.boring_model import BoringModel @@ -945,8 +945,6 @@ def on_train_end(self, trainer, pl_module): assert opt_b._total_optimizer_step_calls == 2 - - class TesManualOptimizationDDPModel(BoringModel): def __init__(self): super().__init__() @@ -1024,6 +1022,7 @@ def configure_optimizers(self): optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001) return [optimizer_gen, optimizer_dis] + def train_manual_optimization(tmpdir, accelerator): seed_everything(42) @@ -1063,9 +1062,10 @@ def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir): train_manual_optimization(tmpdir, "ddp") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_step_with_optimizer_closure_with_different_frequencies_ddp_spawn(tmpdir): """ Tests that `step` works with optimizer_closure and different accumulated_gradient frequency """ - train_manual_optimization(tmpdir, "ddp_spawn") \ No newline at end of file + train_manual_optimization(tmpdir, "ddp_spawn") From 2de112804b0905fe645cd7f05577eb255ed9f28c Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Feb 2021 11:49:02 +0000 Subject: [PATCH 6/7] resolve flake8 --- pytorch_lightning/accelerators/__init__.py | 8 ++++---- pytorch_lightning/core/optimizer.py | 5 +---- pytorch_lightning/overrides/base.py | 7 ++++--- pytorch_lightning/overrides/fairscale.py | 1 - pytorch_lightning/trainer/properties.py | 3 +-- pytorch_lightning/trainer/trainer.py | 1 - pytorch_lightning/trainer/training_loop.py | 6 ++++-- pytorch_lightning/utilities/imports.py | 1 - tests/accelerators/legacy/test_accelerator_connector.py | 1 - tests/core/test_datamodules.py | 2 +- tests/models/test_amp.py | 5 +++-- tests/models/test_hooks.py | 2 +- tests/models/test_tpu.py | 2 +- tests/trainer/optimization/test_manual_optimization.py | 6 +++--- 14 files changed, 23 insertions(+), 27 deletions(-) diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index 2ec118303d153..66faa8154b467 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.accelerators.cpu import CPUAccelerator -from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.tpu import TPUAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator # noqa F401 +from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa F401 +from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa F401 +from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa F401 diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index 201659e08d3ae..ce9b0960b7055 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -17,12 +17,9 @@ from torch.optim.optimizer 
import Optimizer -from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType +from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TPU_AVAILABLE: - import torch_xla.core.xla_model as xm - def is_lightning_optimizer(optimizer): return isinstance(optimizer, LightningOptimizer) diff --git a/pytorch_lightning/overrides/base.py b/pytorch_lightning/overrides/base.py index 65a21c91e6052..d7376e9bcdad9 100644 --- a/pytorch_lightning/overrides/base.py +++ b/pytorch_lightning/overrides/base.py @@ -46,9 +46,10 @@ def forward(self, *inputs, **kwargs): if running_stage == RunningStage.TRAINING: output = self.module.training_step(*inputs, **kwargs) - # In manual_optimization, we need to prevent DDP reducer as - # it is done manually in ``LightningModule.manual_backward``. - # `require_backward_grad_sync` will be reset + + # In manual_optimization, we need to prevent DDP reducer as + # it is done manually in ``LightningModule.manual_backward`` + # `require_backward_grad_sync` will be reset # ddp_plugin ``post_training_step`` hook if not self.module.automatic_optimization: self.module.trainer.model.require_backward_grad_sync = False diff --git a/pytorch_lightning/overrides/fairscale.py b/pytorch_lightning/overrides/fairscale.py index 2404beb8832f9..f7c3b8d5fd575 100644 --- a/pytorch_lightning/overrides/fairscale.py +++ b/pytorch_lightning/overrides/fairscale.py @@ -13,7 +13,6 @@ # limitations under the License. from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module -from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE LightningShardedDataParallel = None diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index f625c4f994286..e59864df19029 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -17,10 +17,9 @@ from argparse import ArgumentParser, Namespace from typing import Any, cast, List, Optional, Type, TypeVar, Union -from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.accelerators.legacy.accelerator import Accelerator -from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.trainer.states import TrainerState diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 35ae1af66e16c..8e833c33cbbcf 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Trainer to automate the training.""" -import os import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 51472a70588f1..eb5a06e880401 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -18,7 +18,6 @@ import numpy as np import torch -from pytorch_lightning import LightningModule from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.optimizer import LightningOptimizer @@ -711,7 +710,10 @@ def block_ddp_sync_behaviour(self, should_block_sync: bool = False): context manager with sync behaviour off """ - if isinstance(self.trainer.training_type_plugin, ParallelPlugin) and (self.automatic_optimization or should_block_sync): + if ( + isinstance(self.trainer.training_type_plugin, ParallelPlugin) + and (self.automatic_optimization or should_block_sync) + ): with self.trainer.training_type_plugin.block_backward_sync(): yield None else: diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 33c44eee67719..7dfa3d9b1c71d 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -56,4 +56,3 @@ def _module_available(module_path: str) -> bool: _BOLTS_AVAILABLE = _module_available('pl_bolts') _PYTORCH_PRUNE_AVAILABLE = _module_available('torch.nn.utils.prune') _PYTORCH_GREATER_EQUAL_THAN_1_7_0 = LooseVersion(torch.__version__) >= LooseVersion("1.7.0") - diff --git a/tests/accelerators/legacy/test_accelerator_connector.py b/tests/accelerators/legacy/test_accelerator_connector.py index 20a4ef6424cc6..625b231b84179 100644 --- a/tests/accelerators/legacy/test_accelerator_connector.py +++ b/tests/accelerators/legacy/test_accelerator_connector.py @@ -25,7 +25,6 @@ from pytorch_lightning.callbacks import Callback from pytorch_lightning.plugins import DDP2Plugin, DDPPlugin, DDPSpawnPlugin, PrecisionPlugin, SingleDevicePlugin from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment -from pytorch_lightning.utilities import DistributedType from tests.base.boring_model import BoringModel diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 4ecdf768070a4..802c897b4ee2f 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -15,7 +15,7 @@ from argparse import ArgumentParser from typing import Any, Dict from unittest import mock -from unittest.mock import MagicMock, PropertyMock +from unittest.mock import PropertyMock import pytest import torch diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 0b038d47e6032..96f357467cb06 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -139,7 +139,7 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): callbacks=[checkpoint], logger=logger, ) - result = trainer.fit(model) + _ = trainer.fit(model) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, 'amp + ddp model failed to complete' @@ -149,7 +149,8 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc') == 'abc' assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24]') 
== 'abc23' - assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' + generated = trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') + assert generated == 'abc23' def test_cpu_model_with_amp(tmpdir): diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 5decd9993cb73..ab3f3f7ee88e7 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -14,7 +14,7 @@ import inspect import os from unittest import mock -from unittest.mock import MagicMock, PropertyMock +from unittest.mock import PropertyMock import pytest import torch diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 4bad926375a0a..73da0622cbeba 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -19,7 +19,7 @@ from torch.utils.data import DataLoader import tests.base.develop_pipelines as tpipes -from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning import Trainer from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index f9c85df843145..fce6caff3669c 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -13,17 +13,17 @@ # limitations under the License. import collections import os +from copy import deepcopy from unittest import mock from unittest.mock import ANY, call, patch -from copy import deepcopy import pytest import torch import torch.distributed as torch_distrib import torch.nn.functional as F -from pytorch_lightning.callbacks import Callback from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import Callback from pytorch_lightning.utilities import _APEX_AVAILABLE from tests.base.boring_model import BoringModel @@ -1068,4 +1068,4 @@ def test_step_with_optimizer_closure_with_different_frequencies_ddp_spawn(tmpdir Tests that `step` works with optimizer_closure and different accumulated_gradient frequency """ - train_manual_optimization(tmpdir, "ddp_spawn") \ No newline at end of file + train_manual_optimization(tmpdir, "ddp_spawn") From a3df785718789eeb813b27313f22f6f71b492449 Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Feb 2021 18:40:04 +0000 Subject: [PATCH 7/7] resolve flake8 --- tests/trainer/optimization/test_manual_optimization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index fce6caff3669c..1d1bf8290fe16 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -945,8 +945,6 @@ def on_train_end(self, trainer, pl_module): assert opt_b._total_optimizer_step_calls == 2 - - class TesManualOptimizationDDPModel(BoringModel): def __init__(self): super().__init__() @@ -1024,6 +1022,7 @@ def configure_optimizers(self): optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001) return [optimizer_gen, optimizer_dis] + def train_manual_optimization(tmpdir, accelerator): seed_everything(42) @@ -1063,6 +1062,7 @@ def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir): train_manual_optimization(tmpdir, "ddp") 
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_step_with_optimizer_closure_with_different_frequencies_ddp_spawn(tmpdir): """ Tests that `step` works with optimizer_closure and different accumulated_gradient frequency
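
The step counts asserted in TestManualOptimizationDDPCallack.on_train_end (4 for the generator optimizer, 2 for the discriminator optimizer) follow from the two step conditions in training_step, assuming the run covers eight batches (batch_idx 0-7) — the batch budget those totals imply; the exact Trainer limits fall outside the hunks shown above. A standalone check of that arithmetic, independent of the patch itself:

# Hypothetical batch budget inferred from the asserted totals, not quoted from the patch.
batch_indices = range(8)

gen_steps = sum(1 for i in batch_indices if i % 2 == 1)  # make_gen_optimizer_step: batches 1, 3, 5, 7
dis_steps = sum(1 for i in batch_indices if i % 4 == 0)  # make_dis_optimizer_step: batches 0, 4

assert gen_steps == 4  # matches opt_a._total_optimizer_step_calls == 4
assert dis_steps == 2  # matches opt_b._total_optimizer_step_calls == 2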
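
The block_backward_sync change in parallel.py (patch 1) is easy to read past: the old body yielded self.model.no_sync() without entering it, so the no_sync context manager never ran and gradient synchronisation was not actually skipped while accumulating; the new body enters it with a with statement, and block_ddp_sync_behaviour in training_loop.py was updated the same way. A minimal, self-contained sketch of that difference, using a stub context manager as a stand-in for DistributedDataParallel.no_sync (the function names below are illustrative, not from the patch):

from contextlib import contextmanager

@contextmanager
def no_sync():  # stand-in for DistributedDataParallel.no_sync
    print("grad sync blocked")
    yield
    print("grad sync restored")

@contextmanager
def old_block_backward_sync():
    yield no_sync()  # hands back an un-entered manager: its body never runs

@contextmanager
def new_block_backward_sync():
    with no_sync():  # actually enters it, as the patched plugin does
        yield

with old_block_backward_sync():
    pass  # prints nothing -> gradients would still be all-reduced
with new_block_backward_sync():
    pass  # prints "grad sync blocked" then "grad sync restored"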