From fddeee3a0facb50639f94fbcaa90ca221381e9c1 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:02 +0100 Subject: [PATCH 001/157] move to old package --- .../accelerators/{ => old}/__init__.py | 0 .../accelerators/old/accelerator.py | 259 ++++++++++++++++++ .../{ => old}/accelerator_connector.py | 0 .../accelerators/{ => old}/cpu_accelerator.py | 0 .../{ => old}/ddp2_accelerator.py | 0 .../accelerators/{ => old}/ddp_accelerator.py | 0 .../{ => old}/ddp_cpu_hpc_accelerator.py | 0 .../{ => old}/ddp_cpu_spawn_accelerator.py | 0 .../{ => old}/ddp_hpc_accelerator.py | 0 .../{ => old}/ddp_spawn_accelerator.py | 0 .../accelerators/{ => old}/dp_accelerator.py | 0 .../accelerators/{ => old}/gpu_accelerator.py | 0 .../{ => old}/horovod_accelerator.py | 0 .../accelerators/{ => old}/tpu_accelerator.py | 0 14 files changed, 259 insertions(+) rename pytorch_lightning/accelerators/{ => old}/__init__.py (100%) create mode 100644 pytorch_lightning/accelerators/old/accelerator.py rename pytorch_lightning/accelerators/{ => old}/accelerator_connector.py (100%) rename pytorch_lightning/accelerators/{ => old}/cpu_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp2_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_cpu_hpc_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_cpu_spawn_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_hpc_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/ddp_spawn_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/dp_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/gpu_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/horovod_accelerator.py (100%) rename pytorch_lightning/accelerators/{ => old}/tpu_accelerator.py (100%) diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/old/__init__.py similarity index 100% rename from pytorch_lightning/accelerators/__init__.py rename to pytorch_lightning/accelerators/old/__init__.py diff --git a/pytorch_lightning/accelerators/old/accelerator.py b/pytorch_lightning/accelerators/old/accelerator.py new file mode 100644 index 0000000000000..b16e0125054bb --- /dev/null +++ b/pytorch_lightning/accelerators/old/accelerator.py @@ -0,0 +1,259 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
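The renames listed above move the existing accelerator modules under pytorch_lightning/accelerators/old/ unchanged (100% similarity), while old/accelerator.py is added as a copy of the current base class. A minimal sketch of how downstream imports shift after this commit; CPUAccelerator is used purely as an illustrative class name from the moved cpu_accelerator module:

    # before patch 001
    from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator

    # after patch 001 the same class lives in the "old" subpackage
    from pytorch_lightning.accelerators.old.cpu_accelerator import CPUAccelerator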
+import os +import math +from enum import Enum +from pytorch_lightning.core.lightning import LightningModule +from typing import Any, Optional, Union + +import torch + +from pytorch_lightning.utilities import AMPType, rank_zero_warn +from pytorch_lightning.utilities.apply_func import move_data_to_device +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.parsing import AttributeDict +import torch.distributed as torch_distrib +from pytorch_lightning import _logger as log + +try: + from apex import amp +except ImportError: + amp = None + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + + +EPSILON = 1e-6 +EPSILON_FP16 = 1e-5 + + +class Accelerator(object): + def __init__(self, trainer=None, cluster_environment=None, ddp_plugin=None): + self.trainer = trainer + self.nickname = None + self.cluster_environment = cluster_environment + self.dist = AttributeDict(rank=0, device=None) + self.ddp_plugin = ddp_plugin + + if trainer is not None: + self.train_loop = self.trainer.train + self.validation_loop = self.trainer.run_evaluation + self.test_loop = self.trainer.run_evaluation + + def setup(self, model): + pass + + def teardown(self): + # Ensure if necessary all processes are finished + self.barrier() + + def barrier(self, name: Optional[str] = None): + pass + + def broadcast(self, obj, src=0): + return obj + + def train_or_test(self): + if self.trainer.testing: + results = self.trainer.run_test() + else: + results = self.trainer.train() + return results + + def batch_to_device(self, batch: Any, device: torch.device): + model = self.trainer.get_model() + if model is not None: + return model.transfer_batch_to_device(batch, device) + return move_data_to_device(batch, device) + + def training_step_end(self, output): + return output + + def test_step_end(self, output): + return output + + def validation_step_end(self, output): + return output + + def process_dataloader(self, dataloader): + return dataloader + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + if self.trainer.precision == 16: + closure_loss = self.trainer.precision_connector.backend.backward( + closure_loss, optimizer, opt_idx, *args, **kwargs + ) + else: + # do backward pass + model = self.trainer.get_model() + model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + return closure_loss + + def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): + model_ref = self.trainer.get_model() + is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) + native_amp = self.trainer.amp_backend == AMPType.NATIVE + + # native amp + lbfgs is a no go right now + if native_amp and is_lbfgs: + raise MisconfigurationException( + "native PyTorch amp and lbfgs are not compatible." 
+ " To request, please file a Github issue in PyTorch and tag @mcarilli" + ) + + # model hook + model_ref.optimizer_step( + epoch=self.trainer.current_epoch, + batch_idx=batch_idx, + optimizer=optimizer, + optimizer_idx=opt_idx, + optimizer_closure=lambda_closure, + on_tpu=False, # TPUAccelerator class sets this as True + using_native_amp=native_amp, + using_lbfgs=is_lbfgs, + ) + + # scale when native amp + if native_amp: + self.trainer.scaler.update() + + def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): + model_ref = self.trainer.get_model() + model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) + + def clip_gradients(self, optimizer, clip_val=None): + # TODO: separate TPU case from here + self._clip_gradients(optimizer, clip_val) + + def _clip_gradients(self, optimizer, clip_val=None): + # use the trainer's clip val if none passed + grad_clip_val = self.trainer.gradient_clip_val + if clip_val is not None: + grad_clip_val = clip_val + grad_clip_val = float(grad_clip_val) + + # this code is a modification of torch.nn.utils.clip_grad_norm_ + # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md + if grad_clip_val <= 0: + return + + model = self.trainer.get_model() + if self.trainer.amp_backend == AMPType.APEX: + parameters = amp.master_params(optimizer) + else: + parameters = model.parameters() + + max_norm = grad_clip_val + norm_type = float(2.0) + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + if norm_type == math.inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + else: + device = parameters[0].device + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + + def on_train_epoch_end(self, outputs): + pass + + def on_train_end(self): + pass + + def early_stopping_should_stop(self, pl_module): + return self.trainer.should_stop + + def setup_optimizers(self, model): + if self.trainer.testing is True: + return + + optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) + self.trainer.optimizers = optimizers + self.trainer.lr_schedulers = lr_schedulers + self.trainer.optimizer_frequencies = optimizer_frequencies + + def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.trainer.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def sync_tensor( + self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None + ) -> torch.Tensor: + """ + Function to reduce a tensor from several distributed 
processes to one aggregated tensor. + + Args: + tensor: the tensor to sync and reduce + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + raise NotImplementedError() + + def __getstate__(self): + return { + "trainer": self.trainer, + "nickname": self.nickname, + "cluster_environment": self.cluster_environment, + "dist": self.dist, + "ddp_plugin": self.ddp_plugin, + } + + def __setstate__(self, d): + self.trainer = d["trainer"] + self.nickname = d["nickname"] + self.cluster_environment = d["cluster_environment"] + self.dist = d["dist"] + self.ddp_plugin = d["ddp_plugin"] + + +# TODO: allow user to compare with string even internaly we shall use these Enum to prevent typos... +class BackendType(Enum): + DP = "dp" + DDP = "ddp" + DDP2 = "ddp2" + DDP_SPAWN = "ddp_spawn" + # decuple distrib and device + DDP_CPU = "ddp_cpu" + HOROVOD = "horovod" + # this is rather device + TPU = "tpu" diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/old/accelerator_connector.py similarity index 100% rename from pytorch_lightning/accelerators/accelerator_connector.py rename to pytorch_lightning/accelerators/old/accelerator_connector.py diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/old/cpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/cpu_accelerator.py rename to pytorch_lightning/accelerators/old/cpu_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/old/ddp2_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp2_accelerator.py rename to pytorch_lightning/accelerators/old/ddp2_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/old/ddp_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/old/ddp_hpc_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_hpc_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_hpc_accelerator.py diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/old/ddp_spawn_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/ddp_spawn_accelerator.py rename to pytorch_lightning/accelerators/old/ddp_spawn_accelerator.py diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/old/dp_accelerator.py similarity index 100% rename from 
pytorch_lightning/accelerators/dp_accelerator.py rename to pytorch_lightning/accelerators/old/dp_accelerator.py diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/old/gpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/gpu_accelerator.py rename to pytorch_lightning/accelerators/old/gpu_accelerator.py diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/old/horovod_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/horovod_accelerator.py rename to pytorch_lightning/accelerators/old/horovod_accelerator.py diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/old/tpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/tpu_accelerator.py rename to pytorch_lightning/accelerators/old/tpu_accelerator.py From f9c1e8d557d02ffd5dd1c774e8403d1a743a798c Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:18 +0100 Subject: [PATCH 002/157] add initial draft of new accelerators --- pytorch_lightning/accelerators/accelerator.py | 333 ++++++++---------- 1 file changed, 141 insertions(+), 192 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 1b3ae6f23058a..3d1b5038dcc20 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,79 +1,69 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
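Patch 002 below replaces the trainer-bound Accelerator with a first draft of NewAccelerator: it owns its optimizers, takes an explicit root device, and delegates mixed-precision handling to a precision plugin. A hedged construction sketch based only on the signature introduced in the diff that follows; the LightningModule instance and batch are placeholders, and several hooks in the draft (e.g. setup_optimizers) still read a self.trainer attribute that has to be attached elsewhere:

    import torch
    from pytorch_lightning.accelerators.accelerator import NewAccelerator
    from pytorch_lightning.accelerators.precision import PrecisionPlugin

    accelerator = NewAccelerator(
        model_ref=lightning_module,          # placeholder LightningModule
        root_device=torch.device("cuda", 0),
        precision_plugin=PrecisionPlugin(),  # plain 32-bit plugin
        gradient_clip_val=0.0,
    )
    accelerator.setup(lightning_module)      # sets up optimizers, connects the precision plugin
    batch = accelerator.to_device(batch)     # routes through batch_to_device / transfer_batch_to_device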
-from contextlib import contextmanager -from typing import Any, Optional, Union +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities import AMPType +from typing import Any, Union +import math import torch from torch.optim import Optimizer -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.parsing import AttributeDict +from pytorch_lightning.core import LightningModule +from pytorch_lightning.accelerators.precision import MixedPrecisionPlugin, PrecisionPlugin -if torch.distributed.is_available(): - from torch.distributed import ReduceOp -else: - class ReduceOp: - SUM = None +from pytorch_lightning.utilities.apply_func import move_data_to_device -class Accelerator(object): +class NewAccelerator(object): + root_device: Union[str, torch.device] - def __init__(self, - trainer: Optional = None, - cluster_environment: Optional[ClusterEnvironment] = None, - ddp_plugin: Optional[DDPPlugin] = None): - self.trainer = trainer - self.nickname = None - self.cluster_environment = cluster_environment - self.dist = AttributeDict(rank=0, device=None) - self.ddp_plugin = ddp_plugin + def __init__( + self, + model_ref: LightningModule, + root_device: Union[str, torch.device], + precision_plugin: PrecisionPlugin, + gradient_clip_val, + ): + self.model_ref = model_ref + self.precision_plugin = precision_plugin + self.gradient_clip_val = gradient_clip_val - if trainer is not None: - self.train_loop = self.trainer.train - self.validation_loop = self.trainer.run_evaluation - self.test_loop = self.trainer.run_evaluation + self.optimizers = None + self.lr_schedulers = None + self.optimizer_frequencies = None + self.root_device = root_device def setup(self, model): - pass + self.setup_optimizers(model) + self.connect_precision_plugin() def teardown(self): - # Ensure if necessary all processes are finished - self.barrier() - - def barrier(self, name: Optional[str] = None): pass - def broadcast(self, obj, src=0): - return obj - - def train_or_test(self): - if self.trainer.testing: - results = self.trainer.run_test() - else: - results = self.trainer.train() - return results - def batch_to_device(self, batch: Any, device: torch.device): - model = self.trainer.get_model() + model = self.model_ref if model is not None: return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) + def training_step(self, args): + batch = self.to_device(args[0]) + + args[0] = batch + + return self.model_ref.training_step(*args) + + def validation_step(self, args): + batch = self.to_device(args[0]) + + args[0] = batch + + return self.model_ref.validation_step(*args) + + def test_step(self, args): + batch = self.to_device(args[0]) + + args[0] = batch + return self.model_ref.test_step(*args) + def training_step_end(self, output): return output @@ -87,28 +77,36 @@ def process_dataloader(self, dataloader): return dataloader def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): - automatic_optimization = self.trainer.train_loop.automatic_optimization - - if not automatic_optimization and self.ddp_plugin is not None: - # Manually prepare for reduce as user calling backwards manually - 
self.ddp_plugin.on_before_manual_backward(self.trainer.model, closure_loss) + return self.precision_plugin.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + + def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): + model_ref = self.model_ref + is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) + native_amp = self.trainer.amp_backend == AMPType.NATIVE + + self.precision_plugin.pre_optimizer_step(optimizer) + + # model hook + model_ref.optimizer_step( + epoch=current_epoch, + batch_idx=batch_idx, + optimizer=optimizer, + optimizer_idx=opt_idx, + optimizer_closure=lambda_closure, + on_tpu=False, # TPUAccelerator class sets this as True + using_native_amp=native_amp, + using_lbfgs=is_lbfgs, + ) - if self.trainer.precision == 16: - closure_loss = self.trainer.precision_connector.backend.backward( - closure_loss, optimizer, opt_idx, *args, **kwargs - ) - else: - # do backward pass - model = self.trainer.get_model() - model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + self.precision_plugin.post_optimizer_step() - # once backward has been applied, release graph - closure_loss = closure_loss.detach() - return closure_loss + def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): + model_ref = self.model_ref + model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val=None): # use the trainer's clip val if none passed - grad_clip_val = self.trainer.gradient_clip_val + grad_clip_val = self.gradient_clip_val if clip_val is not None: grad_clip_val = clip_val grad_clip_val = float(grad_clip_val) @@ -117,12 +115,37 @@ def clip_gradients(self, optimizer, clip_val=None): return self._clip_gradients(optimizer, grad_clip_val) - def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): - if self.trainer.amp_backend: - self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer, norm_type) + model = self.model_ref + + # TODO: Change this. 
Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX + if self.trainer.amp_backend == AMPType.APEX: + parameters = self.precision_plugin.master_params(optimizer) else: - model = self.trainer.get_model() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + parameters = model.parameters() + + max_norm = grad_clip_val + norm_type = float(2.0) + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + device = parameters[0].device + + if norm_type == math.inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + else: + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + eps = self.precision_plugin.EPSILON + + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) def on_train_epoch_end(self, outputs): pass @@ -130,126 +153,52 @@ def on_train_epoch_end(self, outputs): def on_train_end(self): pass + # TODO: Check if we can change logic for early stopping to accelerator/trainer completely or have a separate connector (should be self contained) def early_stopping_should_stop(self, pl_module): return self.trainer.should_stop def setup_optimizers(self, model): - if self.trainer.testing: + # TODO: Check if we can change logic for early stopping to trainer completely (should be self contained) + if self.trainer.testing is True: return optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) - self.trainer.optimizers = optimizers - self.trainer.lr_schedulers = lr_schedulers - self.trainer.optimizer_frequencies = optimizer_frequencies - - def init_ddp_connection( - self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True - ) -> None: - self.ddp_plugin.init_ddp_connection( - self.trainer, - self.cluster_environment, - global_rank, - world_size, - is_slurm_managing_tasks, + self.optimizers = optimizers + self.lr_schedulers = lr_schedulers + self.optimizer_frequencies = optimizer_frequencies + + def connect_precision_plugin(self): + model, optimizers, schedulers = self.precision_plugin.connect( + self.model_ref, self.optimizers, self.lr_schedulers ) - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - """ - Function to reduce a tensor from several distributed processes to one aggregated tensor. - - Args: - tensor: the tensor to sync and reduce - group: the process group to gather results from. Defaults to all processes (world) - reduce_op: the reduction operation. Defaults to sum. - Can also be a string of 'avg', 'mean' to calculate the mean during reduction. - - Return: - reduced value - """ - raise NotImplementedError() - - def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): - """ - Function to gather a tensor from several distributed processes - - Args: - tensor: tensor of shape (batch, ...) - group: the process group to gather results from. 
Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op - - Return: - A tensor of shape (world_size, batch, ...) - """ - raise NotImplementedError() - - def optimizer_state(self, optimizer: Optimizer) -> dict: - """ - Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom - plugins. - Return: - Optimizer state dict - """ - if self.ddp_plugin: - return self.ddp_plugin.optimizer_state(optimizer) - return optimizer.state_dict() - - def get_reference_model(self, model) -> LightningModule: - """ - Override to modify returning base :class:`LightningModule` - when accessing variable and functions if the accelerator has wrapped the model. - - Example:: - ref_model = accelerator.get_reference_model(model) - ref_model.training_step(...) - - Args: - model: Accelerator model. - - Returns: Reference :class:`LightningModule`. - - """ - return model - - def __getstate__(self): - return { - 'trainer': self.trainer, - 'nickname': self.nickname, - 'cluster_environment': self.cluster_environment, - 'dist': self.dist, - 'ddp_plugin': self.ddp_plugin - } - - def __setstate__(self, d): - self.trainer = d['trainer'] - self.nickname = d['nickname'] - self.cluster_environment = d['cluster_environment'] - self.dist = d['dist'] - self.ddp_plugin = d['ddp_plugin'] - - def on_save(self, checkpoint): - return checkpoint - - @property - def rpc_enabled(self): - return self.ddp_plugin is not None and isinstance(self.ddp_plugin, RPCPlugin) - - @property - def distributed_sampler_kwargs(self): - raise NotImplementedError - - @property - def require_distributed_sampler(self): - raise NotImplementedError - - @contextmanager - def block_ddp_plugin_sync_behaviour(self): - """ - Blocks ddp sync gradients behaviour on backwards pass. - This is useful for skipping sync when accumulating gradients, reducing communication overhead - Returns: context manager with sync behaviour off - """ - cm = self.ddp_plugin.block_backward_sync(self.trainer.model) if self.ddp_plugin else None - yield cm + self.model_ref = model + self.optimizers = optimizers + self.schedulers = schedulers + + def to_device(self, batch): + return self.batch_to_device(batch, self.root_device) + + +class NewCPUAccelerator(NewAccelerator): + def setup(self, model): + if isinstance(self.precision_plugin, MixedPrecisionPlugin): + MisconfigurationException("amp + cpu is not supported. 
Please use a GPU option") + + if "cpu" not in str(self.root_device): + raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead") + + return super().setup(model) + + +class NewGPUAccelerator(NewAccelerator): + def setup(self, model): + if "cuda" not in str(self.root_device): + raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + torch.cuda.set_device(self.root_device) + self.model_ref.to(self.root_device) + + return super().setup(model) + + +# TODO: Add NewTPUAccelerator From 28ae4037ead0723f006e4cef2d6e30fb45dacf25 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:30 +0100 Subject: [PATCH 003/157] add initial data parallel draft --- .../accelerators/data_parallel.py | 325 ++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 pytorch_lightning/accelerators/data_parallel.py diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py new file mode 100644 index 0000000000000..9a6481c65c5db --- /dev/null +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -0,0 +1,325 @@ +from abc import ABC, abstractmethod + +from torch.nn.parallel.distributed import DistributedDataParallel +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.utilities.seed import seed_everything +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.distributed.dist import LightningDistributed +import torch +import os +from pytorch_lightning.core.step_result import Result +from typing import Any, Dict, List, Optional, Union +from pytorch_lightning.overrides.data_parallel import LightningDataParallel, LightningDistributedDataParallel +from torch.nn.parallel.data_parallel import DataParallel +import sys +from os.path import abspath +from time import sleep +import subprocess +from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only +import numpy as np +import torch.distributed as torch_distrib +from pytorch_lightning import _logger as log + +from pytorch_lightning.utilities.distributed import sync_ddp_if_available + +try: + from hydra.utils import to_absolute_path, get_original_cwd + from hydra.core.hydra_config import HydraConfig +except ImportError: + HYDRA_AVAILABLE = False +else: + HYDRA_AVAILABLE = True + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + + +class ParallelPlugin(ABC): + def __init__(self): + self.model = None + + @abstractmethod + def reduce(self, output): + raise NotImplementedError + + @abstractmethod + @property + def root_device(self): + raise NotImplementedError + + +class DataParallelPlugin(ParallelPlugin): + def __init__(self, parallel_device_ids): + super().__init__() + self.parallel_device_ids = parallel_device_ids + + def setup(self, model): + self.model = LightningDataParallel(model, self.parallel_device_ids) + + def reduce(self, output): + if isinstance(output, Result): + output.dp_reduce() + + elif isinstance(output, torch.Tensor): + output = output.mean() + + return output + + @property + def root_device(self): + return self.parallel_device_ids[0] + + +class DistributedDataParallelPlugin(ParallelPlugin): + def __init__(self, parallel_device_ids, num_nodes, num_processes, **ddp_kwargs): + super().__init__(self) + + self.task_idx = None + self._has_spawned_children = False + self.interactive_ddp_procs = [] + self.dist = LightningDistributed() + 
self.parallel_device_ids = parallel_device_ids + self.num_nodes = num_nodes + self.num_processes = num_processes + self._ddp_kwargs: Dict[str, Any] = ddp_kwargs + + def setup(self, model): + # start the other scripts + if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": + self._call_children_scripts() + + # set the task idx + self.task_idx = int(os.environ["LOCAL_RANK"]) + + def _call_children_scripts(self): + assert self.trainer.global_rank == 0 + self._check_can_spawn_children() + self._has_spawned_children = True + + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) + + # allow the user to pass the node rank + node_rank = "0" + node_rank = os.environ.get("NODE_RANK", node_rank) + node_rank = os.environ.get("GROUP_RANK", node_rank) + os.environ["NODE_RANK"] = node_rank + os.environ["LOCAL_RANK"] = "0" + + # when user is using hydra find the absolute path + path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path + + # pull out the commands used to run the script and resolve the abs file path + command = sys.argv + try: + full_path = path_lib(command[0]) + except Exception as e: + full_path = abspath(command[0]) + + command[0] = full_path + # use the same python interpreter and actually running + command = [sys.executable] + command + + # the visible devices tell us how many GPUs we want to use. + # when the trainer script was called the device has already been scoped by the time + # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone + # but forward the GPUs selected via environment variables + if self.trainer.data_parallel_device_ids is None: + raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") + + os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" + + # TODO: Change t + if self.trainer.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.trainer.logger.version) + + num_gpus = len(self.parallel_device_ids) + os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" + + self.interactive_ddp_procs = [] + for local_rank in range(1, self.num_processes): + env_copy = os.environ.copy() + env_copy["LOCAL_RANK"] = f"{local_rank}" + + # remove env var if global seed not set + if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: + del env_copy["PL_GLOBAL_SEED"] + + # start process + # if hydra is available and initialized, make sure to set the cwd correctly + cwd: Optional[str] = None + if HYDRA_AVAILABLE: + if HydraConfig.initialized(): + cwd = get_original_cwd() + proc = subprocess.Popen(command, env=env_copy, cwd=cwd) + self.interactive_ddp_procs.append(proc) + + # starting all processes at once can cause issues + # with dataloaders delay between 1-10 seconds + delay = np.random.uniform(1, 5, 1)[0] + sleep(delay) + + def barrier(self, name: Optional[str] = None): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + # TODO: Refactor This! Not sure we still need the whole method here. 
Should be dione with some additional setup and cleaning logic + def ddp_train(self, process_idx, model): + """ + Entry point for ddp + + Args: + process_idx: + mp_queue: multiprocessing queue + model: + + Returns: + Dict with evaluation results + + """ + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + # show progressbar only on progress_rank 0 + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + # determine which process we are and world size + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.trainer.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + model.trainer = self.trainer + self.init_ddp_connection( + self.trainer.global_rank, + self.trainer.world_size, + self.trainer.is_slurm_managing_tasks + ) + + # call setup after the ddp process has connected + self.trainer.call_setup_hook(model) + + # on world_size=0 let everyone know training is starting + if self.trainer.is_global_zero and not torch.distributed.is_initialized(): + log.info('-' * 100) + log.info(f'distributed_backend={self.trainer.distributed_backend}') + log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') + log.info('-' * 100) + + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_batchnorm: + model = self.configure_sync_batchnorm(model) + + # move the model to the correct device + self.model_to_device(model, process_idx) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(model) + + # 16-bit + model = self.trainer.precision_connector.connect(model) + + # device ids change depending on the DDP setup + device_ids = self.get_device_ids() + + # allow user to configure ddp + model = self.configure_ddp(model, device_ids) + + # set up training routine + self.barrier('ddp_setup') + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + + # clean up memory + torch.cuda.empty_cache() + + return results + + def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.trainer.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def configure_ddp( + self, model: LightningModule, device_ids: List[int] + ) -> LightningDistributedDataParallel: + """ + Pass through all customizations from constructor to `LightningDistributedDataParallel`. + Override to define a custom DDP implementation. + + .. 
note:: Only requirement is that your DDP implementation subclasses LightningDistributedDataParallel + + + The default implementation is:: + + def configure_ddp(self, model, device_ids): + model = LightningDistributedDataParallel( + model, device_ids=device_ids, find_unused_parameters=True + ) + return model + + Args: + model: the lightningModule + device_ids: the list of devices available + + Returns: + the model wrapped in LightningDistributedDataParallel + + """ + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get( + "find_unused_parameters", True + ) + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + **self._ddp_kwargs, + ) + return model + + def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + """ + + """ + return sync_ddp_if_available(tensor, group, reduce_op) From fe7573f812d8783a3d9ea91658687f174e56ef38 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:39 +0100 Subject: [PATCH 004/157] add initial precision draft --- pytorch_lightning/accelerators/precision.py | 150 ++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 pytorch_lightning/accelerators/precision.py diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py new file mode 100644 index 0000000000000..19a375272e95f --- /dev/null +++ b/pytorch_lightning/accelerators/precision.py @@ -0,0 +1,150 @@ +from pytorch_lightning.accelerators.scheduler_properties import reinit_scheduler_properties +from pytorch_lightning.core.lightning import LightningModule +from typing import List, Tuple +import torch +from torch.optim import Optimizer + +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities import AMPType, rank_zero_warn + +try: + from apex import amp +except ImportError: + amp = None + + +class PrecisionPlugin(object): + EPSILON = 1e-6 + precision = 32 + + def pre_optimizer_step(self, optimizer, optiizer_idx): + pass + + def post_optimizer_step(self, optimizer, optimizer_idx): + pass + + def master_params(self, optimizer): + for group in optimizer.param_groups: + for p in group["params"]: + yield p + + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + return model, optimizers, lr_schedulers + + +class MixedPrecisionPlugin(PrecisionPlugin): + EPSILON = 1e-5 + backend: AMPType + precision = "mixed" + + +class NativeMixedPrecisionPlugin(MixedPrecisionPlugin): + def __init__(self): + self.backend = AMPType.NATIVE + self.scaler = torch.cuda.amp.GradScaler() + + def pre_optimizer_step(self, optimizer, optimizer_idx): + if isinstance(optimizer, torch.optim.LBFGS): + raise MisconfigurationException( + f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." 
+ " To request, please file a Github issue in PyTorch and tag @mcarilli" + ) + + def post_optimizer_step(self, optimizer, optimizer_idx): + self.scaler.update() + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + closure_loss = self.scaler.scale(closure_loss) + + # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) + automatic_optimization = self.trainer.train_loop.automatic_optimization + + # do backward pass + if automatic_optimization: + model = self.trainer.get_model() + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + + # unscale gradient to allow analyze within `on_after_backward` + # TODO: Check from where we can get the should_accumulate value (maybe pass it as argument?) + if not self.trainer.train_loop.should_accumulate() and automatic_optimization: + self.scaler.unscale_(optimizer) + + return closure_loss + + +class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): + def __init__(self): + self.backend = AMPType.APEX + + def connect(self, model, optimizers, lr_schedulers): + model, optimizers = self.configure_apex(amp, model, optimizers, self.trainer.amp_level) + reinit_scheduler_properties(optimizers, lr_schedulers) + return model, optimizers, lr_schedulers + + def training_step(self, fx, args): + output = fx(args) + return output + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + closure_loss = amp.scale_loss(closure_loss, optimizer) + + # enter apex context + context = closure_loss + closure_loss = closure_loss.__enter__() + + # do backward pass + if self.trainer.train_loop.automatic_optimization: + model = self.trainer.get_model() + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # exit amp context + a, b, c = None, None, None + error = context.__exit__(a, b, c) + if error: + rank_zero_warn(a, b, c) + raise Exception("apex unscale error") + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + return closure_loss + + def configure_apex( + self, + amp: object, + model: LightningModule, + optimizers: List[Optimizer], + amp_level: str, + ) -> Tuple[LightningModule, List[Optimizer]]: + r""" + Override to init AMP your own way. + Must return a model and list of optimizers. + + Args: + amp: pointer to amp library object. + model: pointer to current :class:`LightningModule`. + optimizers: list of optimizers passed in :meth:`configure_optimizers`. + amp_level: AMP mode chosen ('O1', 'O2', etc...) + + Return: + Apex wrapped model and optimizers + + Examples: + .. code-block:: python + + # Default implementation used by Trainer. 
+ def configure_apex(self, amp, model, optimizers, amp_level): + model, optimizers = amp.initialize( + model, optimizers, opt_level=amp_level, + ) + + return model, optimizers + """ + model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) + return model, optimizers \ No newline at end of file From 9fd48a1cdf7d9946b74e6d4b6e04c75a2d52869d Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 9 Nov 2020 17:19:48 +0100 Subject: [PATCH 005/157] scheduler helper functions --- .../accelerators/scheduler_properties.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 pytorch_lightning/accelerators/scheduler_properties.py diff --git a/pytorch_lightning/accelerators/scheduler_properties.py b/pytorch_lightning/accelerators/scheduler_properties.py new file mode 100644 index 0000000000000..6835df4499385 --- /dev/null +++ b/pytorch_lightning/accelerators/scheduler_properties.py @@ -0,0 +1,25 @@ +from torch import optim + + +def reinit_scheduler_properties(self, optimizers: list, schedulers: list): + # Reinitialize optimizer.step properties added by schedulers + for scheduler in schedulers: + scheduler = scheduler['scheduler'] + + for optimizer in optimizers: + state = None + idx = 0 + + # check that we dont mix users optimizers and schedulers + if scheduler.optimizer == optimizer: + # Find the mro belonging to the base lr scheduler class + for i, mro in enumerate(scheduler.__class__.__mro__): + if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): + idx = i + state = scheduler.state_dict() + else: + state = None + + scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) + if state is not None: + scheduler.load_state_dict(state) \ No newline at end of file From b961aaf054bda242a361cba30d31ae776588b029 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:58:23 +0100 Subject: [PATCH 006/157] define base plugin api --- pytorch_lightning/accelerators/base_plugin.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 pytorch_lightning/accelerators/base_plugin.py diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py new file mode 100644 index 0000000000000..acd90e41f60df --- /dev/null +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -0,0 +1,31 @@ +import contextlib +import torch + +class Plugin(object): + + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + return model, optimizers, lr_schedulers + + def pre_optimizer_step(self, optimizer, optiizer_idx): + pass + + def post_optimizer_step(self, optimizer, optimizer_idx): + pass + + def pre_training(self): + pass + + def post_training(self): + pass + + @contextlib.contextmanager + def train_step_context(self): + yield + + @contextlib.contextmanager + def val_step_context(self): + yield + + @contextlib.contextmanager + def test_step_context(self): + yield \ No newline at end of file From 532ad5dcaeb6599629b4e33aa87b30292b8508f0 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:58:32 +0100 Subject: [PATCH 007/157] base plugin integration --- pytorch_lightning/accelerators/accelerator.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3d1b5038dcc20..ccfc093fde5a5 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,3 +1,5 @@ +from 
pytorch_lightning.accelerators.data_parallel import ParallelPlugin +from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import AMPType from typing import Any, Union @@ -20,10 +22,12 @@ def __init__( model_ref: LightningModule, root_device: Union[str, torch.device], precision_plugin: PrecisionPlugin, + parallel_plugin: ParallelPlugin, gradient_clip_val, ): self.model_ref = model_ref self.precision_plugin = precision_plugin + self.parallel_plugin = parallel_plugin self.gradient_clip_val = gradient_clip_val self.optimizers = None @@ -33,7 +37,8 @@ def __init__( def setup(self, model): self.setup_optimizers(model) - self.connect_precision_plugin() + self.connect_plugin(self.precision_plugin) + self.connect_plugin(self.parallel_plugin) def teardown(self): pass @@ -49,29 +54,27 @@ def training_step(self, args): args[0] = batch - return self.model_ref.training_step(*args) + with self.precision_plugin.train_step_context(): + with self.parallel_plugin.train_step_context(): + return self.model_ref.training_step(*args) def validation_step(self, args): batch = self.to_device(args[0]) args[0] = batch - return self.model_ref.validation_step(*args) + with self.precision_plugin.val_step_context(): + with self.parallel_plugin.val_step_context(): + return self.model_ref.validation_step(*args) def test_step(self, args): batch = self.to_device(args[0]) args[0] = batch - return self.model_ref.test_step(*args) - def training_step_end(self, output): - return output - - def test_step_end(self, output): - return output - - def validation_step_end(self, output): - return output + with self.precision_plugin.test_step_context(): + with self.parallel_plugin.test_step_context(): + return self.model_ref.test_step(*args) def process_dataloader(self, dataloader): return dataloader @@ -167,8 +170,8 @@ def setup_optimizers(self, model): self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_precision_plugin(self): - model, optimizers, schedulers = self.precision_plugin.connect( + def connect_plugin(self, plugin: Plugin): + model, optimizers, schedulers = plugin.connect( self.model_ref, self.optimizers, self.lr_schedulers ) @@ -176,6 +179,7 @@ def connect_precision_plugin(self): self.optimizers = optimizers self.schedulers = schedulers + def to_device(self, batch): return self.batch_to_device(batch, self.root_device) From f52ad64e5c233aabb664d80bc899bacc1dacfcce Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:58:50 +0100 Subject: [PATCH 008/157] continue ddp plugin --- .../accelerators/data_parallel.py | 379 ++++++++++++++++-- 1 file changed, 344 insertions(+), 35 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 9a6481c65c5db..e506041384ad3 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,4 +1,7 @@ from abc import ABC, abstractmethod +from contextlib import contextmanager +from os import stat +from pytorch_lightning.accelerators.base_plugin import Plugin from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule @@ -19,6 +22,8 @@ import numpy as np import torch.distributed as torch_distrib from pytorch_lightning import _logger as log +import contextlib +import torch.multiprocessing as mp from 
pytorch_lightning.utilities.distributed import sync_ddp_if_available @@ -38,12 +43,15 @@ class ReduceOp: SUM = None -class ParallelPlugin(ABC): - def __init__(self): +class TrainingTypePlugin(Plugin, ABC): + def __init__(self, logger=None): self.model = None + self.global_rank = 0 + self.logger = logger @abstractmethod - def reduce(self, output): + @property + def on_gpu(self): raise NotImplementedError @abstractmethod @@ -51,12 +59,86 @@ def reduce(self, output): def root_device(self): raise NotImplementedError + @abstractmethod + def model_to_device(self): + raise NotImplementedError -class DataParallelPlugin(ParallelPlugin): - def __init__(self, parallel_device_ids): - super().__init__() + @abstractmethod + @property + def is_global_zero(self): + raise NotImplementedError + + @abstractmethod + def barrier(self): + raise NotImplementedError + +class SingleDevicePlugin(TrainingTypePlugin): + def __init__(self, device, logger=None): + super().__init__(logger=logger) + self.device: torch.device = device + + @property + def on_gpu(self): + return self.device.type == "cuda" and torch.cuda.is_available() + + def reduce(self, output): + return output + + @property + def root_device(self): + return self.device + + def model_to_device(self): + self.model.to(self.root_device) + + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + self.model = model + + @property + def is_global_zero(self): + return True + + def barrier(self): + pass + + + +class ParallelPlugin(TrainingTypePlugin, ABC): + def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): + super().__init__(logger=logger) self.parallel_device_ids = parallel_device_ids + self.local_rank = 0 + self.world_size = 1 + self.cluster_environment = cluster_environment + @abstractmethod + def reduce(self, output): + raise NotImplementedError + + @abstractmethod + @property + def root_device(self): + raise NotImplementedError + + @property + def on_gpu(self): + return self.parallel_device_ids and torch.cuda.is_available() + + @abstractmethod + def setup(self, model): + raise NotImplementedError + + def connect(self, model): + self.setup(model) + + return self.model + + @property + def is_global_zero(self) -> bool: + return self.global_rank == 0 + + +class DataParallelPlugin(ParallelPlugin): def setup(self, model): self.model = LightningDataParallel(model, self.parallel_device_ids) @@ -73,16 +155,252 @@ def reduce(self, output): def root_device(self): return self.parallel_device_ids[0] + def barrier(self): + pass + + +class DDPPlugin(ParallelPlugin): + + distributed_backend = "ddp" + + def __init__(self, parallel_device_ids, logger=None, cluster_environment=None) -> None: + super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + self._has_spawned_children = False + self.interactive_ddp_procs = [] + self.dist = LightningDistributed() + + @property + def root_device(self): + return self.parallel_device_ids[self.local_rank] + + def setup(self, model): + + self.model = model + + # start the other scripts + if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": + self._call_children_scripts() + + # set the task idx + self.task_idx = int(os.environ["LOCAL_RANK"]) + + def _call_children_scripts(self): + + # bookkeeping of spawned processes + assert self.global_rank == 0 + self._check_can_spawn_children() + self._has_spawned_children = True + + # DDP Environment variables + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") + 
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) + + # allow the user to pass the node rank + node_rank = "0" + node_rank = os.environ.get("NODE_RANK", node_rank) + node_rank = os.environ.get("GROUP_RANK", node_rank) + os.environ["NODE_RANK"] = node_rank + os.environ["LOCAL_RANK"] = "0" + + # when user is using hydra find the absolute path + path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path + + # pull out the commands used to run the script and resolve the abs file path + command = sys.argv + try: + full_path = path_lib(command[0]) + except Exception as e: + full_path = abspath(command[0]) + + command[0] = full_path + # use the same python interpreter and actually running + command = [sys.executable] + command + + # the visible devices tell us how many GPUs we want to use. + # when the trainer script was called the device has already been scoped by the time + # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone + # but forward the GPUs selected via environment variables + if self.parallel_device_ids is None: + raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") + + os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" + + if self.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.logger.version) + + num_gpus = len(self.data_parallel_device_ids) + # TODO: Add num_nodes (pass it in?) + os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" + + self.interactive_ddp_procs = [] + + # TODO: Add num_processes (pass it in?) + for local_rank in range(1, self.num_processes): + env_copy = os.environ.copy() + env_copy["LOCAL_RANK"] = f"{local_rank}" + + # remove env var if global seed not set + if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: + del env_copy["PL_GLOBAL_SEED"] + + # start process + # if hydra is available and initialized, make sure to set the cwd correctly + cwd: Optional[str] = None + if HYDRA_AVAILABLE: + if HydraConfig.initialized(): + cwd = get_original_cwd() + proc = subprocess.Popen(command, env=env_copy, cwd=cwd) + self.interactive_ddp_procs.append(proc) + + # starting all processes at once can cause issues + # with dataloaders delay between 1-10 seconds + delay = np.random.uniform(1, 5, 1)[0] + sleep(delay) + + def _check_can_spawn_children(self): + if self._has_spawned_children: + raise RuntimeError( + "You tried to run `.fit` or `.test` multiple times in the same script." + " This is not supported in DDP mode, switch to `distributed_backend='ddp_spawn'` instead." + ) + + def set_world_ranks(self): + self.local_rank = self.task_idx + self.global_rank = self.node_rank * self.num_processes + self.task_idx + self.world_size = self.num_nodes * self.num_processes -class DistributedDataParallelPlugin(ParallelPlugin): + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self.model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def determine_ddp_device_ids(self): + return [self.root_device] + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + # TODO: From where to get cluster environment? 
+ os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def pre_training(self): + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + # show progressbar only on progress_rank 0 + # TODO: check where to move this. Cannot stay here, since we won't have access to progressbar here + if (self.node_rank != 0 or self.task_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + # determine which process we are and world size + self.set_world_ranks() + + # set warning rank + rank_zero_only.rank = self.global_rank + + # TODO: This has to be done somewhere else! + self.model.trainer = self.trainer + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + self.configure_ddp() + + self.barrier() + + def post_training(self): + torch.cuda.empty_cache() + + if "WORLD_SIZE" in os.environ: + del os.environ["WORLD_SIZE"] + + @staticmethod + def configure_sync_batchnorm(model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + + def barrier(self): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def model_to_device(self): + # TODO: Can we easily make this a property that falls back here? 
+ # self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank] + torch.cuda.set_device(self.root_device) + self.model.cuda(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None): + + if isinstance(output, torch.Tensor): + output = sync_ddp_if_available(output, group, reduce_op) + + return output + + + + + + + + + + + + + + + + + + + +class MidDistributedDataParallelPlugin(ParallelPlugin): def __init__(self, parallel_device_ids, num_nodes, num_processes, **ddp_kwargs): - super().__init__(self) + super().__init__(parallel_device_ids) self.task_idx = None self._has_spawned_children = False self.interactive_ddp_procs = [] self.dist = LightningDistributed() - self.parallel_device_ids = parallel_device_ids self.num_nodes = num_nodes self.num_processes = num_processes self._ddp_kwargs: Dict[str, Any] = ddp_kwargs @@ -128,15 +446,14 @@ def _call_children_scripts(self): # when the trainer script was called the device has already been scoped by the time # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone # but forward the GPUs selected via environment variables - if self.trainer.data_parallel_device_ids is None: + if self.parallel_device_ids is None: raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - # TODO: Change t - if self.trainer.logger is not None: - os.environ["PL_EXP_VERSION"] = str(self.trainer.logger.version) + if self.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.logger.version) num_gpus = len(self.parallel_device_ids) os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" @@ -186,6 +503,7 @@ def ddp_train(self, process_idx, model): if seed is not None: seed_everything(int(seed)) + # TODO: move this somewhere else! # show progressbar only on progress_rank 0 if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() @@ -201,9 +519,7 @@ def ddp_train(self, process_idx, model): # where to store ip_table model.trainer = self.trainer self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks + self.trainer.global_rank, self.trainer.world_size, self.trainer.is_slurm_managing_tasks ) # call setup after the ddp process has connected @@ -211,10 +527,10 @@ def ddp_train(self, process_idx, model): # on world_size=0 let everyone know training is starting if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend}') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) + log.info("-" * 100) + log.info(f"distributed_backend={self.trainer.distributed_backend}") + log.info(f"All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes") + log.info("-" * 100) # call sync_bn before .cuda(), configure_apex and configure_ddp if self.trainer.sync_batchnorm: @@ -240,7 +556,7 @@ def ddp_train(self, process_idx, model): model = self.configure_ddp(model, device_ids) # set up training routine - self.barrier('ddp_setup') + self.barrier("ddp_setup") self.trainer.train_loop.setup_training(model) # train or test @@ -255,15 +571,13 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.trainer.on_gpu else "gloo" + torch_backend = "nccl" if self.on_gpu else "gloo" if not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> LightningDistributedDataParallel: + def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningDistributedDataParallel: """ Pass through all customizations from constructor to `LightningDistributedDataParallel`. Override to define a custom DDP implementation. @@ -288,9 +602,7 @@ def configure_ddp(self, model, device_ids): """ # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get( - "find_unused_parameters", True - ) + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) model = LightningDistributedDataParallel( model, device_ids=device_ids, @@ -315,11 +627,8 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: return model - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - """ - - """ + def sync_tensor( + self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None + ) -> torch.Tensor: + """""" return sync_ddp_if_available(tensor, group, reduce_op) From bcfb4e7cb723ddc3e1dbdce14bff086a4e95d0de Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:59:06 +0100 Subject: [PATCH 009/157] minor changes precision plugin --- pytorch_lightning/accelerators/precision.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index 19a375272e95f..0b53e3addbbd7 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -1,3 +1,4 @@ +from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.accelerators.scheduler_properties import reinit_scheduler_properties from pytorch_lightning.core.lightning import LightningModule from typing import List, Tuple @@ -13,7 +14,7 @@ amp = None -class PrecisionPlugin(object): +class PrecisionPlugin(Plugin): EPSILON = 1e-6 precision = 32 From bf8a87a659d5b8218bba872e188caedf2c013a21 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 11 Nov 2020 16:59:30 +0100 Subject: [PATCH 010/157] start ddp plugin --- .../accelerators/data_parallel.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git 
a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index e506041384ad3..50c27a1722ac4 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -375,6 +375,25 @@ def reduce(self, output, group: Optional[Any] = None, return output +class DDPSpawnPlugin(ParallelPlugin): + def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): + super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + + self.dist = LightningDistributed() + # TODO: how to get in nprocs? probably pass it + self.nprocs = nprocs + self.mp_queue = None + + def setup(self, model): + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) + + # pass in a state q + smp = mp.get_context('spawn') + self.mp_queue = smp.SimpleQueue() + + def pre_training(self, process_idx = None, mp_queue=None, ): + # TODO: use a mixture of os.fork and multiprocesing queue for ddp here + os.fork() From 8482c0b68976817ce3562bcb52fc49da673548f6 Mon Sep 17 00:00:00 2001 From: justusschock Date: Thu, 12 Nov 2020 17:26:38 +0100 Subject: [PATCH 011/157] initail version ddp spawn --- pytorch_lightning/accelerators/base_plugin.py | 4 +- .../accelerators/data_parallel.py | 151 +++++++++++++++++- 2 files changed, 146 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index acd90e41f60df..1fdae7270fe47 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -5,7 +5,7 @@ class Plugin(object): def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): return model, optimizers, lr_schedulers - + def pre_optimizer_step(self, optimizer, optiizer_idx): pass @@ -15,7 +15,7 @@ def post_optimizer_step(self, optimizer, optimizer_idx): def pre_training(self): pass - def post_training(self): + def post_training(self, results, best_model_path): pass @contextlib.contextmanager diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 50c27a1722ac4..0ef2987804450 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,6 +1,8 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from os import stat +import re +from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin from torch.nn.parallel.distributed import DistributedDataParallel @@ -24,8 +26,7 @@ from pytorch_lightning import _logger as log import contextlib import torch.multiprocessing as mp - -from pytorch_lightning.utilities.distributed import sync_ddp_if_available +from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn try: from hydra.utils import to_absolute_path, get_original_cwd @@ -267,6 +268,7 @@ def _check_can_spawn_children(self): def set_world_ranks(self): self.local_rank = self.task_idx + # TODO: check from where we get node_rank and num_processes self.global_rank = self.node_rank * self.num_processes + self.task_idx self.world_size = self.num_nodes * self.num_processes @@ -315,8 +317,12 @@ def pre_training(self): # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table + # TODO: CHeck is_slurm_managing_tasks 
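The `is_slurm_managing_tasks` flag passed to `init_ddp_connection` below is referenced by the TODO above but never defined in these patches. A rough sketch of how it could be derived from the SLURM environment, assuming the standard SLURM_NTASKS / SLURM_JOB_NAME variables (illustration only, not part of this patch):

    import os

    def is_slurm_managing_tasks(num_nodes: int, num_processes: int) -> bool:
        # SLURM is treated as managing the DDP processes when srun launched
        # exactly as many tasks as DDP expects and the job is not an
        # interactive shell session
        num_slurm_tasks = int(os.environ.get("SLURM_NTASKS", 0))
        job_name = os.environ.get("SLURM_JOB_NAME")
        return num_slurm_tasks == num_nodes * num_processes and job_name != "bash"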
self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + # TODO: Move this somewhere else + self.trainer.call_setup_hook(self.model) + # on world_size=0 let everyone know training is starting if self.is_global_zero and not torch.distributed.is_initialized(): log.info("-" * 100) @@ -329,11 +335,15 @@ def pre_training(self): # move the model to the correct device self.model_to_device() + # TODO: Check where this can be moved + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(self.model) + self.configure_ddp() self.barrier() - def post_training(self): + def post_training(self, results, best_model_path): torch.cuda.empty_cache() if "WORLD_SIZE" in os.environ: @@ -375,14 +385,17 @@ def reduce(self, output, group: Optional[Any] = None, return output + class DDPSpawnPlugin(ParallelPlugin): - def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): + def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, proc_offset=0): super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + self.process_idx = None self.dist = LightningDistributed() # TODO: how to get in nprocs? probably pass it self.nprocs = nprocs self.mp_queue = None + self.proc_offset = proc_offset def setup(self, model): os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) @@ -391,13 +404,137 @@ def setup(self, model): smp = mp.get_context('spawn') self.mp_queue = smp.SimpleQueue() - def pre_training(self, process_idx = None, mp_queue=None, ): - # TODO: use a mixture of os.fork and multiprocesing queue for ddp here - os.fork() + def set_world_ranks(self): + self.local_rank = self.process_idx + # check from where we get node_rank, num_processes and num_nodes + self.global_rank = self.node_rank * self.num_processes + self.self.process_idx + self.world_size = self.num_nodes * self.num_processes + + def pre_training(self): + + # TODO: Check if current process can be used as one training proc + # start from one since current process is proc 0 + for proc_idx in range(1, self.nprocs): + # use os.fork, since this enables us to continue from here + # instead of spawning with separate function + pid = os.fork() + + # set in child processes (PID=0). All previous child processes + # should already have their process_idx assigned + if pid == 0 and self.process_idx is None: + self.process_idx = proc_idx + self.proc_offset + + # set process idx for current process + if pid != 0: + self.process_idx = 0 + self.proc_offset + + # TODO: Check where to put that since we don't have access to the pbar here + # show progressbar only on progress_rank 0 + if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + self.set_world_ranks() + # set warning rank + rank_zero_only.rank = self.global_rank + # TODO: This has to be done somewhere else! 
+ self.model.trainer = self.trainer + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + # TODO: CHeck is_slurm_managing_tasks + self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + + # TODO: Move this somewhere else + self.trainer.call_setup_hook(self.model) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + # TODO: Check where this can be moved + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(self.model) + + self.configure_ddp() + + self.barrier() + + def post_training(self, results, best_model_path): + # get original model + # TODO: How To get this? is this simply self.model? + model = self.trainer.get_model() + + # persist info in ddp_spawn + self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) + # clean up memory + torch.cuda.empty_cache() + + if self.process_idx == 0: + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() + + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(model, best_path, last_path) + + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self.model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def determine_ddp_device_ids(self): + return [self.root_device] + def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): + + if self.global_rank == 0 and self.mp_queue is not None: + rank_zero_warn('cleaning up ddp environment...') + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) + self.mp_queue.put(results) + + # save the last weights + last_path = None + # TODO: From where to get self.trainer.testing? + if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) + atomic_save(self.model.state_dict(), last_path) + self.mp_queue.put(last_path) + + + def __recover_child_process_weights(self, model, best_path, last_path): + # TODO: Where can we set this? + # transfer back the best path to the trainer + if self.trainer.checkpoint_callback: + self.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also best score + + # load last weights + # TODO: How to get self.trainer.testing? + if last_path is not None and not self.trainer.testing: + ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + # TODO: Where to set this? + # Do we really need to set this or can we just make the trainer property forward our current property here? 
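One possible answer to the question in the comment above is to leave the attribute on the plugin and have the trainer forward to it, since the training type plugin now owns the (possibly wrapped) model. A minimal sketch, assuming the trainer keeps a reference to its accelerator (not part of this patch):

    class Trainer:
        # only the forwarding property is sketched here
        @property
        def model(self):
            # the training type plugin owns the model and any DDP/DP wrapper
            return self.accelerator.training_type_plugin.model

        @model.setter
        def model(self, model):
            self.accelerator.training_type_plugin.model = model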
+ self.trainer.model = model From 12d2c59dc3e5110ed5caf840aa3200550ab70724 Mon Sep 17 00:00:00 2001 From: justusschock Date: Thu, 12 Nov 2020 17:27:31 +0100 Subject: [PATCH 012/157] remove deprecated implementation --- .../accelerators/data_parallel.py | 252 ------------------ 1 file changed, 252 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 0ef2987804450..fc5c2958f1af1 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -536,255 +536,3 @@ def __recover_child_process_weights(self, model, best_path, last_path): # Do we really need to set this or can we just make the trainer property forward our current property here? self.trainer.model = model - - - - - - - - - - - - - -class MidDistributedDataParallelPlugin(ParallelPlugin): - def __init__(self, parallel_device_ids, num_nodes, num_processes, **ddp_kwargs): - super().__init__(parallel_device_ids) - - self.task_idx = None - self._has_spawned_children = False - self.interactive_ddp_procs = [] - self.dist = LightningDistributed() - self.num_nodes = num_nodes - self.num_processes = num_processes - self._ddp_kwargs: Dict[str, Any] = ddp_kwargs - - def setup(self, model): - # start the other scripts - if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": - self._call_children_scripts() - - # set the task idx - self.task_idx = int(os.environ["LOCAL_RANK"]) - - def _call_children_scripts(self): - assert self.trainer.global_rank == 0 - self._check_can_spawn_children() - self._has_spawned_children = True - - os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") - os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) - - # allow the user to pass the node rank - node_rank = "0" - node_rank = os.environ.get("NODE_RANK", node_rank) - node_rank = os.environ.get("GROUP_RANK", node_rank) - os.environ["NODE_RANK"] = node_rank - os.environ["LOCAL_RANK"] = "0" - - # when user is using hydra find the absolute path - path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path - - # pull out the commands used to run the script and resolve the abs file path - command = sys.argv - try: - full_path = path_lib(command[0]) - except Exception as e: - full_path = abspath(command[0]) - - command[0] = full_path - # use the same python interpreter and actually running - command = [sys.executable] + command - - # the visible devices tell us how many GPUs we want to use. - # when the trainer script was called the device has already been scoped by the time - # code reaches this point. 
so, to call the scripts, we need to leave cuda visible devices alone - # but forward the GPUs selected via environment variables - if self.parallel_device_ids is None: - raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") - - os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) - os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - - if self.logger is not None: - os.environ["PL_EXP_VERSION"] = str(self.logger.version) - - num_gpus = len(self.parallel_device_ids) - os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" - - self.interactive_ddp_procs = [] - for local_rank in range(1, self.num_processes): - env_copy = os.environ.copy() - env_copy["LOCAL_RANK"] = f"{local_rank}" - - # remove env var if global seed not set - if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: - del env_copy["PL_GLOBAL_SEED"] - - # start process - # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = None - if HYDRA_AVAILABLE: - if HydraConfig.initialized(): - cwd = get_original_cwd() - proc = subprocess.Popen(command, env=env_copy, cwd=cwd) - self.interactive_ddp_procs.append(proc) - - # starting all processes at once can cause issues - # with dataloaders delay between 1-10 seconds - delay = np.random.uniform(1, 5, 1)[0] - sleep(delay) - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - # TODO: Refactor This! Not sure we still need the whole method here. Should be dione with some additional setup and cleaning logic - def ddp_train(self, process_idx, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - Dict with evaluation results - - """ - seed = os.environ.get("PL_GLOBAL_SEED") - if seed is not None: - seed_everything(int(seed)) - - # TODO: move this somewhere else! - # show progressbar only on progress_rank 0 - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, self.trainer.world_size, self.trainer.is_slurm_managing_tasks - ) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.trainer.distributed_backend}") - log.info(f"All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes") - log.info("-" * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.barrier("ddp_setup") - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - - return results - - def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: - os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.on_gpu else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - - def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningDistributedDataParallel: - """ - Pass through all customizations from constructor to `LightningDistributedDataParallel`. - Override to define a custom DDP implementation. - - .. note:: Only requirement is that your DDP implementation subclasses LightningDistributedDataParallel - - - The default implementation is:: - - def configure_ddp(self, model, device_ids): - model = LightningDistributedDataParallel( - model, device_ids=device_ids, find_unused_parameters=True - ) - return model - - Args: - model: the lightningModule - device_ids: the list of devices available - - Returns: - the model wrapped in LightningDistributedDataParallel - - """ - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - model = LightningDistributedDataParallel( - model, - device_ids=device_ids, - **self._ddp_kwargs, - ) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. 
- - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor( - self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None - ) -> torch.Tensor: - """""" - return sync_ddp_if_available(tensor, group, reduce_op) From 8d83db883f7316df4f5fc4a339809ac1751fa0b1 Mon Sep 17 00:00:00 2001 From: justusschock Date: Thu, 12 Nov 2020 17:28:21 +0100 Subject: [PATCH 013/157] add comment on whats missing --- pytorch_lightning/accelerators/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index fc5c2958f1af1..2c7f9ae4c5924 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -536,3 +536,4 @@ def __recover_child_process_weights(self, model, best_path, last_path): # Do we really need to set this or can we just make the trainer property forward our current property here? self.trainer.model = model +# STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file From 22e1e31ef84e5991d536711bbb5bc7e9779375f9 Mon Sep 17 00:00:00 2001 From: justusschock Date: Fri, 20 Nov 2020 11:16:09 +0100 Subject: [PATCH 014/157] latest state --- pytorch_lightning/accelerators/accelerator.py | 27 +- .../accelerators/accelerator_connector.py | 249 ++++++++++++++++++ pytorch_lightning/accelerators/base_plugin.py | 6 +- .../accelerators/data_parallel.py | 111 ++++++-- pytorch_lightning/accelerators/precision.py | 16 +- 5 files changed, 360 insertions(+), 49 deletions(-) create mode 100644 pytorch_lightning/accelerators/accelerator_connector.py diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index ccfc093fde5a5..21e0f191e384e 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.data_parallel import ParallelPlugin +from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import AMPType @@ -15,30 +15,31 @@ class NewAccelerator(object): - root_device: Union[str, torch.device] def __init__( self, model_ref: LightningModule, - root_device: Union[str, torch.device], precision_plugin: PrecisionPlugin, - parallel_plugin: ParallelPlugin, + training_type_plugin: TrainingTypePlugin, gradient_clip_val, ): self.model_ref = model_ref self.precision_plugin = precision_plugin - self.parallel_plugin = parallel_plugin + self.training_type_plugin = training_type_plugin self.gradient_clip_val = gradient_clip_val self.optimizers = None self.lr_schedulers = None self.optimizer_frequencies = None - self.root_device = root_device def setup(self, model): + self.connect_training_type_plugin() self.setup_optimizers(model) - self.connect_plugin(self.precision_plugin) - self.connect_plugin(self.parallel_plugin) + self.connect_precision_plugin() + + @property + def root_device(self): + return self.training_type_plugin.root_device def teardown(self): pass @@ -55,7 +56,7 @@ def training_step(self, args): args[0] = batch with self.precision_plugin.train_step_context(): - with 
self.parallel_plugin.train_step_context(): + with self.training_type_plugin.train_step_context(): return self.model_ref.training_step(*args) def validation_step(self, args): @@ -64,7 +65,7 @@ def validation_step(self, args): args[0] = batch with self.precision_plugin.val_step_context(): - with self.parallel_plugin.val_step_context(): + with self.training_type_plugin.val_step_context(): return self.model_ref.validation_step(*args) def test_step(self, args): @@ -73,7 +74,7 @@ def test_step(self, args): args[0] = batch with self.precision_plugin.test_step_context(): - with self.parallel_plugin.test_step_context(): + with self.training_type_plugin.test_step_context(): return self.model_ref.test_step(*args) def process_dataloader(self, dataloader): @@ -170,9 +171,9 @@ def setup_optimizers(self, model): self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_plugin(self, plugin: Plugin): + def connect_training_type_plugin(self, plugin: Plugin): model, optimizers, schedulers = plugin.connect( - self.model_ref, self.optimizers, self.lr_schedulers + self.model_ref ) self.model_ref = model diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py new file mode 100644 index 0000000000000..d9a111f355e68 --- /dev/null +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -0,0 +1,249 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pytorch_lightning import accelerators +import os +import torch + +from pytorch_lightning.utilities import device_parser +from pytorch_lightning.utilities import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning import _logger as log +from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment +from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment +from pytorch_lightning.accelerators.accelerator import Accelerator + +try: + import torch_xla +except ImportError: + XLA_AVAILABLE = False +else: + XLA_AVAILABLE = True + +try: + import horovod.torch as hvd +except (ModuleNotFoundError, ImportError): + HOROVOD_AVAILABLE = False +else: + HOROVOD_AVAILABLE = True + + +class BackendConnector(object): + def __init__( + self, + num_processes, + tpu_cores, + accelerator, + distributed_backend, + auto_select_gpus, + gpus, + num_nodes, + log_gpu_memory, + sync_batchnorm, + benchmark, + replace_sampler_ddp, + deterministic, + ): + + # initialization + self.use_dp = False + self.use_ddp = False + self.use_ddp2 = False + self.use_horovod = False + self.use_single_gpu = False + self.num_gpus = None + + self.num_processes = num_processes + self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + self.accelerator = accelerator + self.distributed_backend = distributed_backend + self.auto_select_gpus = auto_select_gpus + self.gpus = gpus + self.num_nodes = num_nodes + self.log_gpu_memory = log_gpu_memory + self.sync_batchnorm = sync_batchnorm + self.benchmark = benchmark + self.replace_sampler_ddp = replace_sampler_ddp + self.deterministic = deterministic + + # init the default rank if exists + # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks + # this way we only show it on rank 0 + if 'LOCAL_RANK' in os.environ: + rank_zero_only.rank = int(os.environ['LOCAL_RANK']) + + # TODO: Move autoselect GPUS to other place + # for gpus allow int, string and gpu list + # if auto_select_gpus and isinstance(gpus, int): + # self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus) + + self.parallel_devices = device_parser.parse_gpu_ids(self.gpus) + self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_devices) + self.root_device = torch.device("cpu") + + self.set_distributed_mode() + + # override dist backend when using tpus + if self.on_tpu: + self.distributed_backend = "tpu" + self.use_tpu = True + + # init flags for SLURM+DDP to work + self.world_size = 1 + self.interactive_ddp_procs = [] + + # link up SLURM + # TODO: this should be taken out of here... 
but depends too much on DDP + self.slurm_connector.on_trainer_init(self.num_nodes) + self.node_rank = self.determine_ddp_node_rank() + self.local_rank = self.determine_local_rank() + self.global_rank = 0 + + # NVIDIA setup + self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) + + self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') + + self.replace_sampler_ddp = replace_sampler_ddp + + @property + def on_tpu(self): + return self.tpu_cores is not None + + @property + def tpu_id(self): + if self.on_tpu: + return self.tpu_cores[0] + + return None + + @property + def on_gpu(self): + return self.parallel_devices and torch.cuda.is_available() + + def set_distributed_mode(self): + + # No distributed backend + if self.distributed_backend is None: + # horovod multi GPU + if self.has_horovodrun(): + self._set_horovod_backend() + + # DDP CPU + elif self.num_gpus == 0: + if self.num_nodes > 1 or self.num_processes > 1: + self.use_ddp = True + + # Single GPU + elif self.num_gpus == 1: + self.use_single_gpu = True + + # Default: DDP-Spawn + elif self.num_gpus > 1: + rank_zero_warn( + 'You requested multiple GPUs but did not specify a backend, e.g.' + ' (distributed_backend="dp"|"ddp"|"ddp2").' + ' Setting distributed_backend="ddp_spawn" for you.' + ) + self.distributed_backend = "ddp_spawn" + + # DP + if self.distributed_backend == "dp": + # do nothing if num_gpus == 0 + if self.num_gpus == 1: + self.use_single_gpu = True + self.use_dp = True + elif self.num_gpus > 1: + self.use_dp = True + + # DDP, DDP-Spawn + elif self.distributed_backend in ("ddp", "ddp_spawn"): + if self.num_gpus == 0: + # DDP CPU + if self.num_nodes > 1 or self.num_processes > 1: + self.use_ddp = True + + # DDP Single GPU + elif self.num_gpus == 1: + self.use_single_gpu = True + self.use_ddp = True + + # DDP Multi GPU + elif self.num_gpus > 1: + self.use_ddp = True + self.num_processes = self.num_gpus + + # DDP2 + elif self.distributed_backend == "ddp2": + # do nothing if num_gpus == 0 + if self.num_gpus >= 1: + self.use_ddp2 = True + + # DDP CPU + elif self.distributed_backend == "ddp_cpu": + if self.num_gpus > 0: + rank_zero_warn( + 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' + ) + self.use_ddp = True + self.data_parallel_device_ids = None + self.on_gpu = False + + # HOROVOD + elif self.distributed_backend == "horovod": + self._set_horovod_backend() + + # throw error to force user ddp or ddp2 choice + if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp): + raise MisconfigurationException( + 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' + 'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2' + ) + + rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}') + num_cores = self.tpu_cores if self.tpu_cores is not None else 0 + rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores') + + if torch.cuda.is_available() and not self.on_gpu: + rank_zero_warn('GPU available but not used. 
Set the --gpus flag when calling the script.') + + + def _set_horovod_backend(self): + self.check_horovod() + self.use_horovod = True + + # Initialize Horovod to get rank / size info + hvd.init() + if self.on_gpu: + # Horovod assigns one local GPU per process + self.root_gpu = hvd.local_rank() + + def check_horovod(self): + """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" + if not HOROVOD_AVAILABLE: + raise MisconfigurationException( + 'Requested `distributed_backend="horovod"`, but Horovod is not installed.' + 'Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]' + ) + + if self.num_gpus > 1 or self.num_nodes > 1: + raise MisconfigurationException( + 'Horovod does not support setting num_nodes / num_gpus explicitly. Use ' + 'horovodrun / mpirun to configure the number of processes.' + ) + + @staticmethod + def has_horovodrun(): + """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" + return 'OMPI_COMM_WORLD_RANK' in os.environ or 'HOROVOD_RANK' in os.environ \ No newline at end of file diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index 1fdae7270fe47..401dc549c5327 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -3,10 +3,10 @@ class Plugin(object): - def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): - return model, optimizers, lr_schedulers + def connect(self, model: torch.nn.Module, *args, **kwargs): + return model - def pre_optimizer_step(self, optimizer, optiizer_idx): + def pre_optimizer_step(self, optimizer, optimizer_idx): pass def post_optimizer_step(self, optimizer, optimizer_idx): diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 2c7f9ae4c5924..62a8710034af1 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,11 +1,8 @@ from abc import ABC, abstractmethod -from contextlib import contextmanager -from os import stat import re from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin -from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities.seed import seed_everything from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -15,7 +12,6 @@ from pytorch_lightning.core.step_result import Result from typing import Any, Dict, List, Optional, Union from pytorch_lightning.overrides.data_parallel import LightningDataParallel, LightningDistributedDataParallel -from torch.nn.parallel.data_parallel import DataParallel import sys from os.path import abspath from time import sleep @@ -26,7 +22,7 @@ from pytorch_lightning import _logger as log import contextlib import torch.multiprocessing as mp -from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn +from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn, rank_zero_info try: from hydra.utils import to_absolute_path, get_original_cwd @@ -73,6 +69,37 @@ def is_global_zero(self): def barrier(self): raise NotImplementedError + def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): + if device_ids is None: + return + + # set the correct cuda visible devices (using pci order) + 
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) + devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) + log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') + + + def determine_local_rank(self): + return int(os.environ.get('LOCAL_RANK', 0)) + + + def determine_node_rank(self): + + # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK. + # otherwise use given node rank or default to node rank 0 + env_vars = ['NODE_RANK', 'GROUP_RANK'] + node_ids = [(k, os.environ.get(k, None)) for k in env_vars] + node_ids = [(k, v) for k, v in node_ids if v is not None] + if len(node_ids) == 0: + return 0 + if len(node_ids) > 1: + log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the first one.") + k, rank = node_ids.pop() + rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") + return int(rank) + + class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device, logger=None): super().__init__(logger=logger) @@ -90,10 +117,16 @@ def root_device(self): return self.device def model_to_device(self): + if self.on_gpu: + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) - def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + def connect(self, model: torch.nn.Module): self.model = model + self.model_to_device() + + return self.model @property def is_global_zero(self): @@ -174,6 +207,18 @@ def __init__(self, parallel_device_ids, logger=None, cluster_environment=None) - def root_device(self): return self.parallel_device_ids[self.local_rank] + def determine_local_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_LOCALID']) + else: + return super().determine_node_rank() + + def determine_node_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_NODEID']) + else: + return super().determine_node_rank() + def setup(self, model): self.model = model @@ -269,7 +314,7 @@ def _check_can_spawn_children(self): def set_world_ranks(self): self.local_rank = self.task_idx # TODO: check from where we get node_rank and num_processes - self.global_rank = self.node_rank * self.num_processes + self.task_idx + self.global_rank = self.determine_node_rank() * self.num_processes + self.task_idx self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): @@ -302,8 +347,8 @@ def pre_training(self): # show progressbar only on progress_rank 0 # TODO: check where to move this. Cannot stay here, since we won't have access to progressbar here - if (self.node_rank != 0 or self.task_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() + # if (self.node_rank != 0 or self.task_idx != 0) and self.trainer.progress_bar_callback is not None: + # self.trainer.progress_bar_callback.disable() # determine which process we are and world size self.set_world_ranks() @@ -312,7 +357,7 @@ def pre_training(self): rank_zero_only.rank = self.global_rank # TODO: This has to be done somewhere else! 
- self.model.trainer = self.trainer + # self.model.trainer = self.trainer # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken @@ -321,7 +366,7 @@ def pre_training(self): self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) # TODO: Move this somewhere else - self.trainer.call_setup_hook(self.model) + # self.trainer.call_setup_hook(self.model) # on world_size=0 let everyone know training is starting if self.is_global_zero and not torch.distributed.is_initialized(): @@ -337,7 +382,7 @@ def pre_training(self): # TODO: Check where this can be moved # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(self.model) + # self.trainer.model_connector.copy_trainer_model_properties(self.model) self.configure_ddp() @@ -393,7 +438,7 @@ def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, p self.dist = LightningDistributed() # TODO: how to get in nprocs? probably pass it - self.nprocs = nprocs + self.num_processes = num_processes self.mp_queue = None self.proc_offset = proc_offset @@ -407,14 +452,14 @@ def setup(self, model): def set_world_ranks(self): self.local_rank = self.process_idx # check from where we get node_rank, num_processes and num_nodes - self.global_rank = self.node_rank * self.num_processes + self.self.process_idx + self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx self.world_size = self.num_nodes * self.num_processes def pre_training(self): # TODO: Check if current process can be used as one training proc # start from one since current process is proc 0 - for proc_idx in range(1, self.nprocs): + for proc_idx in range(1, self.num_processes): # use os.fork, since this enables us to continue from here # instead of spawning with separate function pid = os.fork() @@ -430,8 +475,8 @@ def pre_training(self): # TODO: Check where to put that since we don't have access to the pbar here # show progressbar only on progress_rank 0 - if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() + # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: + # self.trainer.progress_bar_callback.disable() self.set_world_ranks() @@ -439,7 +484,7 @@ def pre_training(self): rank_zero_only.rank = self.global_rank # TODO: This has to be done somewhere else! - self.model.trainer = self.trainer + # self.model.trainer = self.trainer # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken @@ -448,7 +493,7 @@ def pre_training(self): self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) # TODO: Move this somewhere else - self.trainer.call_setup_hook(self.model) + # self.trainer.call_setup_hook(self.model) # on world_size=0 let everyone know training is starting if self.is_global_zero and not torch.distributed.is_initialized(): @@ -464,7 +509,7 @@ def pre_training(self): # TODO: Check where this can be moved # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(self.model) + # self.trainer.model_connector.copy_trainer_model_properties(self.model) self.configure_ddp() @@ -473,7 +518,8 @@ def pre_training(self): def post_training(self, results, best_model_path): # get original model # TODO: How To get this? is this simply self.model? 
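A note on the TODO above: by the time `post_training` runs, `self.model` is the `LightningDistributedDataParallel` wrapper produced by `configure_ddp`, so the plain `LightningModule` is its `.module` attribute; the `lightning_module` property introduced in the "add model properties" patch further below returns exactly that. A minimal illustration with a hypothetical `plugin` instance:

    wrapped = plugin.model         # LightningDistributedDataParallel after configure_ddp()
    original = wrapped.module      # the underlying LightningModule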
- model = self.trainer.get_model() + # model = self.trainer.get_model() + model = self.model # persist info in ddp_spawn self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) @@ -513,7 +559,8 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_pat # save the last weights last_path = None # TODO: From where to get self.trainer.testing? - if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + if best_model_path is not None and len(best_model_path) > 0: last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) atomic_save(self.model.state_dict(), last_path) self.mp_queue.put(last_path) @@ -522,18 +569,30 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_pat def __recover_child_process_weights(self, model, best_path, last_path): # TODO: Where can we set this? # transfer back the best path to the trainer - if self.trainer.checkpoint_callback: - self.trainer.checkpoint_callback.best_model_path = best_path + # if self.trainer.checkpoint_callback: + # self.trainer.checkpoint_callback.best_model_path = best_path # todo, pass also best score # load last weights # TODO: How to get self.trainer.testing? - if last_path is not None and not self.trainer.testing: + if last_path is not None: # and not self.trainer.testing: ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt) # TODO: Where to set this? # Do we really need to set this or can we just make the trainer property forward our current property here? - self.trainer.model = model + # self.trainer.model = model + + def determine_local_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_LOCALID']) + else: + return super().determine_node_rank() + + def determine_node_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_NODEID']) + else: + return super().determine_node_rank() # STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index 0b53e3addbbd7..ca41e8242f104 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -1,3 +1,4 @@ +from contextlib import contextmanager from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.accelerators.scheduler_properties import reinit_scheduler_properties from pytorch_lightning.core.lightning import LightningModule @@ -18,7 +19,7 @@ class PrecisionPlugin(Plugin): EPSILON = 1e-6 precision = 32 - def pre_optimizer_step(self, optimizer, optiizer_idx): + def pre_optimizer_step(self, optimizer, optimizer_idx): pass def post_optimizer_step(self, optimizer, optimizer_idx): @@ -77,20 +78,21 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): return closure_loss + @contextmanager + def train_step_context(self): + yield torch.cuda.amp.autocast() + class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): - def __init__(self): + def __init__(self, amp_level): self.backend = AMPType.APEX + self.amp_level = amp_level def connect(self, model, optimizers, lr_schedulers): - model, optimizers = self.configure_apex(amp, model, optimizers, self.trainer.amp_level) + model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) reinit_scheduler_properties(optimizers, 
lr_schedulers) return model, optimizers, lr_schedulers - def training_step(self, fx, args): - output = fx(args) - return output - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = amp.scale_loss(closure_loss, optimizer) From eac87c38d04f6968108a5ec3df77721c4743be21 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 30 Nov 2020 17:10:04 +0100 Subject: [PATCH 015/157] update accelerator for model to live in traintype plugin --- pytorch_lightning/accelerators/accelerator.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 21e0f191e384e..9d84c2cbadc49 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -18,12 +18,10 @@ class NewAccelerator(object): def __init__( self, - model_ref: LightningModule, precision_plugin: PrecisionPlugin, training_type_plugin: TrainingTypePlugin, gradient_clip_val, ): - self.model_ref = model_ref self.precision_plugin = precision_plugin self.training_type_plugin = training_type_plugin self.gradient_clip_val = gradient_clip_val @@ -37,6 +35,18 @@ def setup(self, model): self.setup_optimizers(model) self.connect_precision_plugin() + @property + def model(self): + return self.training_type_plugin.model + + @model.setter + def model(self, new_model): + self.training_type_plugin.model = new_model + + @property + def lightning_module(self): + return self.training_type_plugin.lightning_module + @property def root_device(self): return self.training_type_plugin.root_device @@ -84,6 +94,8 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): return self.precision_plugin.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): + # TODO: Check out if this can be simplified with new LightningOptimizer! 
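On the TODO above: if the optimizers are wrapped in a `LightningOptimizer`-style object that routes `step()` through the precision and training type plugins itself (an assumption, not shown in these patches), most of the branching in this method could collapse to a plain forwarding call, roughly:

    def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure):
        # the wrapper takes care of native amp / apex / LBFGS handling internally
        self.precision_plugin.pre_optimizer_step(optimizer, opt_idx)
        optimizer.step(closure=lambda_closure)
        self.precision_plugin.post_optimizer_step(optimizer, opt_idx)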
+ model_ref = self.model_ref is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) native_amp = self.trainer.amp_backend == AMPType.NATIVE @@ -171,12 +183,15 @@ def setup_optimizers(self, model): self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_training_type_plugin(self, plugin: Plugin): - model, optimizers, schedulers = plugin.connect( + def connect_training_type_plugin(self, plugin: TrainingTypePlugin): + plugin.connect( self.model_ref ) - self.model_ref = model + def connect_precision_plugin(self, plugin: PrecisionPlugin): + model, optimizers, schedulers = plugin.connect(self.model, self.optimizers, self.lr_schedulers) + + self.model = model self.optimizers = optimizers self.schedulers = schedulers From d111471a62b762dd1ac2dd1dc8fa04bd61c57fe3 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 30 Nov 2020 17:10:23 +0100 Subject: [PATCH 016/157] add general plugin interface --- pytorch_lightning/accelerators/base_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index 401dc549c5327..42b3e1f00b932 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -4,7 +4,7 @@ class Plugin(object): def connect(self, model: torch.nn.Module, *args, **kwargs): - return model + pass def pre_optimizer_step(self, optimizer, optimizer_idx): pass From 3d6c4b89dadcf824eb64208798a7572b8da09f3f Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 30 Nov 2020 17:10:39 +0100 Subject: [PATCH 017/157] add model properties --- .../accelerators/data_parallel.py | 334 ++++++++++-------- 1 file changed, 177 insertions(+), 157 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 62a8710034af1..8281e39e71134 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -42,7 +42,7 @@ class ReduceOp: class TrainingTypePlugin(Plugin, ABC): def __init__(self, logger=None): - self.model = None + self._model = None self.global_rank = 0 self.logger = logger @@ -99,6 +99,18 @@ def determine_node_rank(self): rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") return int(rank) + @property + def model(self): + return self._model + + @model.setter + def model(self, new_model): + self._model = new_model + + @property + def lightning_module(self): + return self._model + class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device, logger=None): @@ -120,10 +132,10 @@ def model_to_device(self): if self.on_gpu: torch.cuda.set_device(self.root_device) - self.model.to(self.root_device) + self._model.to(self.root_device) def connect(self, model: torch.nn.Module): - self.model = model + self._model = model self.model_to_device() return self.model @@ -174,7 +186,7 @@ def is_global_zero(self) -> bool: class DataParallelPlugin(ParallelPlugin): def setup(self, model): - self.model = LightningDataParallel(model, self.parallel_device_ids) + self._model = LightningDataParallel(model, self.parallel_device_ids) def reduce(self, output): if isinstance(output, Result): @@ -189,6 +201,10 @@ def reduce(self, output): def root_device(self): return self.parallel_device_ids[0] + @property + def lightning_module(self): + return self._model.module + def barrier(self): pass @@ -221,7 +237,7 @@ def determine_node_rank(self): def setup(self, model): - self.model = model 
+ self._model = model # start the other scripts if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": @@ -230,6 +246,10 @@ def setup(self, model): # set the task idx self.task_idx = int(os.environ["LOCAL_RANK"]) + @property + def lightning_module(self): + return self._model.module + def _call_children_scripts(self): # bookkeeping of spawned processes @@ -320,7 +340,7 @@ def set_world_ranks(self): def configure_ddp(self): # if unset, default `find_unused_parameters` `True` self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( + self._model = LightningDistributedDataParallel( self.model, device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, @@ -431,168 +451,168 @@ def reduce(self, output, group: Optional[Any] = None, return output -class DDPSpawnPlugin(ParallelPlugin): - def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, proc_offset=0): - super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) - self.process_idx = None +# class DDPSpawnPlugin(ParallelPlugin): +# def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, proc_offset=0): +# super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) +# self.process_idx = None - self.dist = LightningDistributed() - # TODO: how to get in nprocs? probably pass it - self.num_processes = num_processes - self.mp_queue = None - self.proc_offset = proc_offset +# self.dist = LightningDistributed() +# # TODO: how to get in nprocs? probably pass it +# self.num_processes = num_processes +# self.mp_queue = None +# self.proc_offset = proc_offset - def setup(self, model): - os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) +# def setup(self, model): +# os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) - # pass in a state q - smp = mp.get_context('spawn') - self.mp_queue = smp.SimpleQueue() +# # pass in a state q +# smp = mp.get_context('spawn') +# self.mp_queue = smp.SimpleQueue() - def set_world_ranks(self): - self.local_rank = self.process_idx - # check from where we get node_rank, num_processes and num_nodes - self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx - self.world_size = self.num_nodes * self.num_processes +# def set_world_ranks(self): +# self.local_rank = self.process_idx +# # check from where we get node_rank, num_processes and num_nodes +# self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx +# self.world_size = self.num_nodes * self.num_processes - def pre_training(self): +# def pre_training(self): - # TODO: Check if current process can be used as one training proc - # start from one since current process is proc 0 - for proc_idx in range(1, self.num_processes): - # use os.fork, since this enables us to continue from here - # instead of spawning with separate function - pid = os.fork() +# # TODO: Check if current process can be used as one training proc +# # start from one since current process is proc 0 +# for proc_idx in range(1, self.num_processes): +# # use os.fork, since this enables us to continue from here +# # instead of spawning with separate function +# pid = os.fork() - # set in child processes (PID=0). 
All previous child processes - # should already have their process_idx assigned - if pid == 0 and self.process_idx is None: - self.process_idx = proc_idx + self.proc_offset +# # set in child processes (PID=0). All previous child processes +# # should already have their process_idx assigned +# if pid == 0 and self.process_idx is None: +# self.process_idx = proc_idx + self.proc_offset - # set process idx for current process - if pid != 0: - self.process_idx = 0 + self.proc_offset +# # set process idx for current process +# if pid != 0: +# self.process_idx = 0 + self.proc_offset - # TODO: Check where to put that since we don't have access to the pbar here - # show progressbar only on progress_rank 0 - # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: - # self.trainer.progress_bar_callback.disable() +# # TODO: Check where to put that since we don't have access to the pbar here +# # show progressbar only on progress_rank 0 +# # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: +# # self.trainer.progress_bar_callback.disable() - self.set_world_ranks() +# self.set_world_ranks() - # set warning rank - rank_zero_only.rank = self.global_rank +# # set warning rank +# rank_zero_only.rank = self.global_rank - # TODO: This has to be done somewhere else! - # self.model.trainer = self.trainer - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - # TODO: CHeck is_slurm_managing_tasks - self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) - - # TODO: Move this somewhere else - # self.trainer.call_setup_hook(self.model) - - # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.distributed_backend}") - log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") - log.info("-" * 100) - - self.model = self.configure_sync_batchnorm(self.model) - - # move the model to the correct device - self.model_to_device() - - # TODO: Check where this can be moved - # set model properties before going into wrapper - # self.trainer.model_connector.copy_trainer_model_properties(self.model) - - self.configure_ddp() - - self.barrier() - - def post_training(self, results, best_model_path): - # get original model - # TODO: How To get this? is this simply self.model? 
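# Hedged sketch of the fork-based spawning the commented-out DDPSpawnPlugin above experiments
# with (simplified, Unix-only, and not the patch's code): the parent process keeps index 0 and
# each forked child takes the loop index it was created at, so every process ends up with a
# distinct local rank without a separate spawn function.
import os

def fork_workers(num_processes: int, proc_offset: int = 0) -> int:
    process_idx = None
    for candidate_idx in range(1, num_processes):
        pid = os.fork()
        if pid == 0:                            # child: remember its index and stop forking
            process_idx = candidate_idx + proc_offset
            break
    if process_idx is None:                     # parent: it is process 0
        process_idx = 0 + proc_offset
    return process_idx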
- # model = self.trainer.get_model() - model = self.model - - # persist info in ddp_spawn - self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) - - # clean up memory - torch.cuda.empty_cache() - - if self.process_idx == 0: - # restore main state with best weights - best_path = self.mp_queue.get() - results = self.mp_queue.get() - last_path = self.mp_queue.get() - - # recover the weights of the processes trained in the children - self.__recover_child_process_weights(model, best_path, last_path) - - def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( - self.model, - device_ids=self.determine_ddp_device_ids(), - **self._ddp_kwargs, - ) - - def determine_ddp_device_ids(self): - return [self.root_device] - - def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): - - if self.global_rank == 0 and self.mp_queue is not None: - rank_zero_warn('cleaning up ddp environment...') - # todo, pass complete checkpoint as state dictionary - self.mp_queue.put(best_model_path) - self.mp_queue.put(results) - - # save the last weights - last_path = None - # TODO: From where to get self.trainer.testing? - # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: - if best_model_path is not None and len(best_model_path) > 0: - last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) - atomic_save(self.model.state_dict(), last_path) - self.mp_queue.put(last_path) - - - def __recover_child_process_weights(self, model, best_path, last_path): - # TODO: Where can we set this? - # transfer back the best path to the trainer - # if self.trainer.checkpoint_callback: - # self.trainer.checkpoint_callback.best_model_path = best_path - # todo, pass also best score - - # load last weights - # TODO: How to get self.trainer.testing? - if last_path is not None: # and not self.trainer.testing: - ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) - - # TODO: Where to set this? - # Do we really need to set this or can we just make the trainer property forward our current property here? - # self.trainer.model = model - - def determine_local_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_LOCALID']) - else: - return super().determine_node_rank() - - def determine_node_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_NODEID']) - else: - return super().determine_node_rank() +# # TODO: This has to be done somewhere else! +# # self.model.trainer = self.trainer + +# # set up server using proc 0's ip address +# # try to init for 20 times at max in case ports are taken +# # where to store ip_table +# # TODO: CHeck is_slurm_managing_tasks +# self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + +# # TODO: Move this somewhere else +# # self.trainer.call_setup_hook(self.model) + +# # on world_size=0 let everyone know training is starting +# if self.is_global_zero and not torch.distributed.is_initialized(): +# log.info("-" * 100) +# log.info(f"distributed_backend={self.distributed_backend}") +# log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") +# log.info("-" * 100) + +# self.model = self.configure_sync_batchnorm(self.model) + +# # move the model to the correct device +# self.model_to_device() + +# # TODO: Check where this can be moved +# # set model properties before going into wrapper +# # self.trainer.model_connector.copy_trainer_model_properties(self.model) + +# self.configure_ddp() + +# self.barrier() + +# def post_training(self, results, best_model_path): +# # get original model +# # TODO: How To get this? is this simply self.model? +# # model = self.trainer.get_model() +# model = self.model + +# # persist info in ddp_spawn +# self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) + +# # clean up memory +# torch.cuda.empty_cache() + +# if self.process_idx == 0: +# # restore main state with best weights +# best_path = self.mp_queue.get() +# results = self.mp_queue.get() +# last_path = self.mp_queue.get() + +# # recover the weights of the processes trained in the children +# self.__recover_child_process_weights(model, best_path, last_path) + +# def configure_ddp(self): +# # if unset, default `find_unused_parameters` `True` +# self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) +# self.model = LightningDistributedDataParallel( +# self.model, +# device_ids=self.determine_ddp_device_ids(), +# **self._ddp_kwargs, +# ) + +# def determine_ddp_device_ids(self): +# return [self.root_device] + +# def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): + +# if self.global_rank == 0 and self.mp_queue is not None: +# rank_zero_warn('cleaning up ddp environment...') +# # todo, pass complete checkpoint as state dictionary +# self.mp_queue.put(best_model_path) +# self.mp_queue.put(results) + +# # save the last weights +# last_path = None +# # TODO: From where to get self.trainer.testing? +# # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: +# if best_model_path is not None and len(best_model_path) > 0: +# last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) +# atomic_save(self.model.state_dict(), last_path) +# self.mp_queue.put(last_path) + + +# def __recover_child_process_weights(self, model, best_path, last_path): +# # TODO: Where can we set this? +# # transfer back the best path to the trainer +# # if self.trainer.checkpoint_callback: +# # self.trainer.checkpoint_callback.best_model_path = best_path +# # todo, pass also best score + +# # load last weights +# # TODO: How to get self.trainer.testing? +# if last_path is not None: # and not self.trainer.testing: +# ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) +# model.load_state_dict(ckpt) + +# # TODO: Where to set this? +# # Do we really need to set this or can we just make the trainer property forward our current property here? 
+# # self.trainer.model = model + +# def determine_local_rank(self): +# if self.is_slurm_managing_tasks: +# return int(os.environ['SLURM_LOCALID']) +# else: +# return super().determine_node_rank() + +# def determine_node_rank(self): +# if self.is_slurm_managing_tasks: +# return int(os.environ['SLURM_NODEID']) +# else: +# return super().determine_node_rank() # STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file From 51740e9be57aea0fe07f9b2bdd453dbd72351bd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 4 Dec 2020 23:30:49 +0100 Subject: [PATCH 018/157] Trainer integration part 1 for CPU accelerator --- pytorch_lightning/accelerators/__init__.py | 0 pytorch_lightning/accelerators/accelerator.py | 43 +++++----- .../accelerators/accelerator_connector.py | 32 ++++++-- .../accelerators/data_parallel.py | 32 +++++--- .../callbacks/model_checkpoint.py | 2 +- pytorch_lightning/core/lightning.py | 11 ++- .../connectors/checkpoint_connector.py | 2 +- .../trainer/connectors/model_connector.py | 5 +- pytorch_lightning/trainer/data_loading.py | 13 +-- pytorch_lightning/trainer/properties.py | 80 ++++++++++++++++--- pytorch_lightning/trainer/trainer.py | 74 +++++++++++------ pytorch_lightning/trainer/training_loop.py | 7 +- 12 files changed, 209 insertions(+), 92 deletions(-) create mode 100644 pytorch_lightning/accelerators/__init__.py diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9d84c2cbadc49..c4f5bc3a57554 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -30,10 +30,10 @@ def __init__( self.lr_schedulers = None self.optimizer_frequencies = None - def setup(self, model): - self.connect_training_type_plugin() - self.setup_optimizers(model) - self.connect_precision_plugin() + def setup(self, trainer, model): + self.connect_training_type_plugin(self.training_type_plugin, model) + self.setup_optimizers(trainer, model) + self.connect_precision_plugin(self.precision_plugin) @property def model(self): @@ -55,7 +55,7 @@ def teardown(self): pass def batch_to_device(self, batch: Any, device: torch.device): - model = self.model_ref + model = self.model if model is not None: return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) @@ -67,7 +67,7 @@ def training_step(self, args): with self.precision_plugin.train_step_context(): with self.training_type_plugin.train_step_context(): - return self.model_ref.training_step(*args) + return self.model.training_step(*args) def validation_step(self, args): batch = self.to_device(args[0]) @@ -76,7 +76,7 @@ def validation_step(self, args): with self.precision_plugin.val_step_context(): with self.training_type_plugin.val_step_context(): - return self.model_ref.validation_step(*args) + return self.model.validation_step(*args) def test_step(self, args): batch = self.to_device(args[0]) @@ -85,7 +85,7 @@ def test_step(self, args): with self.precision_plugin.test_step_context(): with self.training_type_plugin.test_step_context(): - return self.model_ref.test_step(*args) + return self.model.test_step(*args) def process_dataloader(self, dataloader): return dataloader @@ -96,7 +96,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def optimizer_step(self, optimizer, current_epoch, 
batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! - model_ref = self.model_ref + model_ref = self.model is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) native_amp = self.trainer.amp_backend == AMPType.NATIVE @@ -173,20 +173,16 @@ def on_train_end(self): def early_stopping_should_stop(self, pl_module): return self.trainer.should_stop - def setup_optimizers(self, model): - # TODO: Check if we can change logic for early stopping to trainer completely (should be self contained) - if self.trainer.testing is True: + def setup_optimizers(self, trainer, model): + if trainer.testing is True: return - - optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) + optimizers, lr_schedulers, optimizer_frequencies = trainer.init_optimizers(model) self.optimizers = optimizers self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_training_type_plugin(self, plugin: TrainingTypePlugin): - plugin.connect( - self.model_ref - ) + def connect_training_type_plugin(self, plugin: TrainingTypePlugin, model: LightningModule): + plugin.connect(model) def connect_precision_plugin(self, plugin: PrecisionPlugin): model, optimizers, schedulers = plugin.connect(self.model, self.optimizers, self.lr_schedulers) @@ -195,30 +191,29 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin): self.optimizers = optimizers self.schedulers = schedulers - def to_device(self, batch): return self.batch_to_device(batch, self.root_device) class NewCPUAccelerator(NewAccelerator): - def setup(self, model): + def setup(self, trainer, model): if isinstance(self.precision_plugin, MixedPrecisionPlugin): MisconfigurationException("amp + cpu is not supported. Please use a GPU option") if "cpu" not in str(self.root_device): raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead") - return super().setup(model) + return super().setup(trainer, model) class NewGPUAccelerator(NewAccelerator): - def setup(self, model): + def setup(self, trainer, model): if "cuda" not in str(self.root_device): raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") torch.cuda.set_device(self.root_device) - self.model_ref.to(self.root_device) + model.to(self.root_device) - return super().setup(model) + return super().setup(trainer, model) # TODO: Add NewTPUAccelerator diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d9a111f355e68..07fd9eb6f49a4 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -11,10 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
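# Rough, illustrative sketch (stand-in classes, not the real Lightning API) of the setup order
# the new `Accelerator.setup(trainer, model)` above enforces: 1) the training type plugin
# receives the raw model, 2) optimizers are built from the trainer (skipped when testing),
# 3) the precision plugin may rewrap model and optimizers (e.g. for amp).
import torch

class _TrainingType:
    def connect(self, model):
        self.model = model                      # place / wrap the model here

class _Precision:
    def connect(self, model, optimizers, schedulers):
        return model, optimizers, schedulers    # amp variants would wrap these

class _Accelerator:
    def __init__(self):
        self.training_type_plugin = _TrainingType()
        self.precision_plugin = _Precision()
        self.optimizers = self.lr_schedulers = None

    def setup(self, trainer, model):
        self.training_type_plugin.connect(model)
        optimizers = None if getattr(trainer, "testing", False) else [
            torch.optim.SGD(model.parameters(), lr=0.1)
        ]
        _, self.optimizers, self.lr_schedulers = self.precision_plugin.connect(
            model, optimizers, self.lr_schedulers
        )

_Accelerator().setup(trainer=None, model=torch.nn.Linear(2, 1))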
+from typing import Union + from pytorch_lightning import accelerators import os import torch +from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin +from pytorch_lightning.accelerators.precision import PrecisionPlugin from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info @@ -22,7 +27,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment -from pytorch_lightning.accelerators.accelerator import Accelerator try: import torch_xla @@ -62,11 +66,11 @@ def __init__( self.use_ddp2 = False self.use_horovod = False self.use_single_gpu = False - self.num_gpus = None self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - self.accelerator = accelerator + # todo: select accelerator based on trainer flags + self.accelerator = self.select_accelerator(accelerator) self.distributed_backend = distributed_backend self.auto_select_gpus = auto_select_gpus self.gpus = gpus @@ -105,13 +109,13 @@ def __init__( # link up SLURM # TODO: this should be taken out of here... but depends too much on DDP - self.slurm_connector.on_trainer_init(self.num_nodes) - self.node_rank = self.determine_ddp_node_rank() - self.local_rank = self.determine_local_rank() + # self.slurm_connector.on_trainer_init(self.num_nodes) + # self.node_rank = self.determine_ddp_node_rank() + # self.local_rank = self.determine_local_rank() self.global_rank = 0 # NVIDIA setup - self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) + # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') @@ -132,6 +136,20 @@ def tpu_id(self): def on_gpu(self): return self.parallel_devices and torch.cuda.is_available() + @property + def num_gpus(self) -> int: + gpus = self.parallel_devices + if gpus is None: + return 0 + return len(gpus) + + def select_accelerator(self, accelerator: Union[str, NewAccelerator]): + return NewCPUAccelerator( + precision_plugin=PrecisionPlugin(), + training_type_plugin=SingleDevicePlugin(device=torch.device("cpu")), + gradient_clip_val=None + ) + def set_distributed_mode(self): # No distributed backend diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 8281e39e71134..9d0b47c1ee345 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -46,13 +46,13 @@ def __init__(self, logger=None): self.global_rank = 0 self.logger = logger - @abstractmethod @property + @abstractmethod def on_gpu(self): raise NotImplementedError - @abstractmethod @property + @abstractmethod def root_device(self): raise NotImplementedError @@ -60,13 +60,17 @@ def root_device(self): def model_to_device(self): raise NotImplementedError - @abstractmethod @property + @abstractmethod def is_global_zero(self): raise NotImplementedError @abstractmethod - def barrier(self): + def barrier(self, name: Optional[str] = None): + raise NotImplementedError + + @abstractmethod + def broadcast(self, obj: object, src: int = 0) -> object: 
raise NotImplementedError def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): @@ -79,10 +83,8 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') - def determine_local_rank(self): return int(os.environ.get('LOCAL_RANK', 0)) - def determine_node_rank(self): @@ -144,10 +146,12 @@ def connect(self, model: torch.nn.Module): def is_global_zero(self): return True - def barrier(self): + def barrier(self, *args, **kwargs): pass - + def broadcast(self, obj: object, src: int = 0) -> object: + return obj + class ParallelPlugin(TrainingTypePlugin, ABC): def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): @@ -161,8 +165,8 @@ def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): def reduce(self, output): raise NotImplementedError - @abstractmethod @property + @abstractmethod def root_device(self): raise NotImplementedError @@ -205,9 +209,12 @@ def root_device(self): def lightning_module(self): return self._model.module - def barrier(self): + def barrier(self, *args, **kwargs): pass + def broadcast(self, obj: object, src: int = 0) -> object: + return obj + class DDPPlugin(ParallelPlugin): @@ -432,10 +439,13 @@ def configure_sync_batchnorm(model: LightningModule) -> LightningModule: return model - def barrier(self): + def barrier(self, *args, **kwargs): if torch_distrib.is_initialized(): torch_distrib.barrier() + def broadcast(self, obj: object, src: int = 0) -> object: + return self.dist.broadcast(obj) + def model_to_device(self): # TODO: Can we easily make this a property that falls back here? # self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank] diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 8a89cd2bef23c..32f83190e119d 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -445,7 +445,7 @@ def __resolve_ckpt_dir(self, trainer, pl_module): else f"version_{trainer.logger.version}" ) - version, name = trainer.accelerator_backend.broadcast((version, trainer.logger.name)) + version, name = trainer.training_type_plugin.broadcast((version, trainer.logger.name)) ckpt_path = os.path.join( save_dir, str(name), version, "checkpoints" diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index dd5691d6e4553..33d206b6bc49d 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -126,6 +126,14 @@ def global_step(self) -> int: """Total training batches seen across all epochs""" return self.trainer.global_step if self.trainer else 0 + @property + def global_rank(self): + return self.trainer.global_rank if self.trainer else 0 + + @property + def local_rank(self): + return self.trainer.local_rank if self.trainer else 0 + @example_input_array.setter def example_input_array(self, example: Any) -> None: self._example_input_array = example @@ -253,6 +261,7 @@ def log( f"Logged key: {name} should not contain information about dataloader_idx.") accelerator = self.trainer.accelerator_backend + training_type_plugin = self.trainer.training_type_plugin self._results.log( name, @@ -268,7 +277,7 @@ def log( sync_dist, sync_dist_op, sync_dist_group, - accelerator.sync_tensor, + training_type_plugin.reduce, self._current_dataloader_idx, self.device, ) diff --git 
a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 001b0b9ed3e0d..8d1a482deff15 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -73,7 +73,7 @@ def restore_weights(self, model: LightningModule) -> None: self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer._device_type == DeviceType.GPU) # wait for all to catch up - self.trainer.accelerator_backend.barrier('TrainerIOMixin.restore_weights') + self.trainer.training_type_plugin.barrier('TrainerIOMixin.restore_weights') # clear cache after restore if self.trainer._device_type == DeviceType.GPU: diff --git a/pytorch_lightning/trainer/connectors/model_connector.py b/pytorch_lightning/trainer/connectors/model_connector.py index a3759d1075ee5..a4bf9a6e505e6 100644 --- a/pytorch_lightning/trainer/connectors/model_connector.py +++ b/pytorch_lightning/trainer/connectors/model_connector.py @@ -31,16 +31,13 @@ def copy_trainer_model_properties(self, model): for m in [model, ref_model]: m.trainer = self.trainer + # TODO: add property getters to LightningModule and access through trainer reference m.logger = self.trainer.logger m._device_type = str(self.trainer._device_type) m._distrib_type = str(self.trainer._distrib_type) m.use_amp = self.trainer.amp_backend is not None m.testing = self.trainer.testing - m.tpu_local_core_rank = self.trainer.tpu_local_core_rank - m.tpu_global_core_rank = self.trainer.tpu_global_core_rank m.precision = self.trainer.precision - m.global_rank = self.trainer.global_rank - m.local_rank = self.trainer.local_rank def get_model(self): return self._get_reference_model(self.trainer.model) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 38198c9f39e10..cc5fc492b3a6a 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -21,7 +21,7 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import NewAccelerator from pytorch_lightning.core import LightningModule from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import rank_zero_warn @@ -51,7 +51,7 @@ class TrainerDataLoadingMixin(ABC): limit_val_batches: Union[int, float] limit_test_batches: Union[int, float] replace_sampler_ddp: bool - accelerator_backend: Accelerator + accelerator_backend: NewAccelerator num_nodes: int num_processes: int distributed_backend: Optional[str] @@ -62,7 +62,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None: # ddp_spawn + num_workers > 0 don't mix! tell the user is_dataloader = isinstance(dataloader, DataLoader) - using_spawn = self.distributed_backend == "ddp_spawn" + using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn" if is_dataloader and not on_windows: if dataloader.num_workers > 0 and using_spawn: rank_zero_warn('Dataloader(num_workers>0) and ddp_spawn do not mix well!' 
@@ -92,8 +92,9 @@ def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader: if not is_dataloader or is_iterable_ds: return dataloader - need_dist_sampler = self.require_distributed_sampler and not isinstance(dataloader.sampler, DistributedSampler) - if self.replace_sampler_ddp and need_dist_sampler: + is_in_dist = self.use_ddp or self.use_ddp2 or self.use_horovod or self.use_tpu + need_dist_sampler = is_in_dist and not isinstance(dataloader.sampler, DistributedSampler) + if self.accelerator_connector.replace_sampler_ddp and need_dist_sampler: if not isinstance(dataloader.sampler, (SequentialSampler, RandomSampler)): raise MisconfigurationException( 'You seem to have configured a sampler in your DataLoader. This will be replaced ' @@ -314,7 +315,7 @@ def request_dataloader(self, dataloader_fx: Callable) -> DataLoader: dataloader = self._flatten_dl_only(dataloader) if self.accelerator_backend is not None: - self.accelerator_backend.barrier('get_dataloaders') + self.training_type_plugin.barrier('get_dataloaders') return dataloader def _flatten_dl_only(self, dataloaders): diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index c32b24458c297..6dc6802bc9021 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -17,8 +17,9 @@ from argparse import ArgumentParser, Namespace from typing import cast, List, Optional, Type, TypeVar, Union -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase +from pytorch_lightning.accelerators.accelerator import NewAccelerator +from pytorch_lightning.accelerators.accelerator_connector import BackendConnector +from pytorch_lightning.callbacks import Callback, ProgressBarBase, ModelCheckpoint, EarlyStopping from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.loggers.base import LightningLoggerBase @@ -42,6 +43,9 @@ if _HOROVOD_AVAILABLE: import horovod.torch as hvd +from pytorch_lightning.utilities.model_utils import is_overridden +from pytorch_lightning.loggers.base import LightningLoggerBase +from pytorch_lightning.loggers.tensorboard import TensorBoardLogger class TrainerProperties(ABC): @@ -59,14 +63,71 @@ class TrainerProperties(ABC): limit_val_batches: int _default_root_dir: str _weights_save_path: str - accelerator_backend: Accelerator - logger: LightningLoggerBase - model_connector: ModelConnector - checkpoint_connector: CheckpointConnector - callbacks: List[Callback] + accelerator_backend: NewAccelerator num_nodes: int num_processes: int + @property + def accelerator(self): + return self.accelerator_connector.accelerator + + @property + def accelerator_backend(self): + # for backward compatibility + return self.accelerator + + @property + def training_type_plugin(self): + return self.accelerator.training_type_plugin + + @property + def global_rank(self): + return self.accelerator.training_type_plugin.global_rank + + @property + def local_rank(self): + # some training types define a local rank + return getattr(self.accelerator.training_type_plugin, "local_rank", 0) + + @property + def world_size(self): + # some training types define a world size + return getattr(self.accelerator.training_type_plugin, "world_size", 1) + + @property + def on_gpu(self): + return self.accelerator_connector.on_gpu + + @property + def on_tpu(self): + return 
self.accelerator_connector.on_tpu + + @property + def use_dp(self): + return self.accelerator_connector.use_dp + + @property + def use_ddp(self): + return self.accelerator_connector.use_ddp + + @property + def use_ddp2(self): + return self.accelerator_connector.use_ddp2 + + @property + def use_horovod(self): + return self.accelerator_connector.use_horovod + + @property + def use_single_gpu(self): + return self.accelerator_connector.use_single_gpu + + @property + def use_tpu(self): + # TODO update this, what is the difference between use_tpu and on_tpu? + return False + # return self.accelerator_connector.use_tpu + @property def log_dir(self): if self.checkpoint_callback is not None: @@ -173,10 +234,7 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: @property def num_gpus(self) -> int: - gpus = self.data_parallel_device_ids - if gpus is None: - return 0 - return len(gpus) + return self.accelerator_connector.num_gpus @property def data_parallel(self) -> bool: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7d7cec2335301..94c698cfb8501 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -26,6 +26,8 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector from pytorch_lightning.callbacks import Callback +from pytorch_lightning.accelerators.accelerator_connector import BackendConnector +from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.step_result import EvalResult, Result @@ -56,13 +58,25 @@ from pytorch_lightning.trainer.states import RunningStage, TrainerState from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin +from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector +from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector +from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector +from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector +from pytorch_lightning.trainer.connectors.model_connector import ModelConnector +from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector +from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector +from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector +from pytorch_lightning import _logger as log from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach -from pytorch_lightning.utilities.model_helpers import is_overridden +from pytorch_lightning.utilities.model_utils import is_overridden +from pytorch_lightning.trainer.properties import TrainerProperties +from pytorch_lightning.plugins.plugin_connector import PluginConnector +from pytorch_lightning.accelerators.accelerator import NewAccelerator # warnings to ignore in trainer 
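# Minimal sketch (illustrative only) of the forwarding pattern the trainer properties above
# introduce: the trainer keeps no rank or device state of its own and defers to the
# accelerator's training type plugin, falling back to single-device defaults when a plugin
# does not define a local rank or world size.
class _SketchTrainerProperties:
    def __init__(self, training_type_plugin):
        self._ttp = training_type_plugin

    @property
    def global_rank(self) -> int:
        return self._ttp.global_rank

    @property
    def local_rank(self) -> int:
        return getattr(self._ttp, "local_rank", 0)   # single-device plugins: default 0

    @property
    def world_size(self) -> int:
        return getattr(self._ttp, "world_size", 1)   # single-device plugins: default 1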
warnings.filterwarnings( @@ -111,7 +125,7 @@ def __init__( val_check_interval: Union[int, float] = 1.0, flush_logs_every_n_steps: int = 100, log_every_n_steps: int = 50, - accelerator: Optional[Union[str, Accelerator]] = None, + accelerator: Optional[Union[str, NewAccelerator]] = None, sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = 'top', @@ -302,7 +316,20 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.accelerator_connector = AcceleratorConnector(self) + self.accelerator_connector = BackendConnector( + num_processes, + tpu_cores, + accelerator, + distributed_backend, + auto_select_gpus, + gpus, + num_nodes, + log_gpu_memory, + sync_batchnorm, + benchmark, + replace_sampler_ddp, + deterministic, + ) self.logger_connector = LoggerConnector(self) self.model_connector = ModelConnector(self) self.precision_connector = PrecisionConnector(self) @@ -313,7 +340,6 @@ def __init__( self.checkpoint_connector = CheckpointConnector(self) self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) - self.accelerator_backend = None self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) self.plugin_connector = PluginConnector(self) @@ -351,20 +377,20 @@ def __init__( ) # init accelerator related flags - self.accelerator_connector.on_trainer_init( - num_processes, - tpu_cores, - accelerator, - distributed_backend, - auto_select_gpus, - gpus, - num_nodes, - log_gpu_memory, - sync_batchnorm, - benchmark, - replace_sampler_ddp, - deterministic, - ) + # self.accelerator_connector.on_trainer_init( + # num_processes, + # tpu_cores, + # accelerator, + # distributed_backend, + # auto_select_gpus, + # gpus, + # num_nodes, + # log_gpu_memory, + # sync_batchnorm, + # benchmark, + # replace_sampler_ddp, + # deterministic, + # ) # init train loop related flags # TODO: remove in 1.3.0 @@ -460,17 +486,19 @@ def fit( # ---------------------------- # SET UP TRAINING # ---------------------------- - self.accelerator_backend = self.accelerator_connector.select_accelerator() + # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) - self.accelerator_backend.setup(model) + self.accelerator_backend.setup(self, model) # ---------------------------- # INSPECT THESE FOR MAIN LOOPS # ---------------------------- # assign training and eval functions... 
inspect these to see the train and eval loops :) - self.accelerator_backend.train_loop = self.train - self.accelerator_backend.validation_loop = self.run_evaluation - self.accelerator_backend.test_loop = self.run_evaluation + # self.accelerator_backend.train_loop = self.train + # self.accelerator_backend.validation_loop = self.run_evaluation + # self.accelerator_backend.test_loop = self.run_evaluation + self.train_loop.setup_training(model) + self.train() # ---------------------------- # TRAIN diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 78cb08f22161f..2b1af8dfeea01 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -140,8 +140,9 @@ def setup_training(self, model: LightningModule): ref_model = self.trainer.get_model() # set the ranks and devices - self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank - self.trainer.accelerator_backend.dist.device = ref_model.device + # TODO dist was a AttributeDict, should be moved to plugin? + # self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank + # self.trainer.accelerator_backend.dist.device = ref_model.device # give model convenience properties ref_model.trainer = self.trainer @@ -163,7 +164,7 @@ def setup_training(self, model: LightningModule): self.trainer.logger.save() # wait for all to join if on distributed - self.trainer.accelerator_backend.barrier("setup_training") + self.trainer.accelerator.training_type_plugin.barrier("setup_training") # register auto-resubmit when on SLURM self.trainer.slurm_connector.register_slurm_signal_handlers() From 9e4856898a0e411cd1c948ab1ea4d112289d13bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 6 Dec 2020 03:08:06 +0100 Subject: [PATCH 019/157] test single gpu trainer integration --- pytorch_lightning/accelerators/accelerator.py | 12 +++++++++- .../accelerators/accelerator_connector.py | 23 ++++++++++++++----- pytorch_lightning/trainer/training_loop.py | 7 ------ 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index c4f5bc3a57554..502646011e4de 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -60,6 +60,9 @@ def batch_to_device(self, batch: Any, device: torch.device): return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) + def on_train_start(self): + pass + def training_step(self, args): batch = self.to_device(args[0]) @@ -215,5 +218,12 @@ def setup(self, trainer, model): return super().setup(trainer, model) + def on_train_start(self): + # clear cache before training + # use context because of: + # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() + -# TODO: Add NewTPUAccelerator +# TODO: Add NewTPUAccelerator \ No newline at end of file diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 07fd9eb6f49a4..d0b17c9654a04 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -17,7 +17,7 @@ import os import torch -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator +from pytorch_lightning.accelerators.accelerator import 
NewCPUAccelerator, NewAccelerator, NewGPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin from pytorch_lightning.accelerators.precision import PrecisionPlugin from pytorch_lightning.utilities import device_parser @@ -69,8 +69,6 @@ def __init__( self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - # todo: select accelerator based on trainer flags - self.accelerator = self.select_accelerator(accelerator) self.distributed_backend = distributed_backend self.auto_select_gpus = auto_select_gpus self.gpus = gpus @@ -94,10 +92,13 @@ def __init__( self.parallel_devices = device_parser.parse_gpu_ids(self.gpus) self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_devices) - self.root_device = torch.device("cpu") + # self.root_device = torch.device("cpu") self.set_distributed_mode() + # todo: select accelerator based on trainer flags + self.accelerator = self.select_accelerator(accelerator) + # override dist backend when using tpus if self.on_tpu: self.distributed_backend = "tpu" @@ -143,10 +144,20 @@ def num_gpus(self) -> int: return 0 return len(gpus) + def select_precision_plugin(self): + return PrecisionPlugin() + def select_accelerator(self, accelerator: Union[str, NewAccelerator]): - return NewCPUAccelerator( + + # return NewCPUAccelerator( + # precision_plugin=PrecisionPlugin(), + # training_type_plugin=SingleDevicePlugin(device=torch.device("cpu")), + # gradient_clip_val=None + # ) + + return NewGPUAccelerator( precision_plugin=PrecisionPlugin(), - training_type_plugin=SingleDevicePlugin(device=torch.device("cpu")), + training_type_plugin=SingleDevicePlugin(device=torch.device("cuda", self.root_gpu)), gradient_clip_val=None ) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 2b1af8dfeea01..25540791209ff 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -101,13 +101,6 @@ def should_skip_training(self): return False def on_train_start(self): - # clear cache before training - if self.trainer._device_type == DeviceType.GPU and self.trainer.root_gpu is not None: - # use context because of: - # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 - with torch.cuda.device(f"cuda:{self.trainer.root_gpu}"): - torch.cuda.empty_cache() - # hook self.trainer.call_hook("on_train_start") From 5da773a341acbf39711bf97b6387e6e265441b6b Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 16:43:58 +0100 Subject: [PATCH 020/157] make device changes a bit less hardcoded --- .../accelerators/accelerator_connector.py | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d0b17c9654a04..b7486d60a47b0 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -82,8 +82,8 @@ def __init__( # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks # this way we only show it on rank 0 - if 'LOCAL_RANK' in os.environ: - rank_zero_only.rank = int(os.environ['LOCAL_RANK']) + if "LOCAL_RANK" in os.environ: + rank_zero_only.rank = int(os.environ["LOCAL_RANK"]) # TODO: Move autoselect GPUS to other place # for gpus allow int, string and gpu list @@ -118,7 +118,7 @@ def __init__( # 
NVIDIA setup # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) - self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') + self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE") self.replace_sampler_ddp = replace_sampler_ddp @@ -147,6 +147,9 @@ def num_gpus(self) -> int: def select_precision_plugin(self): return PrecisionPlugin() + def select_training_type_plugin(self): + return SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) + def select_accelerator(self, accelerator: Union[str, NewAccelerator]): # return NewCPUAccelerator( @@ -155,10 +158,15 @@ def select_accelerator(self, accelerator: Union[str, NewAccelerator]): # gradient_clip_val=None # ) - return NewGPUAccelerator( - precision_plugin=PrecisionPlugin(), - training_type_plugin=SingleDevicePlugin(device=torch.device("cuda", self.root_gpu)), - gradient_clip_val=None + if self.on_gpu: + acc_cls = NewGPUAccelerator + else: + acc_cls = NewCPUAccelerator + + return acc_cls( + precision_plugin=self.select_precision_plugin(), + training_type_plugin=self.select_training_type_plugin(), + gradient_clip_val=None, ) def set_distributed_mode(self): @@ -181,7 +189,7 @@ def set_distributed_mode(self): # Default: DDP-Spawn elif self.num_gpus > 1: rank_zero_warn( - 'You requested multiple GPUs but did not specify a backend, e.g.' + "You requested multiple GPUs but did not specify a backend, e.g." ' (distributed_backend="dp"|"ddp"|"ddp2").' ' Setting distributed_backend="ddp_spawn" for you.' ) @@ -201,8 +209,8 @@ def set_distributed_mode(self): if self.num_gpus == 0: # DDP CPU if self.num_nodes > 1 or self.num_processes > 1: - self.use_ddp = True - + self.use_ddp = True + # DDP Single GPU elif self.num_gpus == 1: self.use_single_gpu = True @@ -223,7 +231,7 @@ def set_distributed_mode(self): elif self.distributed_backend == "ddp_cpu": if self.num_gpus > 0: rank_zero_warn( - 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' + "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." ) self.use_ddp = True self.data_parallel_device_ids = None @@ -236,18 +244,17 @@ def set_distributed_mode(self): # throw error to force user ddp or ddp2 choice if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp): raise MisconfigurationException( - 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' - 'To silence this warning set distributed_backend=ddp or distributed_backend=ddp2' + "DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. " + "To silence this warning set distributed_backend=ddp or distributed_backend=ddp2" ) - rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}') + rank_zero_info(f"GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}") num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores') + rank_zero_info(f"TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores") if torch.cuda.is_available() and not self.on_gpu: - rank_zero_warn('GPU available but not used. Set the --gpus flag when calling the script.') + rank_zero_warn("GPU available but not used. 
Set the --gpus flag when calling the script.") - def _set_horovod_backend(self): self.check_horovod() self.use_horovod = True @@ -263,16 +270,16 @@ def check_horovod(self): if not HOROVOD_AVAILABLE: raise MisconfigurationException( 'Requested `distributed_backend="horovod"`, but Horovod is not installed.' - 'Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]' + "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]" ) if self.num_gpus > 1 or self.num_nodes > 1: raise MisconfigurationException( - 'Horovod does not support setting num_nodes / num_gpus explicitly. Use ' - 'horovodrun / mpirun to configure the number of processes.' + "Horovod does not support setting num_nodes / num_gpus explicitly. Use " + "horovodrun / mpirun to configure the number of processes." ) @staticmethod def has_horovodrun(): """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" - return 'OMPI_COMM_WORLD_RANK' in os.environ or 'HOROVOD_RANK' in os.environ \ No newline at end of file + return "OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ From 42e53beb84717b0ed69636c9023cda430eb6aa3e Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 16:44:13 +0100 Subject: [PATCH 021/157] properly resolve attributes --- pytorch_lightning/accelerators/accelerator.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 502646011e4de..e2f044fab612f 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -70,7 +70,7 @@ def training_step(self, args): with self.precision_plugin.train_step_context(): with self.training_type_plugin.train_step_context(): - return self.model.training_step(*args) + return self.lightning_module.training_step(*args) def validation_step(self, args): batch = self.to_device(args[0]) @@ -79,7 +79,7 @@ def validation_step(self, args): with self.precision_plugin.val_step_context(): with self.training_type_plugin.val_step_context(): - return self.model.validation_step(*args) + return self.lightning_module.validation_step(*args) def test_step(self, args): batch = self.to_device(args[0]) @@ -88,7 +88,7 @@ def test_step(self, args): with self.precision_plugin.test_step_context(): with self.training_type_plugin.test_step_context(): - return self.model.test_step(*args) + return self.lightning_module.test_step(*args) def process_dataloader(self, dataloader): return dataloader @@ -99,14 +99,14 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! 
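# Usage-style sketch (simplified; the continuation in comments assumes the class names from
# the diffs above) of the single-device selection path the connector implements: derive the
# root device from the flags, wrap it in a single-device training type plugin, and pick the
# CPU or GPU accelerator accordingly.
import torch

def select_root_device(on_gpu: bool, root_gpu: int = 0) -> torch.device:
    return torch.device(f"cuda:{root_gpu}") if on_gpu else torch.device("cpu")

on_gpu = torch.cuda.is_available()
root_device = select_root_device(on_gpu)
# With the classes from the diffs above this would continue roughly as:
# accelerator_cls = NewGPUAccelerator if on_gpu else NewCPUAccelerator
# accelerator = accelerator_cls(
#     precision_plugin=PrecisionPlugin(),
#     training_type_plugin=SingleDevicePlugin(device=root_device),
# )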
- model_ref = self.model + model_ref = self.lightning_module is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = self.trainer.amp_backend == AMPType.NATIVE + native_amp = isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.NATIVE - self.precision_plugin.pre_optimizer_step(optimizer) + self.precision_plugin.pre_optimizer_step(optimizer, opt_idx) # model hook - model_ref.optimizer_step( + res = model_ref.optimizer_step( epoch=current_epoch, batch_idx=batch_idx, optimizer=optimizer, @@ -118,6 +118,7 @@ def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_cl ) self.precision_plugin.post_optimizer_step() + return res def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): model_ref = self.model_ref @@ -134,7 +135,7 @@ def clip_gradients(self, optimizer, clip_val=None): return self._clip_gradients(optimizer, grad_clip_val) - model = self.model_ref + model = self.lightning_module # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX if self.trainer.amp_backend == AMPType.APEX: @@ -198,6 +199,7 @@ def to_device(self, batch): return self.batch_to_device(batch, self.root_device) + class NewCPUAccelerator(NewAccelerator): def setup(self, trainer, model): if isinstance(self.precision_plugin, MixedPrecisionPlugin): From 4c8d24fb27689b3894d3521bda140675fd12a697 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 16:44:36 +0100 Subject: [PATCH 022/157] add properties for accelerator forwarding --- pytorch_lightning/trainer/trainer.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 94c698cfb8501..e114db42956ae 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -445,6 +445,30 @@ def __init__( # Callback system self.on_init_end() + @property + def optimizers(self): + return self.accelerator_backend.optimizers + + @optimizers.setter + def optimizers(self, new_optims): + self.accelerator_backend.optimizers = new_optims + + @property + def lr_schedulers(self): + return self.accelerator_backend.lr_schedulers + + @lr_schedulers.setter + def lr_schedulers(self, new_schedulers): + self.accelerator_backend.lr_schedulers = new_schedulers + + @property + def optimizer_frequencies(self): + return self.accelerator_backend.optimizer_frequencies + + @optimizer_frequencies.setter + def optimizer_frequencies(self, new_freqs): + self.accelerator_backend.optimizer_frequencies = new_freqs + def fit( self, model: LightningModule, From 6faebfa4f5bad37d52ec66c20ab35e8ae83bf6b7 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 16:44:55 +0100 Subject: [PATCH 023/157] correct optimizer_step calls --- pytorch_lightning/trainer/training_loop.py | 27 ++++------------------ 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 25540791209ff..5dcf17f99f7a7 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -484,28 +484,11 @@ def _process_result(self, training_step_output, split_batch): return training_step_output_for_epoch_end def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure): - model_ref = self.trainer.get_model() - - is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) 
- using_native_amp = self.trainer.amp_backend == AMPType.NATIVE - - # native amp + lbfgs is a no go right now - if using_native_amp and is_lbfgs: - raise MisconfigurationException( - 'native PyTorch amp and lbfgs are not compatible.' - ' To request, please file a Github issue in PyTorch and tag @mcarilli') - - # model hook - model_ref.optimizer_step( - self.trainer.current_epoch, - batch_idx, - optimizer, - opt_idx, - train_step_and_backward_closure, - on_tpu=self.trainer._device_type == DeviceType.TPU and _TPU_AVAILABLE, - using_native_amp=using_native_amp, - using_lbfgs=is_lbfgs, - ) + with self.trainer.profiler.profile("optimizer_step"): + # optimizer step lightningModule hook + self.trainer.accelerator_backend.optimizer_step( + optimizer, self.trainer.current_epoch, batch_idx, opt_idx, train_step_and_backward_closure + ) def on_before_zero_grad(self, optimizer): self.trainer.call_hook('on_before_zero_grad', optimizer) From 29568e1d6200089dd9cccbf2592df80f94e7832b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Dec 2020 16:24:55 +0100 Subject: [PATCH 024/157] call train or test --- pytorch_lightning/trainer/trainer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e114db42956ae..6e91dddb32b12 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -521,8 +521,8 @@ def fit( # self.accelerator_backend.train_loop = self.train # self.accelerator_backend.validation_loop = self.run_evaluation # self.accelerator_backend.test_loop = self.run_evaluation + self.train_loop.setup_training(model) - self.train() # ---------------------------- # TRAIN @@ -530,7 +530,11 @@ def fit( # hook self.call_hook('on_fit_start') - results = self.accelerator_backend.train() + if self.testing: + results = self.run_test() + else: + results = self.train() + self.accelerator_backend.teardown() # ---------------------------- From 33561d779950747e7dd007bca4e92d13c7f26a59 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 7 Dec 2020 17:01:40 +0100 Subject: [PATCH 025/157] make calls to trainstep (ad fix bugs) --- pytorch_lightning/accelerators/accelerator.py | 10 ++++-- pytorch_lightning/accelerators/precision.py | 31 ++++++++++++------- pytorch_lightning/trainer/training_loop.py | 3 ++ 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index e2f044fab612f..7726f143093d5 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -94,7 +94,7 @@ def process_dataloader(self, dataloader): return dataloader def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): - return self.precision_plugin.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + return self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, opt_idx, *args, **kwargs) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! 
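# Hedged sketch (stand-in signatures, not the real hook API) of the optimizer-step flow the
# loop now delegates: the training loop hands a closure to the accelerator, which brackets the
# LightningModule's `optimizer_step` hook with the precision plugin's pre/post hooks instead
# of stepping the optimizer itself.
def accelerator_optimizer_step(precision_plugin, lightning_module, optimizer,
                               current_epoch, batch_idx, opt_idx, lambda_closure):
    precision_plugin.pre_optimizer_step(optimizer, opt_idx)
    result = lightning_module.optimizer_step(     # model hook decides how/when to step
        epoch=current_epoch,
        batch_idx=batch_idx,
        optimizer=optimizer,
        optimizer_idx=opt_idx,
        optimizer_closure=lambda_closure,
    )
    precision_plugin.post_optimizer_step(optimizer, opt_idx)
    return result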
@@ -117,11 +117,11 @@ def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_cl using_lbfgs=is_lbfgs, ) - self.precision_plugin.post_optimizer_step() + self.precision_plugin.post_optimizer_step(optimizer, opt_idx) return res def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): - model_ref = self.model_ref + model_ref = self.lightning_module model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val=None): @@ -129,6 +129,10 @@ def clip_gradients(self, optimizer, clip_val=None): grad_clip_val = self.gradient_clip_val if clip_val is not None: grad_clip_val = clip_val + + if grad_clip_val is None: + return + grad_clip_val = float(grad_clip_val) if grad_clip_val <= 0: diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index ca41e8242f104..d0db65fa12dbb 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -33,6 +33,21 @@ def master_params(self, optimizer): def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): return model, optimizers, lr_schedulers + def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *args, **kwargs): + # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) + automatic_optimization = model.automatic_optimization + + # do backward pass + if automatic_optimization: + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + + return closure_loss + class MixedPrecisionPlugin(PrecisionPlugin): EPSILON = 1e-5 @@ -55,21 +70,13 @@ def pre_optimizer_step(self, optimizer, optimizer_idx): def post_optimizer_step(self, optimizer, optimizer_idx): self.scaler.update() - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = self.scaler.scale(closure_loss) # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) - automatic_optimization = self.trainer.train_loop.automatic_optimization + automatic_optimization = model.automatic_optimization - # do backward pass - if automatic_optimization: - model = self.trainer.get_model() - model.backward(closure_loss, optimizer, opt_idx) - else: - closure_loss.backward(*args, **kwargs) - - # once backward has been applied, release graph - closure_loss = closure_loss.detach() + closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, *args, **kwargs) # unscale gradient to allow analyze within `on_after_backward` # TODO: Check from where we can get the should_accumulate value (maybe pass it as argument?) 
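The TODO above asks where `should_accumulate` should come from; a later patch in this series passes it into `backward` explicitly. A minimal sketch of the native-AMP backward contract shown in this hunk (scale the loss, run backward, detach the graph, and only unscale once the optimizer is actually about to step), assuming a CUDA-enabled PyTorch build so the scaler is active; the class name is illustrative and `model` is kept only to mirror the signature in the diff:

import torch


class _SketchNativeAMPBackward:
    def __init__(self):
        self.scaler = torch.cuda.amp.GradScaler()

    def backward(self, model, closure_loss, optimizer, opt_idx, should_accumulate):
        closure_loss = self.scaler.scale(closure_loss)
        closure_loss.backward()
        # release the graph once the backward pass has run
        closure_loss = closure_loss.detach()
        if not should_accumulate:
            # expose real (unscaled) gradients to hooks such as on_after_backward
            # and to gradient clipping before the optimizer step
            self.scaler.unscale_(optimizer)
        return closure_loss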
@@ -101,7 +108,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = closure_loss.__enter__() # do backward pass - if self.trainer.train_loop.automatic_optimization: + if self.lightning_module: model = self.trainer.get_model() model.backward(closure_loss, optimizer, opt_idx) else: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 5dcf17f99f7a7..231d303c2942f 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -493,6 +493,9 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ def on_before_zero_grad(self, optimizer): self.trainer.call_hook('on_before_zero_grad', optimizer) + def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): + self.trainer.accelerator_backend.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) + def track_and_norm_grad(self, optimizer): # track gradient norms grad_norm_dic = self._track_gradient_norm() From ef947554c29a83dffed0039e27683fbc42ba8dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Dec 2020 17:26:58 +0100 Subject: [PATCH 026/157] remove gradient_clip_val from accelerator --- pytorch_lightning/accelerators/accelerator.py | 11 ++++------- .../accelerators/accelerator_connector.py | 1 - pytorch_lightning/trainer/training_loop.py | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 7726f143093d5..3d6f4ef92cea7 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -20,11 +20,9 @@ def __init__( self, precision_plugin: PrecisionPlugin, training_type_plugin: TrainingTypePlugin, - gradient_clip_val, ): self.precision_plugin = precision_plugin self.training_type_plugin = training_type_plugin - self.gradient_clip_val = gradient_clip_val self.optimizers = None self.lr_schedulers = None @@ -124,12 +122,11 @@ def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): model_ref = self.lightning_module model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) - def clip_gradients(self, optimizer, clip_val=None): - # use the trainer's clip val if none passed - grad_clip_val = self.gradient_clip_val - if clip_val is not None: - grad_clip_val = clip_val + def clip_gradients(self, optimizer, clip_val): + # TODO: separate TPU case from here + self._clip_gradients(optimizer, clip_val) + def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val is None: return diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index b7486d60a47b0..2412da6e0d773 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -166,7 +166,6 @@ def select_accelerator(self, accelerator: Union[str, NewAccelerator]): return acc_cls( precision_plugin=self.select_precision_plugin(), training_type_plugin=self.select_training_type_plugin(), - gradient_clip_val=None, ) def set_distributed_mode(self): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 231d303c2942f..0087f5d36f52c 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -501,7 +501,7 @@ def track_and_norm_grad(self, optimizer): 
grad_norm_dic = self._track_gradient_norm() # clip gradients - self.trainer.accelerator_backend.clip_gradients(optimizer) + self.trainer.accelerator_backend.clip_gradients(optimizer, self.trainer.gradient_clip_val) self._cur_grad_norm_dict = grad_norm_dic def _track_gradient_norm(self): From c5e989283d251a7dcd76d32fa7c3d6f8bb0c845c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Dec 2020 17:44:35 +0100 Subject: [PATCH 027/157] add back the step end methods --- pytorch_lightning/accelerators/accelerator.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3d6f4ef92cea7..59d011d4de163 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -88,6 +88,15 @@ def test_step(self, args): with self.training_type_plugin.test_step_context(): return self.lightning_module.test_step(*args) + def training_step_end(self, output): + return output + + def test_step_end(self, output): + return output + + def validation_step_end(self, output): + return output + def process_dataloader(self, dataloader): return dataloader From c02baadc2323f13fa51057aef0a7f96edaa6818f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Dec 2020 17:45:57 +0100 Subject: [PATCH 028/157] add precision todo comment --- pytorch_lightning/accelerators/accelerator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 59d011d4de163..bfd4ba5ad86ac 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -148,6 +148,7 @@ def _clip_gradients(self, optimizer, grad_clip_val): model = self.lightning_module # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX + # ... 
or we call master_params() and in the default plugin we return the model.parameters() if self.trainer.amp_backend == AMPType.APEX: parameters = self.precision_plugin.master_params(optimizer) else: From ce4eafa532bffd2488eb941cb04f47e65ffb7170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:16:45 +0100 Subject: [PATCH 029/157] ddp --- pl_examples/bug_report_model.py | 23 +++------- pytorch_lightning/accelerators/accelerator.py | 4 +- .../accelerators/accelerator_connector.py | 32 ++++++++------ .../accelerators/data_parallel.py | 42 +++++++++++++------ .../trainer/connectors/env_vars_connector.py | 5 +++ pytorch_lightning/trainer/properties.py | 19 ++++++++- pytorch_lightning/trainer/trainer.py | 6 ++- pytorch_lightning/trainer/training_loop.py | 7 +--- pytorch_lightning/utilities/device_parser.py | 8 ++-- 9 files changed, 89 insertions(+), 57 deletions(-) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index 1351048711df4..f480847938e6f 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -36,10 +36,8 @@ class RandomDataset(Dataset): def __init__(self, size, length): self.len = length self.data = torch.randn(length, size) - def __getitem__(self, index): return self.data[index] - def __len__(self): return self.len @@ -55,63 +53,52 @@ class BoringModel(LightningModule): def __init__(self): """ Testing PL Module - Use as follows: - subclass - modify the behavior for what you want - class TestModel(BaseTestModel): def training_step(...): # do your own thing - or: - model = BaseTestModel() model.training_epoch_end = None - """ super().__init__() self.layer = torch.nn.Linear(32, 2) + @property + def automatic_optimization(self): + return True + def forward(self, x): return self.layer(x) - def loss(self, batch, prediction): # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - def step(self, x): - x = self.layer(x) + x = self(x) out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) return out - def training_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"loss": loss} - def training_step_end(self, training_step_outputs): return training_step_outputs - def training_epoch_end(self, outputs) -> None: torch.stack([x["loss"] for x in outputs]).mean() - def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"x": loss} - def validation_epoch_end(self, outputs) -> None: torch.stack([x['x'] for x in outputs]).mean() - def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"y": loss} - def test_epoch_end(self, outputs) -> None: torch.stack([x["y"] for x in outputs]).mean() - def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index bfd4ba5ad86ac..82f822c16a918 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -29,7 +29,9 @@ def __init__( self.optimizer_frequencies = None def setup(self, trainer, model): + print(trainer.global_rank, "Accelerator.setup") self.connect_training_type_plugin(self.training_type_plugin, model) + 
self.training_type_plugin.setup(model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) @@ -53,7 +55,7 @@ def teardown(self): pass def batch_to_device(self, batch: Any, device: torch.device): - model = self.model + model = self.lightning_module if model is not None: return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 2412da6e0d773..a9327c87138ed 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin from pytorch_lightning.accelerators.precision import PrecisionPlugin from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities import rank_zero_only @@ -48,7 +48,6 @@ def __init__( self, num_processes, tpu_cores, - accelerator, distributed_backend, auto_select_gpus, gpus, @@ -89,7 +88,6 @@ def __init__( # for gpus allow int, string and gpu list # if auto_select_gpus and isinstance(gpus, int): # self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus) - self.parallel_devices = device_parser.parse_gpu_ids(self.gpus) self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_devices) # self.root_device = torch.device("cpu") @@ -97,7 +95,7 @@ def __init__( self.set_distributed_mode() # todo: select accelerator based on trainer flags - self.accelerator = self.select_accelerator(accelerator) + self.accelerator = self.select_accelerator() # override dist backend when using tpus if self.on_tpu: @@ -148,15 +146,23 @@ def select_precision_plugin(self): return PrecisionPlugin() def select_training_type_plugin(self): - return SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) - - def select_accelerator(self, accelerator: Union[str, NewAccelerator]): - - # return NewCPUAccelerator( - # precision_plugin=PrecisionPlugin(), - # training_type_plugin=SingleDevicePlugin(device=torch.device("cpu")), - # gradient_clip_val=None - # ) + if self.distributed_backend == "ddp": + plugin = DDPPlugin( + parallel_device_ids=self.parallel_devices, + num_nodes=self.num_nodes, + logger=None, + cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? 
+ is_slurm_managing_tasks=False, # TODO: determine this + ) + else: + # TODO: cover all other cases + plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) + return plugin + + def select_accelerator(self): + if isinstance(self.distributed_backend, NewAccelerator): + # custom accelerator from user + return self.distributed_backend if self.on_gpu: acc_cls = NewGPUAccelerator diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 9d0b47c1ee345..0e63bc2b91f03 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -53,7 +53,7 @@ def on_gpu(self): @property @abstractmethod - def root_device(self): + def root_device(self) -> torch.device: raise NotImplementedError @abstractmethod @@ -203,7 +203,7 @@ def reduce(self, output): @property def root_device(self): - return self.parallel_device_ids[0] + return torch.device("cuda", self.parallel_device_ids[0]) @property def lightning_module(self): @@ -220,15 +220,28 @@ class DDPPlugin(ParallelPlugin): distributed_backend = "ddp" - def __init__(self, parallel_device_ids, logger=None, cluster_environment=None) -> None: + def __init__( + self, + parallel_device_ids, + num_nodes=1, + logger=None, + cluster_environment=None, + is_slurm_managing_tasks=False, + **kwargs: Dict[str, Any], + ) -> None: super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) - self._has_spawned_children = False self.interactive_ddp_procs = [] self.dist = LightningDistributed() + self.num_nodes = num_nodes + self.is_slurm_managing_tasks = is_slurm_managing_tasks + self._ddp_kwargs = kwargs + self._has_spawned_children = False + self.task_idx = None + self.num_processes = len(parallel_device_ids) @property def root_device(self): - return self.parallel_device_ids[self.local_rank] + return torch.device("cuda", self.parallel_device_ids[self.local_rank]) def determine_local_rank(self): if self.is_slurm_managing_tasks: @@ -243,6 +256,7 @@ def determine_node_rank(self): return super().determine_node_rank() def setup(self, model): + print("DDPPlugin.setup") self._model = model @@ -302,7 +316,7 @@ def _call_children_scripts(self): if self.logger is not None: os.environ["PL_EXP_VERSION"] = str(self.logger.version) - num_gpus = len(self.data_parallel_device_ids) + num_gpus = len(self.parallel_device_ids) # TODO: Add num_nodes (pass it in?) os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" @@ -354,7 +368,7 @@ def configure_ddp(self): ) def determine_ddp_device_ids(self): - return [self.root_device] + return [self.root_device.index] def init_ddp_connection(self, global_rank: int, world_size: int) -> None: # TODO: From where to get cluster environment? @@ -390,7 +404,7 @@ def pre_training(self): # try to init for 20 times at max in case ports are taken # where to store ip_table # TODO: CHeck is_slurm_managing_tasks - self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) + self.init_ddp_connection(self.global_rank, self.world_size) # TODO: Move this somewhere else # self.trainer.call_setup_hook(self.model) @@ -402,6 +416,11 @@ def pre_training(self): log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") log.info("-" * 100) + # TODO: I moved this from training loop to here, is it the right place? 
+ # set the ranks and devices + self.dist.rank = self.global_rank + self.dist.device = self.root_device + self.model = self.configure_sync_batchnorm(self.model) # move the model to the correct device @@ -450,14 +469,11 @@ def model_to_device(self): # TODO: Can we easily make this a property that falls back here? # self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank] torch.cuda.set_device(self.root_device) - self.model.cuda(self.root_device) - - def reduce(self, output, group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None): + self.model.to(self.root_device) + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) - return output diff --git a/pytorch_lightning/trainer/connectors/env_vars_connector.py b/pytorch_lightning/trainer/connectors/env_vars_connector.py index e4d5670b5fe78..29a6dd137c021 100644 --- a/pytorch_lightning/trainer/connectors/env_vars_connector.py +++ b/pytorch_lightning/trainer/connectors/env_vars_connector.py @@ -28,6 +28,9 @@ def overwrite_by_env_vars(fn: Callable) -> Callable: def overwrite_by_env_vars(self, *args, **kwargs): # get the class cls = self.__class__ + + print("before", kwargs["gpus"]) + if args: # inace any args passed move them to kwargs # parse only the argument names cls_arg_names = [arg[0] for arg in get_init_arguments_and_types(cls)] @@ -37,6 +40,8 @@ def overwrite_by_env_vars(self, *args, **kwargs): # todo: maybe add a warning that some init args were overwritten by Env arguments kwargs.update(vars(parse_env_variables(cls))) + print("after", kwargs["gpus"]) + # all args were already moved to kwargs return fn(self, **kwargs) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 6dc6802bc9021..cb613dc087691 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -76,6 +76,11 @@ def accelerator_backend(self): # for backward compatibility return self.accelerator + @property + def distributed_backend(self): + # for backward compatibility + return self.accelerator_connector.distributed_backend + @property def training_type_plugin(self): return self.accelerator.training_type_plugin @@ -128,6 +133,14 @@ def use_tpu(self): return False # return self.accelerator_connector.use_tpu + @property + def num_nodes(self): + return self.accelerator_connector.num_gpus + + @property + def num_processes(self): + return self.accelerator_connector.num_processes + @property def log_dir(self): if self.checkpoint_callback is not None: @@ -261,7 +274,7 @@ def disable_validation(self) -> bool: @property def enable_validation(self) -> bool: """ Check if we should run validation during training. 
""" - model_ref = self.model_connector.get_model() + model_ref = self.get_model() val_loop_enabled = is_overridden('validation_step', model_ref) and self.limit_val_batches > 0 return val_loop_enabled @@ -323,7 +336,9 @@ def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) def get_model(self): - return self.model_connector.get_model() + # TODO: rename this to lightning_module (see training type plugin) + # backward compatible + return self.training_type_plugin.lightning_module def __getstate__(self): # unwrap optimizer diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6e91dddb32b12..b71d9ced7e0e6 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -311,6 +311,10 @@ def __init__( self._distrib_type = None self._running_stage = None + distributed_backend = distributed_backend or accelerator + + print("gpus passed into trainer", gpus) + # init connectors self.dev_debugger = InternalDebugger(self) self.config_validator = ConfigValidator(self) @@ -319,7 +323,6 @@ def __init__( self.accelerator_connector = BackendConnector( num_processes, tpu_cores, - accelerator, distributed_backend, auto_select_gpus, gpus, @@ -513,6 +516,7 @@ def fit( # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) + self.training_type_plugin.pre_training() # ---------------------------- # INSPECT THESE FOR MAIN LOOPS diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 0087f5d36f52c..28bbb5a4f722c 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -130,13 +130,10 @@ def setup_training(self, model: LightningModule): # -------------------------- # Setup?? # -------------------------- + # ref_model = self.trainer.get_model() + print(self.trainer.global_rank, type(model)) ref_model = self.trainer.get_model() - # set the ranks and devices - # TODO dist was a AttributeDict, should be moved to plugin? - # self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank - # self.trainer.accelerator_backend.dist.device = ref_model.device - # give model convenience properties ref_model.trainer = self.trainer diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index b1bd62277aa18..9417bc13e8e8b 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, MutableSequence, Optional, Union +from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch @@ -146,9 +146,9 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: return gpus -def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int]]) -> Optional[List[int]]: +def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int], Tuple[int, ...]]) -> Optional[List[int]]: assert gpus is not None - if isinstance(gpus, MutableSequence): + if isinstance(gpus, (list, tuple)): return list(gpus) # must be an int @@ -177,7 +177,7 @@ def _check_data_type(device_ids: Any) -> None: device_ids: gpus/tpu_cores parameter as passed to the Trainer """ if device_ids is not None and \ - (not isinstance(device_ids, (int, str, MutableSequence)) or isinstance(device_ids, bool)): + (not isinstance(device_ids, (int, str, list, tuple)) or isinstance(device_ids, bool)): raise MisconfigurationException("Device ID's (GPU/TPU) must be int, string or sequence of ints or None.") From e6ba00982c3a784851227fee0ee872a92fa4bcb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:21:06 +0100 Subject: [PATCH 030/157] clean up --- pytorch_lightning/accelerators/accelerator.py | 1 - pytorch_lightning/accelerators/data_parallel.py | 2 -- pytorch_lightning/trainer/connectors/env_vars_connector.py | 4 ---- pytorch_lightning/trainer/trainer.py | 2 -- pytorch_lightning/trainer/training_loop.py | 4 +--- 5 files changed, 1 insertion(+), 12 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 82f822c16a918..6bc7cdeca612b 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -29,7 +29,6 @@ def __init__( self.optimizer_frequencies = None def setup(self, trainer, model): - print(trainer.global_rank, "Accelerator.setup") self.connect_training_type_plugin(self.training_type_plugin, model) self.training_type_plugin.setup(model) self.setup_optimizers(trainer, model) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 0e63bc2b91f03..801015afaff79 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -256,8 +256,6 @@ def determine_node_rank(self): return super().determine_node_rank() def setup(self, model): - print("DDPPlugin.setup") - self._model = model # start the other scripts diff --git a/pytorch_lightning/trainer/connectors/env_vars_connector.py b/pytorch_lightning/trainer/connectors/env_vars_connector.py index 29a6dd137c021..6b907d288c5ca 100644 --- a/pytorch_lightning/trainer/connectors/env_vars_connector.py +++ b/pytorch_lightning/trainer/connectors/env_vars_connector.py @@ -29,8 +29,6 @@ def overwrite_by_env_vars(self, *args, **kwargs): # get the class cls = self.__class__ - print("before", kwargs["gpus"]) - if args: # inace any args passed move them to kwargs # parse only the argument names cls_arg_names = [arg[0] for arg in get_init_arguments_and_types(cls)] @@ -40,8 +38,6 @@ def overwrite_by_env_vars(self, *args, **kwargs): # todo: maybe add a warning that some init args were overwritten by Env arguments kwargs.update(vars(parse_env_variables(cls))) - print("after", kwargs["gpus"]) - # all args were already moved to kwargs return fn(self, **kwargs) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b71d9ced7e0e6..6582fa6421c80 
100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -313,8 +313,6 @@ def __init__( distributed_backend = distributed_backend or accelerator - print("gpus passed into trainer", gpus) - # init connectors self.dev_debugger = InternalDebugger(self) self.config_validator = ConfigValidator(self) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 28bbb5a4f722c..e8aefb53ad699 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -130,9 +130,7 @@ def setup_training(self, model: LightningModule): # -------------------------- # Setup?? # -------------------------- - # ref_model = self.trainer.get_model() - print(self.trainer.global_rank, type(model)) - ref_model = self.trainer.get_model() + ref_model = model # give model convenience properties ref_model.trainer = self.trainer From fa4d84432ea4a8857f54481fef6b3245c5108fcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:33:16 +0100 Subject: [PATCH 031/157] connect --- pl_examples/bug_report_model.py | 12 ++++++++++++ pytorch_lightning/accelerators/accelerator.py | 2 +- pytorch_lightning/accelerators/data_parallel.py | 2 -- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index f480847938e6f..03ccd47e09d97 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -36,8 +36,10 @@ class RandomDataset(Dataset): def __init__(self, size, length): self.len = length self.data = torch.randn(length, size) + def __getitem__(self, index): return self.data[index] + def __len__(self): return self.len @@ -72,33 +74,43 @@ def automatic_optimization(self): def forward(self, x): return self.layer(x) + def loss(self, batch, prediction): # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) + def step(self, x): x = self(x) out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) return out + def training_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"loss": loss} + def training_step_end(self, training_step_outputs): return training_step_outputs + def training_epoch_end(self, outputs) -> None: torch.stack([x["loss"] for x in outputs]).mean() + def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"x": loss} + def validation_epoch_end(self, outputs) -> None: torch.stack([x['x'] for x in outputs]).mean() + def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"y": loss} + def test_epoch_end(self, outputs) -> None: torch.stack([x["y"] for x in outputs]).mean() + def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 6bc7cdeca612b..8f38c70d69cc0 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -30,7 +30,7 @@ def __init__( def setup(self, trainer, model): self.connect_training_type_plugin(self.training_type_plugin, model) - self.training_type_plugin.setup(model) + # self.training_type_plugin.setup(model) 
self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 801015afaff79..586597656bb30 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -139,7 +139,6 @@ def model_to_device(self): def connect(self, model: torch.nn.Module): self._model = model self.model_to_device() - return self.model @property @@ -180,7 +179,6 @@ def setup(self, model): def connect(self, model): self.setup(model) - return self.model @property From 8be82a43ebf3dbb194ca017c44f5ae19bb73895a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:36:34 +0100 Subject: [PATCH 032/157] clean up --- pytorch_lightning/accelerators/accelerator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8f38c70d69cc0..a1eb3f4db1d12 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -30,7 +30,6 @@ def __init__( def setup(self, trainer, model): self.connect_training_type_plugin(self.training_type_plugin, model) - # self.training_type_plugin.setup(model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) From 08ce7d323a143d1ffb1f46c15e81f477840658f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 00:40:03 +0100 Subject: [PATCH 033/157] post --- pytorch_lightning/trainer/trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6582fa6421c80..8ca7bedd76ccb 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -514,6 +514,8 @@ def fit( # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) + + # TODO: is calling pre-training the correct place here @justus? self.training_type_plugin.pre_training() # ---------------------------- @@ -537,6 +539,8 @@ def fit( else: results = self.train() + # TODO: is calling post training the correct place here @justus? + self.training_type_plugin.post_training(results, self.checkpoint_callback.best_model_path) self.accelerator_backend.teardown() # ---------------------------- From ffbcd4fa80d34f792f5905093f764a5ab4bf7649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Dec 2020 02:45:49 +0100 Subject: [PATCH 034/157] disable progress bar on rank > 0 --- pytorch_lightning/accelerators/data_parallel.py | 13 ------------- pytorch_lightning/trainer/training_loop.py | 3 +++ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 586597656bb30..3946109fc2a13 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -382,20 +382,12 @@ def pre_training(self): if seed is not None: seed_everything(int(seed)) - # show progressbar only on progress_rank 0 - # TODO: check where to move this. 
Cannot stay here, since we won't have access to progressbar here - # if (self.node_rank != 0 or self.task_idx != 0) and self.trainer.progress_bar_callback is not None: - # self.trainer.progress_bar_callback.disable() - # determine which process we are and world size self.set_world_ranks() # set warning rank rank_zero_only.rank = self.global_rank - # TODO: This has to be done somewhere else! - # self.model.trainer = self.trainer - # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table @@ -412,7 +404,6 @@ def pre_training(self): log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") log.info("-" * 100) - # TODO: I moved this from training loop to here, is it the right place? # set the ranks and devices self.dist.rank = self.global_rank self.dist.device = self.root_device @@ -422,10 +413,6 @@ def pre_training(self): # move the model to the correct device self.model_to_device() - # TODO: Check where this can be moved - # set model properties before going into wrapper - # self.trainer.model_connector.copy_trainer_model_properties(self.model) - self.configure_ddp() self.barrier() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index e8aefb53ad699..8a69046752088 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -157,6 +157,9 @@ def setup_training(self, model: LightningModule): # register auto-resubmit when on SLURM self.trainer.slurm_connector.register_slurm_signal_handlers() + if not self.trainer.is_global_zero and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + # -------------------------- # Pre-train # -------------------------- From 4be76bf7a480bdb1b15fe15f20cf42397c501b1c Mon Sep 17 00:00:00 2001 From: justusschock Date: Thu, 10 Dec 2020 08:45:34 +0100 Subject: [PATCH 035/157] precision test --- pytorch_lightning/accelerators/accelerator.py | 4 +- pytorch_lightning/accelerators/precision.py | 40 +++++++++++++++---- pytorch_lightning/trainer/training_loop.py | 6 ++- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index a1eb3f4db1d12..567badcd70c32 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -100,8 +100,8 @@ def validation_step_end(self, output): def process_dataloader(self, dataloader): return dataloader - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): - return self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, opt_idx, *args, **kwargs) + def backward(self, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs): + return self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! 
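With `should_accumulate` now a parameter of `Accelerator.backward`, the value has to originate in the training loop, which is the only place that knows the accumulation schedule; the training_loop.py hunk further below does exactly that. A rough sketch of the flow — the accumulation rule here is simplified (the real loop also handles the last batches of an epoch), and `_SketchTrainLoop` is not a real class:

class _SketchTrainLoop:
    def __init__(self, accelerator, accumulate_grad_batches=2):
        self.accelerator = accelerator
        self.accumulate_grad_batches = accumulate_grad_batches
        self.batch_idx = 0

    def should_accumulate(self):
        # keep accumulating until the configured number of batches has been seen
        return (self.batch_idx + 1) % self.accumulate_grad_batches != 0

    def backward(self, loss, optimizer, opt_idx):
        # hand the decision down; the accelerator forwards it to the precision plugin
        return self.accelerator.backward(loss, optimizer, opt_idx, self.should_accumulate())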
diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index d0db65fa12dbb..9733aadf96a33 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -33,7 +33,16 @@ def master_params(self, optimizer): def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): return model, optimizers, lr_schedulers - def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *args, **kwargs): + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) automatic_optimization = model.automatic_optimization @@ -70,7 +79,16 @@ def pre_optimizer_step(self, optimizer, optimizer_idx): def post_optimizer_step(self, optimizer, optimizer_idx): self.scaler.update() - def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *args, **kwargs): + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): closure_loss = self.scaler.scale(closure_loss) # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) @@ -79,8 +97,7 @@ def backward(self, model: LightningModule, closure_loss, optimizer, opt_idx, *ar closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, *args, **kwargs) # unscale gradient to allow analyze within `on_after_backward` - # TODO: Check from where we can get the should_accumulate value (maybe pass it as argument?) 
- if not self.trainer.train_loop.should_accumulate() and automatic_optimization: + if not should_accumulate and automatic_optimization: self.scaler.unscale_(optimizer) return closure_loss @@ -100,7 +117,16 @@ def connect(self, model, optimizers, lr_schedulers): reinit_scheduler_properties(optimizers, lr_schedulers) return model, optimizers, lr_schedulers - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): closure_loss = amp.scale_loss(closure_loss, optimizer) # enter apex context @@ -108,8 +134,8 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = closure_loss.__enter__() # do backward pass - if self.lightning_module: - model = self.trainer.get_model() + # TODO: not entirely sure, why we need this + if model is not None and isinstance(model, LightningModule): model.backward(closure_loss, optimizer, opt_idx) else: closure_loss.backward(*args, **kwargs) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 8a69046752088..3b9e704f840b8 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -818,12 +818,14 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, def backward(self, result, optimizer, opt_idx, *args, **kwargs): self.trainer.dev_debugger.track_event("backward_call") + should_accumulate = self.should_accumulate() + # backward can be called manually in the training loop if isinstance(result, torch.Tensor): - self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, *args, **kwargs) + self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, should_accumulate, *args, **kwargs) else: result.closure_loss = self.trainer.accelerator_backend.backward( - result.closure_loss, optimizer, opt_idx, *args, **kwargs + result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs ) if not self.should_accumulate(): From 098f6650fd43dfa0ebb244c5671d62bce1ee75c2 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Thu, 10 Dec 2020 10:01:30 +0100 Subject: [PATCH 036/157] fix native amp --- pytorch_lightning/accelerators/accelerator.py | 39 ++++++-- .../accelerators/accelerator_connector.py | 40 ++++++++- pytorch_lightning/accelerators/precision.py | 2 +- pytorch_lightning/trainer/trainer.py | 90 ++++++++++++------- 4 files changed, 127 insertions(+), 44 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 567badcd70c32..722328dd66325 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,7 +1,7 @@ from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities import AMPType, NATIVE_AMP_AVALAIBLE from typing import Any, Union import math @@ -9,13 +9,17 @@ from torch.optim import Optimizer from pytorch_lightning.core import LightningModule -from pytorch_lightning.accelerators.precision import MixedPrecisionPlugin, PrecisionPlugin +from pytorch_lightning.accelerators.precision import ( + ApexMixedPrecisionPlugin, + 
MixedPrecisionPlugin, + NativeMixedPrecisionPlugin, + PrecisionPlugin, +) from pytorch_lightning.utilities.apply_func import move_data_to_device class NewAccelerator(object): - def __init__( self, precision_plugin: PrecisionPlugin, @@ -101,14 +105,18 @@ def process_dataloader(self, dataloader): return dataloader def backward(self, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs): - return self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) + return self.precision_plugin.backward( + self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs + ) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): # TODO: Check out if this can be simplified with new LightningOptimizer! model_ref = self.lightning_module is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.NATIVE + native_amp = ( + isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.NATIVE + ) self.precision_plugin.pre_optimizer_step(optimizer, opt_idx) @@ -138,7 +146,7 @@ def clip_gradients(self, optimizer, clip_val): def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val is None: return - + grad_clip_val = float(grad_clip_val) if grad_clip_val <= 0: @@ -209,6 +217,25 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin): def to_device(self, batch): return self.batch_to_device(batch, self.root_device) + @property + def amp_backend(self): + if isinstance(self.precision_plugin, ApexMixedPrecisionPlugin): + return AMPType.APEX + elif isinstance(self.precision_plugin, NativeMixedPrecisionPlugin): + return AMPType.NATIVE + else: + return None + + @property + def precision(self): + return self.precision_plugin.precision + + @property + def scaler(self): + if hasattr(self.precision_plugin, 'scaler'): + return self.precision_plugin.scaler + + return None class NewCPUAccelerator(NewAccelerator): diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index a9327c87138ed..0dd945a4a0fa5 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -19,8 +19,8 @@ from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin -from pytorch_lightning.accelerators.precision import PrecisionPlugin -from pytorch_lightning.utilities import device_parser +from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -57,6 +57,9 @@ def __init__( benchmark, replace_sampler_ddp, deterministic, + precision, + amp_type, + amp_level ): # initialization @@ -77,6 +80,9 @@ def __init__( self.benchmark = benchmark self.replace_sampler_ddp = replace_sampler_ddp self.deterministic = deterministic + self.precision = precision + self.amp_type = None if amp_type is None else 
amp_type.lower() + self.amp_level = amp_level # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks @@ -143,7 +149,35 @@ def num_gpus(self) -> int: return len(gpus) def select_precision_plugin(self): - return PrecisionPlugin() + if self.precision == 32: + self.amp_type = None + return PrecisionPlugin() + + elif self.precision == 16: + if self.amp_type == 'native': + if not NATIVE_AMP_AVALAIBLE: + rank_zero_warn('You have asked for native AMP but your PyTorch version does not support it.' + ' Consider upgrading with `pip install torch>=1.6`.' + ' We will attempt to use NVIDIA Apex for this session.') + self.amp_type = 'apex' + else: + log.info('Using native 16bit precision.') + self.amp_type = AMPType.NATIVE + return NativeMixedPrecisionPlugin() + + if self.amp_type =='apex': + if not APEX_AVAILABLE: + rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' + ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') + else: + log.info('Using APEX 16bit precision.') + self.amp_type = AMPType.APEX + return ApexMixedPrecisionPlugin(self.amp_level) + + + + else: + raise NotImplementedError('We only support precisions 32 and 16!') def select_training_type_plugin(self): if self.distributed_backend == "ddp": diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index 9733aadf96a33..3ce68c8e1efc6 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -94,7 +94,7 @@ def backward( # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) automatic_optimization = model.automatic_optimization - closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, *args, **kwargs) + closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) # unscale gradient to allow analyze within `on_after_backward` if not should_accumulate and automatic_optimization: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8ca7bedd76ccb..bf07c17727d59 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,6 +15,7 @@ """Trainer to automate the training.""" import os +from pytorch_lightning.accelerators.precision import PrecisionPlugin import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union @@ -56,6 +57,7 @@ from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin from pytorch_lightning.trainer.properties import TrainerProperties from pytorch_lightning.trainer.states import RunningStage, TrainerState +from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector @@ -80,8 +82,9 @@ # warnings to ignore in trainer warnings.filterwarnings( - 'ignore', message='torch.distributed.reduce_op is deprecated, ' 'please use torch.distributed.ReduceOp instead' + "ignore", message="torch.distributed.reduce_op is deprecated, " "please use torch.distributed.ReduceOp instead" ) +os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning" class Trainer( @@ -128,7 +131,7 @@ def 
__init__( accelerator: Optional[Union[str, NewAccelerator]] = None, sync_batchnorm: bool = False, precision: int = 32, - weights_summary: Optional[str] = 'top', + weights_summary: Optional[str] = "top", weights_save_path: Optional[str] = None, num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, @@ -330,10 +333,13 @@ def __init__( benchmark, replace_sampler_ddp, deterministic, + precision, + amp_backend, + amp_level, ) self.logger_connector = LoggerConnector(self) self.model_connector = ModelConnector(self) - self.precision_connector = PrecisionConnector(self) + # self.precision_connector = PrecisionConnector(self) self.callback_connector = CallbackConnector(self) self.debugging_connector = DebuggingConnector(self) self.training_tricks_connector = TrainingTricksConnector(self) @@ -438,7 +444,7 @@ def __init__( ) # set precision - self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) + # self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) # last thing are the plugins which override whatever the trainer used by default self.plugin_connector.on_trainer_init(plugins) @@ -470,6 +476,18 @@ def optimizer_frequencies(self): def optimizer_frequencies(self, new_freqs): self.accelerator_backend.optimizer_frequencies = new_freqs + @property + def amp_backend(self): + return self.accelerator_backend.amp_backend + + @property + def precision(self): + return self.accelerator_backend.precision + + @property + def scaler(self): + return self.accelerator_backend.scaler + def fit( self, model: LightningModule, @@ -506,7 +524,7 @@ def fit( # bookkeeping # we reuse fit in .test() but change its behavior using this flag - self.testing = os.environ.get('PL_TESTING_MODE', self.testing) + self.testing = os.environ.get("PL_TESTING_MODE", self.testing) # ---------------------------- # SET UP TRAINING @@ -532,7 +550,7 @@ def fit( # TRAIN # ---------------------------- # hook - self.call_hook('on_fit_start') + self.call_hook("on_fit_start") if self.testing: results = self.run_test() @@ -547,12 +565,12 @@ def fit( # POST-Training CLEAN UP # ---------------------------- # hook - self.call_hook('on_fit_end') + self.call_hook("on_fit_end") # hook - self.teardown('fit') - if self.is_function_implemented('teardown'): - model.teardown('fit') + self.teardown("fit") + if self.is_function_implemented("teardown"): + model.teardown("fit") # return 1 when finished # used for testing or when we need to know that training succeeded @@ -597,7 +615,7 @@ def train(self): return # update LR schedulers - self.optimizer_connector.update_learning_rates(interval='epoch') + self.optimizer_connector.update_learning_rates(interval="epoch") # early stopping met_min_epochs = epoch >= self.min_epochs - 1 @@ -606,14 +624,18 @@ def train(self): if self.should_stop: if met_min_epochs and met_min_steps: return - log.info( - 'Trainer was signaled to stop but required minimum epochs' - f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has' - ' not been met. Training will continue...' - ) + else: + log.info( + "Trainer was signaled to stop but required minimum epochs" + f" ({self.min_epochs}) or minimum steps ({self.min_steps}) has" + " not been met. Training will continue..." + ) + + # hook + self.train_loop.on_train_end() except KeyboardInterrupt: - rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...') + rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...") # user could press ctrl+c many times... 
only shutdown once if not self.interrupted: @@ -744,7 +766,7 @@ def run_test(self): return eval_loop_results def run_sanity_check(self, ref_model): - using_val_step = ref_model.val_dataloader is not None and is_overridden('validation_step', ref_model) + using_val_step = ref_model.val_dataloader is not None and is_overridden("validation_step", ref_model) should_sanity_check = using_val_step and self.num_sanity_val_steps > 0 and self.limit_val_batches > 0 # run tiny validation (if validation defined) @@ -781,7 +803,7 @@ def test( self, model: Optional[LightningModule] = None, test_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, - ckpt_path: Optional[str] = 'best', + ckpt_path: Optional[str] = "best", verbose: bool = True, datamodule: Optional[LightningDataModule] = None, ): @@ -815,18 +837,18 @@ def test( # If you supply a datamodule you can't supply train_dataloader or val_dataloaders if test_dataloaders and datamodule: raise MisconfigurationException( - 'You cannot pass test_dataloaders to trainer.test if you supply a datamodule' + "You cannot pass test_dataloaders to trainer.test if you supply a datamodule" ) # Attach datamodule to get setup/prepare_data added to model before the call to it below - self.data_connector.attach_datamodule(model or self.get_model(), datamodule, 'test') + self.data_connector.attach_datamodule(model or self.get_model(), datamodule, "test") if model is not None: results = self.__test_given_model(model, test_dataloaders) else: results = self.__test_using_best_weights(ckpt_path, test_dataloaders) - self.teardown('test') + self.teardown("test") return results @@ -834,7 +856,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): model = self.get_model() # if user requests the best checkpoint but we don't have it, error - if ckpt_path == 'best' and not self.checkpoint_callback.best_model_path: + if ckpt_path == "best" and not self.checkpoint_callback.best_model_path: raise MisconfigurationException( 'ckpt_path is "best", but ModelCheckpoint is not configured to save the best model.' ) @@ -842,20 +864,20 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # load best weights if ckpt_path is not None: # ckpt_path is 'best' so load the best model - if ckpt_path == 'best': + if ckpt_path == "best": ckpt_path = self.checkpoint_callback.best_model_path if len(ckpt_path) == 0: rank_zero_warn( - f'.test() found no path for the best weights, {ckpt_path}. Please ' - f'specify a path for a checkpoint .test(ckpt_path=PATH)' + f".test() found no path for the best weights, {ckpt_path}. 
Please " + f"specify a path for a checkpoint .test(ckpt_path=PATH)" ) return {} if self.accelerator_backend is not None and not self._device_type == DeviceType.TPU: self.accelerator_backend.barrier() ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt['state_dict']) + model.load_state_dict(ckpt["state_dict"]) # attach dataloaders if test_dataloaders is not None: @@ -864,16 +886,16 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path self.testing = True - os.environ['PL_TESTING_MODE'] = '1' + os.environ["PL_TESTING_MODE"] = "1" self.model = model results = self.fit(model) self.testing = False - del os.environ['PL_TESTING_MODE'] + del os.environ["PL_TESTING_MODE"] # teardown - if self.is_function_implemented('teardown'): + if self.is_function_implemented("teardown"): model_ref = self.get_model() - model_ref.teardown('test') + model_ref.teardown("test") return results @@ -891,8 +913,8 @@ def __test_given_model(self, model, test_dataloaders): self.testing = False # teardown - if self.is_function_implemented('teardown'): - model.teardown('test') + if self.is_function_implemented("teardown"): + model.teardown("test") return results @@ -922,7 +944,7 @@ def tune( def call_setup_hook(self, model): # call setup after the ddp process has connected - stage_name = 'test' if self.testing else 'fit' + stage_name = "test" if self.testing else "fit" if self.datamodule is not None: called = self.datamodule.has_setup_test if self.testing else self.datamodule.has_setup_fit if not called: From ea856333b6b1f38c8e275ab080f528da1dfac5bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 12 Dec 2020 06:54:42 +0100 Subject: [PATCH 037/157] a From 846dc92ea535d2367c80a6eae2e1e28344fa32f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 12 Dec 2020 08:03:29 +0100 Subject: [PATCH 038/157] ddp spawn --- .../accelerators/accelerator_connector.py | 10 +- .../accelerators/data_parallel.py | 382 ++++++++++-------- 2 files changed, 221 insertions(+), 171 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 0dd945a4a0fa5..6c23caede81a9 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, device_parser from pytorch_lightning.utilities import rank_zero_only @@ -188,6 +188,14 @@ def select_training_type_plugin(self): cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? 
is_slurm_managing_tasks=False, # TODO: determine this ) + elif self.use_ddp and self.distributed_backend == "ddp_spawn": + plugin = DDPSpawnPlugin( + parallel_device_ids=self.parallel_devices, + num_nodes=self.num_nodes, + logger=None, + cluster_environment=TorchElasticEnvironment(), + is_slurm_managing_tasks=False, # TODO: determine this + ) else: # TODO: cover all other cases plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 3946109fc2a13..8e2420da82c76 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -185,6 +185,23 @@ def connect(self, model): def is_global_zero(self) -> bool: return self.global_rank == 0 + @staticmethod + def configure_sync_batchnorm(model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + return model + class DataParallelPlugin(ParallelPlugin): def setup(self, model): @@ -423,24 +440,6 @@ def post_training(self, results, best_model_path): if "WORLD_SIZE" in os.environ: del os.environ["WORLD_SIZE"] - @staticmethod - def configure_sync_batchnorm(model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - def barrier(self, *args, **kwargs): if torch_distrib.is_initialized(): torch_distrib.barrier() @@ -460,168 +459,211 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ return output -# class DDPSpawnPlugin(ParallelPlugin): -# def __init__(self, parallel_device_ids, logger=None, cluster_environment=None, proc_offset=0): -# super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) -# self.process_idx = None +class DDPSpawnPlugin(ParallelPlugin): + + distributed_backend = "ddp_spawn" -# self.dist = LightningDistributed() -# # TODO: how to get in nprocs? 
probably pass it -# self.num_processes = num_processes -# self.mp_queue = None -# self.proc_offset = proc_offset + def __init__( + self, + parallel_device_ids, + num_nodes=1, + logger=None, + cluster_environment=None, + is_slurm_managing_tasks=False, + proc_offset=0, + **kwargs: Dict[str, Any] + ): + super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + self.num_nodes = num_nodes + self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.proc_offset = proc_offset + self._ddp_kwargs = kwargs + self.process_idx = None + self.dist = LightningDistributed() + self.num_processes = len(parallel_device_ids) + self.mp_queue = None -# def setup(self, model): -# os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) + @property + def root_device(self): + return torch.device("cuda", self.parallel_device_ids[self.local_rank]) + + def setup(self, model): + self._model = model -# # pass in a state q -# smp = mp.get_context('spawn') -# self.mp_queue = smp.SimpleQueue() + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) -# def set_world_ranks(self): -# self.local_rank = self.process_idx -# # check from where we get node_rank, num_processes and num_nodes -# self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx -# self.world_size = self.num_nodes * self.num_processes + # pass in a state q + smp = mp.get_context('spawn') + self.mp_queue = smp.SimpleQueue() -# def pre_training(self): + def set_world_ranks(self): + self.local_rank = self.process_idx + # check from where we get node_rank, num_processes and num_nodes + self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx + self.world_size = self.num_nodes * self.num_processes -# # TODO: Check if current process can be used as one training proc -# # start from one since current process is proc 0 -# for proc_idx in range(1, self.num_processes): -# # use os.fork, since this enables us to continue from here -# # instead of spawning with separate function -# pid = os.fork() + def pre_training(self): -# # set in child processes (PID=0). All previous child processes -# # should already have their process_idx assigned -# if pid == 0 and self.process_idx is None: -# self.process_idx = proc_idx + self.proc_offset + # TODO: Check if current process can be used as one training proc + # start from one since current process is proc 0 + for proc_idx in range(1, self.num_processes): + # use os.fork, since this enables us to continue from here + # instead of spawning with separate function + pid = os.fork() -# # set process idx for current process -# if pid != 0: -# self.process_idx = 0 + self.proc_offset + # set in child processes (PID=0). 
All previous child processes + # should already have their process_idx assigned + if pid == 0 and self.process_idx is None: + self.process_idx = proc_idx + self.proc_offset -# # TODO: Check where to put that since we don't have access to the pbar here -# # show progressbar only on progress_rank 0 -# # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: -# # self.trainer.progress_bar_callback.disable() + # set process idx for current process + if pid != 0: + self.process_idx = 0 + self.proc_offset -# self.set_world_ranks() + # TODO: Check where to put that since we don't have access to the pbar here + # show progressbar only on progress_rank 0 + # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: + # self.trainer.progress_bar_callback.disable() -# # set warning rank -# rank_zero_only.rank = self.global_rank - -# # TODO: This has to be done somewhere else! -# # self.model.trainer = self.trainer - -# # set up server using proc 0's ip address -# # try to init for 20 times at max in case ports are taken -# # where to store ip_table -# # TODO: CHeck is_slurm_managing_tasks -# self.init_ddp_connection(self.global_rank, self.world_size, self.is_slurm_managing_tasks) - -# # TODO: Move this somewhere else -# # self.trainer.call_setup_hook(self.model) - -# # on world_size=0 let everyone know training is starting -# if self.is_global_zero and not torch.distributed.is_initialized(): -# log.info("-" * 100) -# log.info(f"distributed_backend={self.distributed_backend}") -# log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") -# log.info("-" * 100) - -# self.model = self.configure_sync_batchnorm(self.model) - -# # move the model to the correct device -# self.model_to_device() - -# # TODO: Check where this can be moved -# # set model properties before going into wrapper -# # self.trainer.model_connector.copy_trainer_model_properties(self.model) - -# self.configure_ddp() - -# self.barrier() - -# def post_training(self, results, best_model_path): -# # get original model -# # TODO: How To get this? is this simply self.model? 
-# # model = self.trainer.get_model() -# model = self.model - -# # persist info in ddp_spawn -# self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) - -# # clean up memory -# torch.cuda.empty_cache() - -# if self.process_idx == 0: -# # restore main state with best weights -# best_path = self.mp_queue.get() -# results = self.mp_queue.get() -# last_path = self.mp_queue.get() - -# # recover the weights of the processes trained in the children -# self.__recover_child_process_weights(model, best_path, last_path) - -# def configure_ddp(self): -# # if unset, default `find_unused_parameters` `True` -# self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) -# self.model = LightningDistributedDataParallel( -# self.model, -# device_ids=self.determine_ddp_device_ids(), -# **self._ddp_kwargs, -# ) - -# def determine_ddp_device_ids(self): -# return [self.root_device] - -# def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): - -# if self.global_rank == 0 and self.mp_queue is not None: -# rank_zero_warn('cleaning up ddp environment...') -# # todo, pass complete checkpoint as state dictionary -# self.mp_queue.put(best_model_path) -# self.mp_queue.put(results) - -# # save the last weights -# last_path = None -# # TODO: From where to get self.trainer.testing? -# # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: -# if best_model_path is not None and len(best_model_path) > 0: -# last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) -# atomic_save(self.model.state_dict(), last_path) -# self.mp_queue.put(last_path) - - -# def __recover_child_process_weights(self, model, best_path, last_path): -# # TODO: Where can we set this? -# # transfer back the best path to the trainer -# # if self.trainer.checkpoint_callback: -# # self.trainer.checkpoint_callback.best_model_path = best_path -# # todo, pass also best score - -# # load last weights -# # TODO: How to get self.trainer.testing? -# if last_path is not None: # and not self.trainer.testing: -# ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) -# model.load_state_dict(ckpt) - -# # TODO: Where to set this? -# # Do we really need to set this or can we just make the trainer property forward our current property here? -# # self.trainer.model = model - -# def determine_local_rank(self): -# if self.is_slurm_managing_tasks: -# return int(os.environ['SLURM_LOCALID']) -# else: -# return super().determine_node_rank() - -# def determine_node_rank(self): -# if self.is_slurm_managing_tasks: -# return int(os.environ['SLURM_NODEID']) -# else: -# return super().determine_node_rank() + self.set_world_ranks() + + # set warning rank + rank_zero_only.rank = self.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + # TODO: CHeck is_slurm_managing_tasks + self.init_ddp_connection(self.global_rank, self.world_size) + + # TODO: Move this somewhere else + # self.trainer.call_setup_hook(self.model) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + # TODO: Check where this can be moved + # set model properties before going into wrapper + # self.trainer.model_connector.copy_trainer_model_properties(self.model) + + self.configure_ddp() + + self.barrier() + + def post_training(self, results, best_model_path): + # get original model + # TODO: How To get this? is this simply self.model? + # model = self.trainer.get_model() + model = self.model + + # persist info in ddp_spawn + self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) + + # clean up memory + torch.cuda.empty_cache() + + if self.process_idx == 0: + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() + + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(model, best_path, last_path) + + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self.model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + # TODO: this code is duplicated in DDP and DDPSpawn, make this a function + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def determine_ddp_device_ids(self): + return [self.root_device] + + def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): + + if self.global_rank == 0 and self.mp_queue is not None: + rank_zero_warn('cleaning up ddp environment...') + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) + self.mp_queue.put(results) + + # save the last weights + last_path = None + # TODO: From where to get self.trainer.testing? + # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + if best_model_path is not None and len(best_model_path) > 0: + last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) + atomic_save(self.model.state_dict(), last_path) + self.mp_queue.put(last_path) + + + def __recover_child_process_weights(self, model, best_path, last_path): + # TODO: Where can we set this? + # transfer back the best path to the trainer + # if self.trainer.checkpoint_callback: + # self.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also best score + + # load last weights + # TODO: How to get self.trainer.testing? + if last_path is not None: # and not self.trainer.testing: + ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + # TODO: Where to set this? + # Do we really need to set this or can we just make the trainer property forward our current property here? 
+ # self.trainer.model = model + + def determine_local_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_LOCALID']) + else: + return super().determine_node_rank() + + def determine_node_rank(self): + if self.is_slurm_managing_tasks: + return int(os.environ['SLURM_NODEID']) + else: + return super().determine_node_rank() + + def barrier(self, *args, **kwargs): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def broadcast(self, obj: object, src: int = 0) -> object: + return self.dist.broadcast(obj) + + def model_to_device(self): + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if isinstance(output, torch.Tensor): + output = sync_ddp_if_available(output, group, reduce_op) + return output # STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file From 0d0c3d718975f0fa4ae49bde0b2c5850ab06e28c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 12 Dec 2020 08:15:48 +0100 Subject: [PATCH 039/157] spawn --- .../accelerators/data_parallel.py | 55 ++++++++++++------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 8e2420da82c76..231da55fbfe10 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -478,7 +478,6 @@ def __init__( self.is_slurm_managing_tasks = is_slurm_managing_tasks self.proc_offset = proc_offset self._ddp_kwargs = kwargs - self.process_idx = None self.dist = LightningDistributed() self.num_processes = len(parallel_device_ids) self.mp_queue = None @@ -496,36 +495,50 @@ def setup(self, model): smp = mp.get_context('spawn') self.mp_queue = smp.SimpleQueue() - def set_world_ranks(self): - self.local_rank = self.process_idx + def set_world_ranks(self, process_idx): + self.local_rank = process_idx # check from where we get node_rank, num_processes and num_nodes - self.global_rank = self.determine_node_rank() * self.num_processes + self.process_idx + self.global_rank = self.determine_node_rank() * self.num_processes + process_idx self.world_size = self.num_nodes * self.num_processes def pre_training(self): - - # TODO: Check if current process can be used as one training proc - # start from one since current process is proc 0 - for proc_idx in range(1, self.num_processes): - # use os.fork, since this enables us to continue from here - # instead of spawning with separate function - pid = os.fork() - - # set in child processes (PID=0). 
All previous child processes - # should already have their process_idx assigned - if pid == 0 and self.process_idx is None: - self.process_idx = proc_idx + self.proc_offset - - # set process idx for current process - if pid != 0: - self.process_idx = 0 + self.proc_offset + mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, self.model, self.proc_offset,)) + + print(self.global_rank, "I am still running", os.getpid(), + "i will go into training loop and crash because i didn't enter process group") + + def new_process(self, process_idx, mp_queue, model, proc_offset): + print("i am a new process", os.getpid()) + # TODO: check if needed + # seed = os.environ.get("PL_GLOBAL_SEED") + # if seed is not None: + # seed_everything(int(seed)) + + # # TODO: Check if current process can be used as one training proc + # No because torch.multiprocessing does not support the fork method in combination with cuda + # # start from one since current process is proc 0 + # for proc_idx in range(1, self.num_processes): + # # use os.fork, since this enables us to continue from here + # # instead of spawning with separate function + # pid = os.fork() + # + # # set in child processes (PID=0). All previous child processes + # # should already have their process_idx assigned + # if pid == 0 and self.process_idx is None: + # self.process_idx = proc_idx + self.proc_offset + # + # # set process idx for current process + # if pid != 0: + # self.process_idx = 0 + self.proc_offset # TODO: Check where to put that since we don't have access to the pbar here # show progressbar only on progress_rank 0 # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: # self.trainer.progress_bar_callback.disable() - self.set_world_ranks() + process_idx = process_idx + proc_offset + + self.set_world_ranks(process_idx) # set warning rank rank_zero_only.rank = self.global_rank From 3fb8b4d07ee1696e262b2a9bf8c8d3a6de262475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 03:34:55 +0100 Subject: [PATCH 040/157] finish ddp plugin integration --- pytorch_lightning/accelerators/base_plugin.py | 2 +- .../accelerators/data_parallel.py | 119 ++++++++---------- pytorch_lightning/trainer/properties.py | 4 + pytorch_lightning/trainer/trainer.py | 56 ++++++++- pytorch_lightning/trainer/training_loop.py | 33 ----- 5 files changed, 108 insertions(+), 106 deletions(-) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index 42b3e1f00b932..549d311f7f87d 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -15,7 +15,7 @@ def post_optimizer_step(self, optimizer, optimizer_idx): def pre_training(self): pass - def post_training(self, results, best_model_path): + def post_training(self, best_model_path): pass @contextlib.contextmanager diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 231da55fbfe10..64517273a9ced 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -113,6 +113,14 @@ def model(self, new_model): def lightning_module(self): return self._model + def start_training(self, trainer): + # double dispatch to initiate the training loop + return trainer.train() + + def start_testing(self, trainer): + # double dispatch to initiate the test loop + return trainer.run_test() + class 
SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device, logger=None): @@ -395,6 +403,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int) -> None: torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) def pre_training(self): + # TODO: check if needed seed = os.environ.get("PL_GLOBAL_SEED") if seed is not None: seed_everything(int(seed)) @@ -434,7 +443,7 @@ def pre_training(self): self.barrier() - def post_training(self, results, best_model_path): + def post_training(self, best_model_path): torch.cuda.empty_cache() if "WORLD_SIZE" in os.environ: @@ -486,6 +495,11 @@ def __init__( def root_device(self): return torch.device("cuda", self.parallel_device_ids[self.local_rank]) + @property + def lightning_module(self): + # the model may not be wrapped with DistributedDataParallel if calling this too early + return getattr(self._model, "module", self._model) + def setup(self, model): self._model = model @@ -501,43 +515,19 @@ def set_world_ranks(self, process_idx): self.global_rank = self.determine_node_rank() * self.num_processes + process_idx self.world_size = self.num_nodes * self.num_processes - def pre_training(self): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, self.model, self.proc_offset,)) + def start_training(self, trainer): + mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, trainer, self.model, self.proc_offset,)) - print(self.global_rank, "I am still running", os.getpid(), - "i will go into training loop and crash because i didn't enter process group") + def start_testing(self, trainer): + mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, trainer, self.model, self.proc_offset,)) - def new_process(self, process_idx, mp_queue, model, proc_offset): - print("i am a new process", os.getpid()) + def new_process(self, process_idx, mp_queue, trainer, model, proc_offset): # TODO: check if needed - # seed = os.environ.get("PL_GLOBAL_SEED") - # if seed is not None: - # seed_everything(int(seed)) - - # # TODO: Check if current process can be used as one training proc - # No because torch.multiprocessing does not support the fork method in combination with cuda - # # start from one since current process is proc 0 - # for proc_idx in range(1, self.num_processes): - # # use os.fork, since this enables us to continue from here - # # instead of spawning with separate function - # pid = os.fork() - # - # # set in child processes (PID=0). All previous child processes - # # should already have their process_idx assigned - # if pid == 0 and self.process_idx is None: - # self.process_idx = proc_idx + self.proc_offset - # - # # set process idx for current process - # if pid != 0: - # self.process_idx = 0 + self.proc_offset - - # TODO: Check where to put that since we don't have access to the pbar here - # show progressbar only on progress_rank 0 - # if (self.trainer.node_rank != 0 or self.process_idx != 0) and self.trainer.progress_bar_callback is not None: - # self.trainer.progress_bar_callback.disable() + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) process_idx = process_idx + proc_offset - self.set_world_ranks(process_idx) # set warning rank @@ -559,39 +549,39 @@ def new_process(self, process_idx, mp_queue, model, proc_offset): log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") log.info("-" * 100) + # set the ranks and devices + self.dist.rank = self.global_rank + self.dist.device = self.root_device + self.model = self.configure_sync_batchnorm(self.model) # move the model to the correct device self.model_to_device() - # TODO: Check where this can be moved - # set model properties before going into wrapper - # self.trainer.model_connector.copy_trainer_model_properties(self.model) - self.configure_ddp() self.barrier() - def post_training(self, results, best_model_path): - # get original model - # TODO: How To get this? is this simply self.model? - # model = self.trainer.get_model() - model = self.model + if trainer.testing: + results = trainer.run_test() + else: + results = trainer.train() # persist info in ddp_spawn - self.transfer_distrib_spawn_state_on_fit_end(model, self.mp_queue, results, best_model_path) + self.transfer_distrib_spawn_state_on_fit_end(results) + def post_training(self, best_model_path): # clean up memory torch.cuda.empty_cache() - if self.process_idx == 0: - # restore main state with best weights - best_path = self.mp_queue.get() - results = self.mp_queue.get() - last_path = self.mp_queue.get() + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() - # recover the weights of the processes trained in the children - self.__recover_child_process_weights(model, best_path, last_path) + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(best_path, last_path) + return results def configure_ddp(self): # if unset, default `find_unused_parameters` `True` @@ -616,7 +606,9 @@ def init_ddp_connection(self, global_rank: int, world_size: int) -> None: def determine_ddp_device_ids(self): return [self.root_device] - def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_path=None): + def transfer_distrib_spawn_state_on_fit_end(self, results): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? + best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path if self.global_rank == 0 and self.mp_queue is not None: rank_zero_warn('cleaning up ddp environment...') @@ -626,30 +618,24 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, results, best_model_pat # save the last weights last_path = None - # TODO: From where to get self.trainer.testing? - # if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: - if best_model_path is not None and len(best_model_path) > 0: + # TODO: is there a better way than accessing trainer through model -> trainer? + if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) - atomic_save(self.model.state_dict(), last_path) + atomic_save(self.lightning_module.state_dict(), last_path) self.mp_queue.put(last_path) - - def __recover_child_process_weights(self, model, best_path, last_path): - # TODO: Where can we set this? + def __recover_child_process_weights(self, best_path, last_path): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
# transfer back the best path to the trainer - # if self.trainer.checkpoint_callback: - # self.trainer.checkpoint_callback.best_model_path = best_path + if self.lightning_module.trainer.checkpoint_callback: + self.lightning_module.trainer.checkpoint_callback.best_model_path = best_path # todo, pass also best score # load last weights # TODO: How to get self.trainer.testing? if last_path is not None: # and not self.trainer.testing: ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) - - # TODO: Where to set this? - # Do we really need to set this or can we just make the trainer property forward our current property here? - # self.trainer.model = model + self.lightning_module.load_state_dict(ckpt) def determine_local_rank(self): if self.is_slurm_managing_tasks: @@ -679,4 +665,5 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ output = sync_ddp_if_available(output, group, reduce_op) return output -# STILL MISSING: DDP2 (?), HOROVOD DDP AND HPC DDP \ No newline at end of file + +# TODO: DDP2 (?), HOROVOD DDP AND HPC DDP diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index cb613dc087691..02844cb1375bd 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -338,6 +338,10 @@ def save_checkpoint(self, filepath, weights_only: bool = False): def get_model(self): # TODO: rename this to lightning_module (see training type plugin) # backward compatible + return self.lightning_module + + @property + def lightning_module(self): return self.training_type_plugin.lightning_module def __getstate__(self): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bf07c17727d59..5fd80fadfe751 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -532,8 +532,6 @@ def fit( # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) - - # TODO: is calling pre-training the correct place here @justus? self.training_type_plugin.pre_training() # ---------------------------- @@ -550,15 +548,16 @@ def fit( # TRAIN # ---------------------------- # hook + self.call_hook("on_fit_start") + # double dispatch: let the plugin initiate the training/test loop. if self.testing: - results = self.run_test() + self.training_type_plugin.start_testing(self) else: - results = self.train() + self.training_type_plugin.start_training(self) - # TODO: is calling post training the correct place here @justus? 
- self.training_type_plugin.post_training(results, self.checkpoint_callback.best_model_path) + results = self.training_type_plugin.post_training(self.checkpoint_callback.best_model_path) self.accelerator_backend.teardown() # ---------------------------- @@ -579,7 +578,49 @@ def fit( self._state = TrainerState.FINISHED return results or 1 + def pre_training_routine(self): + # wait for all to join if on distributed + self.accelerator.training_type_plugin.barrier("setup_training") + + # register auto-resubmit when on SLURM + self.slurm_connector.register_slurm_signal_handlers() + + # -------------------------- + # Pre-train + # -------------------------- + # on pretrain routine start + ref_model = self.get_model() + + self.on_pretrain_routine_start(ref_model) + if self.is_function_implemented("on_pretrain_routine_start"): + ref_model.on_pretrain_routine_start() + + # print model summary + if self.is_global_zero and self.weights_summary is not None and not self.testing: + if self.weights_summary in ModelSummary.MODES: + ref_model.summarize(mode=self.weights_summary) + else: + raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES)) + + # TODO: what the heck is this + # track model now. + # if cluster resets state, the model will update with the saved weights + # self.trainer.model = model + + # restore training and model before hpc is called + self.checkpoint_connector.restore_weights(ref_model) + + # on pretrain routine end + self.on_pretrain_routine_end(ref_model) + if self.is_function_implemented("on_pretrain_routine_end"): + ref_model.on_pretrain_routine_end() + def train(self): + self.pre_training_routine() + + if not self.is_global_zero and self.progress_bar_callback is not None: + self.progress_bar_callback.disable() + self.run_sanity_check(self.get_model()) # set stage for logging @@ -748,6 +789,9 @@ def track_output_for_epoch_end(self, outputs, output): return outputs def run_test(self): + if not self.is_global_zero and self.progress_bar_callback is not None: + self.progress_bar_callback.disable() + # only load test dataloader for testing # self.reset_test_dataloader(ref_model) with self.profiler.profile("run_test_evaluation"): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 3b9e704f840b8..066b0818bde21 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -151,39 +151,6 @@ def setup_training(self, model: LightningModule): self.trainer.logger.log_graph(ref_model) self.trainer.logger.save() - # wait for all to join if on distributed - self.trainer.accelerator.training_type_plugin.barrier("setup_training") - - # register auto-resubmit when on SLURM - self.trainer.slurm_connector.register_slurm_signal_handlers() - - if not self.trainer.is_global_zero and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # -------------------------- - # Pre-train - # -------------------------- - # on pretrain routine start - self.trainer.on_pretrain_routine_start(ref_model) - if self.trainer.is_function_implemented("on_pretrain_routine_start"): - ref_model.on_pretrain_routine_start() - - # print model summary - if self.trainer.is_global_zero and not self.trainer.testing: - ref_model.summarize(mode=self.trainer.weights_summary) - - # track model now. 
- # if cluster resets state, the model will update with the saved weights - self.trainer.model = model - - # restore training state and model weights before hpc is called - self.trainer.checkpoint_connector.restore_weights(model) - - # on pretrain routine end - self.trainer.on_pretrain_routine_end(ref_model) - if self.trainer.is_function_implemented("on_pretrain_routine_end"): - ref_model.on_pretrain_routine_end() - def on_train_end(self): if self._teardown_already_run: return From 0f5298ee6624830d7f38840a0b111d21ce55e563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 19:50:28 +0100 Subject: [PATCH 041/157] remove logger from plugins --- .../accelerators/accelerator_connector.py | 1 - .../accelerators/data_parallel.py | 19 +++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 6c23caede81a9..40800db4c1c8c 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -192,7 +192,6 @@ def select_training_type_plugin(self): plugin = DDPSpawnPlugin( parallel_device_ids=self.parallel_devices, num_nodes=self.num_nodes, - logger=None, cluster_environment=TorchElasticEnvironment(), is_slurm_managing_tasks=False, # TODO: determine this ) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 64517273a9ced..529bfc69648e1 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -41,10 +41,9 @@ class ReduceOp: class TrainingTypePlugin(Plugin, ABC): - def __init__(self, logger=None): + def __init__(self): self._model = None self.global_rank = 0 - self.logger = logger @property @abstractmethod @@ -123,8 +122,8 @@ def start_testing(self, trainer): class SingleDevicePlugin(TrainingTypePlugin): - def __init__(self, device, logger=None): - super().__init__(logger=logger) + def __init__(self, device): + super().__init__() self.device: torch.device = device @property @@ -161,8 +160,8 @@ def broadcast(self, obj: object, src: int = 0) -> object: class ParallelPlugin(TrainingTypePlugin, ABC): - def __init__(self, parallel_device_ids, logger=None, cluster_environment=None): - super().__init__(logger=logger) + def __init__(self, parallel_device_ids, cluster_environment=None): + super().__init__() self.parallel_device_ids = parallel_device_ids self.local_rank = 0 self.world_size = 1 @@ -252,11 +251,12 @@ def __init__( is_slurm_managing_tasks=False, **kwargs: Dict[str, Any], ) -> None: - super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] - self.dist = LightningDistributed() self.num_nodes = num_nodes + self.logger = logger self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.dist = LightningDistributed() self._ddp_kwargs = kwargs self._has_spawned_children = False self.task_idx = None @@ -476,13 +476,12 @@ def __init__( self, parallel_device_ids, num_nodes=1, - logger=None, cluster_environment=None, is_slurm_managing_tasks=False, proc_offset=0, **kwargs: Dict[str, Any] ): - super().__init__(parallel_device_ids=parallel_device_ids, logger=logger, cluster_environment=cluster_environment) + 
super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks self.proc_offset = proc_offset From 434e30ebad3debd2e1fb5c195de1afe743a24f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 19:53:43 +0100 Subject: [PATCH 042/157] setup --- pytorch_lightning/trainer/trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5fd80fadfe751..6993d25cb1d94 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -532,6 +532,7 @@ def fit( # self.accelerator_backend = self.accelerator_connector.select_accelerator() self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) + self.train_loop.setup_training(model) self.training_type_plugin.pre_training() # ---------------------------- @@ -542,8 +543,6 @@ def fit( # self.accelerator_backend.validation_loop = self.run_evaluation # self.accelerator_backend.test_loop = self.run_evaluation - self.train_loop.setup_training(model) - # ---------------------------- # TRAIN # ---------------------------- From 3fb31c8bfe2ce9a282ff9835f3266c9e4f260b58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 19:57:33 +0100 Subject: [PATCH 043/157] remove logger arg --- pytorch_lightning/accelerators/accelerator_connector.py | 1 - pytorch_lightning/accelerators/data_parallel.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 40800db4c1c8c..4683a8b2a5917 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -184,7 +184,6 @@ def select_training_type_plugin(self): plugin = DDPPlugin( parallel_device_ids=self.parallel_devices, num_nodes=self.num_nodes, - logger=None, cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? 
is_slurm_managing_tasks=False, # TODO: determine this ) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 529bfc69648e1..6875224f62d0a 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -246,7 +246,6 @@ def __init__( self, parallel_device_ids, num_nodes=1, - logger=None, cluster_environment=None, is_slurm_managing_tasks=False, **kwargs: Dict[str, Any], @@ -254,7 +253,6 @@ def __init__( super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] self.num_nodes = num_nodes - self.logger = logger self.is_slurm_managing_tasks = is_slurm_managing_tasks self.dist = LightningDistributed() self._ddp_kwargs = kwargs @@ -334,8 +332,10 @@ def _call_children_scripts(self): os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - if self.logger is not None: - os.environ["PL_EXP_VERSION"] = str(self.logger.version) + print("logger", self.lightning_module.logger) + if self.lightning_module.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) + print("exp", os.environ["PL_EXP_VERSION"]) num_gpus = len(self.parallel_device_ids) # TODO: Add num_nodes (pass it in?) From e7a7a87b321eb6602240a40af2a449ef975b3a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 20:00:34 +0100 Subject: [PATCH 044/157] module --- pytorch_lightning/accelerators/data_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 6875224f62d0a..2d05091fe3518 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -288,7 +288,8 @@ def setup(self, model): @property def lightning_module(self): - return self._model.module + # the model may not be wrapped with DistributedDataParallel if calling this too early + return getattr(self._model, "module", self._model) def _call_children_scripts(self): From 1e8aa44ee3b9917b5a7670cfbd63b7611d9a5fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 13 Dec 2020 20:01:21 +0100 Subject: [PATCH 045/157] clean up --- pytorch_lightning/accelerators/data_parallel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 2d05091fe3518..7b34acc4b764d 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -333,10 +333,8 @@ def _call_children_scripts(self): os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - print("logger", self.lightning_module.logger) if self.lightning_module.logger is not None: os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) - print("exp", os.environ["PL_EXP_VERSION"]) num_gpus = len(self.parallel_device_ids) # TODO: Add num_nodes (pass it in?) 
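For context between these commits, a minimal usage sketch of the spawn-based training type plugin selected above, assuming the public Trainer API (gpus, distributed_backend) is left unchanged by this refactor; TinyModel and the toy dataloader below are placeholders invented for illustration, and two visible CUDA devices are assumed.

import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class TinyModel(pl.LightningModule):
    # placeholder LightningModule, only here to exercise the plugin selection
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == "__main__":  # guard required by the spawn start method
    dataset = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))
    train_loader = DataLoader(dataset, batch_size=16)

    # "ddp_spawn" is expected to route through select_training_type_plugin()
    # to the DDPSpawnPlugin added above; gpus=2 assumes two CUDA devices.
    trainer = pl.Trainer(max_epochs=1, gpus=2, distributed_backend="ddp_spawn")
    trainer.fit(TinyModel(), train_loader)

With distributed_backend="ddp" the same selection path instead picks DDPPlugin, which launches the additional ranks as child scripts via _call_children_scripts() as shown in the hunk above.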
From 628fdc3ab447a97a073ca94f3736019bc3393dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 14 Dec 2020 03:44:10 +0100 Subject: [PATCH 046/157] ddp_cpu integration --- .../accelerators/accelerator_connector.py | 33 +++++++---- .../accelerators/data_parallel.py | 55 +++++++++++-------- pytorch_lightning/trainer/properties.py | 2 +- 3 files changed, 54 insertions(+), 36 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 4683a8b2a5917..f1ebbd5950b6c 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,8 @@ import torch from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ + DataParallelPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, device_parser from pytorch_lightning.utilities import rank_zero_only @@ -94,8 +95,8 @@ def __init__( # for gpus allow int, string and gpu list # if auto_select_gpus and isinstance(gpus, int): # self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus) - self.parallel_devices = device_parser.parse_gpu_ids(self.gpus) - self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_devices) + self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus) + self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids) # self.root_device = torch.device("cpu") self.set_distributed_mode() @@ -139,15 +140,25 @@ def tpu_id(self): @property def on_gpu(self): - return self.parallel_devices and torch.cuda.is_available() + return self.parallel_device_ids and torch.cuda.is_available() @property def num_gpus(self) -> int: - gpus = self.parallel_devices + gpus = self.parallel_device_ids if gpus is None: return 0 return len(gpus) + @property + def parallel_devices(self): + if self.on_gpu: + devices = [torch.device("cuda", i) for i in self.parallel_device_ids] + elif self.on_tpu: + raise NotImplementedError + else: + devices = [torch.device("cpu")] * self.num_processes + return devices + def select_precision_plugin(self): if self.precision == 32: self.amp_type = None @@ -180,16 +191,18 @@ def select_precision_plugin(self): raise NotImplementedError('We only support precisions 32 and 16!') def select_training_type_plugin(self): - if self.distributed_backend == "ddp": + if self.use_dp and self.distributed_backend == "dp": + plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) + elif self.use_ddp and self.distributed_backend == "ddp": plugin = DDPPlugin( - parallel_device_ids=self.parallel_devices, + parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? 
is_slurm_managing_tasks=False, # TODO: determine this ) - elif self.use_ddp and self.distributed_backend == "ddp_spawn": + elif self.use_ddp and self.distributed_backend in ("ddp_spawn", "ddp_spawn_cpu", "ddp_cpu"): plugin = DDPSpawnPlugin( - parallel_device_ids=self.parallel_devices, + parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, cluster_environment=TorchElasticEnvironment(), is_slurm_managing_tasks=False, # TODO: determine this @@ -279,8 +292,6 @@ def set_distributed_mode(self): "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." ) self.use_ddp = True - self.data_parallel_device_ids = None - self.on_gpu = False # HOROVOD elif self.distributed_backend == "horovod": diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 7b34acc4b764d..8e55596f5952b 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -86,7 +86,6 @@ def determine_local_rank(self): return int(os.environ.get('LOCAL_RANK', 0)) def determine_node_rank(self): - # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK. # otherwise use given node rank or default to node rank 0 env_vars = ['NODE_RANK', 'GROUP_RANK'] @@ -160,9 +159,9 @@ def broadcast(self, obj: object, src: int = 0) -> object: class ParallelPlugin(TrainingTypePlugin, ABC): - def __init__(self, parallel_device_ids, cluster_environment=None): + def __init__(self, parallel_devices: List[torch.device], cluster_environment=None): super().__init__() - self.parallel_device_ids = parallel_device_ids + self.parallel_devices = parallel_devices self.local_rank = 0 self.world_size = 1 self.cluster_environment = cluster_environment @@ -178,7 +177,7 @@ def root_device(self): @property def on_gpu(self): - return self.parallel_device_ids and torch.cuda.is_available() + return self.root_device.type == "cuda" and torch.cuda.is_available() @abstractmethod def setup(self, model): @@ -211,8 +210,9 @@ def configure_sync_batchnorm(model: LightningModule) -> LightningModule: class DataParallelPlugin(ParallelPlugin): + def setup(self, model): - self._model = LightningDataParallel(model, self.parallel_device_ids) + self._model = LightningDataParallel(model, self.parallel_devices) def reduce(self, output): if isinstance(output, Result): @@ -225,12 +225,16 @@ def reduce(self, output): @property def root_device(self): - return torch.device("cuda", self.parallel_device_ids[0]) + return self.parallel_devices[0] @property def lightning_module(self): return self._model.module + def model_to_device(self): + # no need to do anything when model is wrapped in torch.nn.DataParallel + pass + def barrier(self, *args, **kwargs): pass @@ -244,13 +248,13 @@ class DDPPlugin(ParallelPlugin): def __init__( self, - parallel_device_ids, + parallel_devices, num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, **kwargs: Dict[str, Any], ) -> None: - super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) + super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks @@ -258,11 +262,11 @@ def __init__( self._ddp_kwargs = kwargs self._has_spawned_children = False self.task_idx = None - self.num_processes = len(parallel_device_ids) + self.num_processes = len(parallel_devices) @property def 
root_device(self): - return torch.device("cuda", self.parallel_device_ids[self.local_rank]) + return self.parallel_devices[self.local_rank] def determine_local_rank(self): if self.is_slurm_managing_tasks: @@ -327,22 +331,20 @@ def _call_children_scripts(self): # when the trainer script was called the device has already been scoped by the time # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone # but forward the GPUs selected via environment variables - if self.parallel_device_ids is None: + if self.parallel_devices is None: raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") - os.environ["PL_TRAINER_GPUS"] = ",".join([str(i) for i in self.parallel_device_ids]) + os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices]) os.environ["PL_IN_DDP_SUBPROCESS"] = "1" if self.lightning_module.logger is not None: os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) - num_gpus = len(self.parallel_device_ids) - # TODO: Add num_nodes (pass it in?) + num_gpus = len(self.parallel_devices) os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" self.interactive_ddp_procs = [] - # TODO: Add num_processes (pass it in?) for local_rank in range(1, self.num_processes): env_copy = os.environ.copy() env_copy["LOCAL_RANK"] = f"{local_rank}" @@ -388,6 +390,8 @@ def configure_ddp(self): ) def determine_ddp_device_ids(self): + if self.root_device.type == "cpu": + return None return [self.root_device.index] def init_ddp_connection(self, global_rank: int, world_size: int) -> None: @@ -456,9 +460,8 @@ def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) def model_to_device(self): - # TODO: Can we easily make this a property that falls back here? 
- # self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank] - torch.cuda.set_device(self.root_device) + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) self.model.to(self.root_device) def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): @@ -473,25 +476,25 @@ class DDPSpawnPlugin(ParallelPlugin): def __init__( self, - parallel_device_ids, + parallel_devices, num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, proc_offset=0, **kwargs: Dict[str, Any] ): - super().__init__(parallel_device_ids=parallel_device_ids, cluster_environment=cluster_environment) + super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks self.proc_offset = proc_offset self._ddp_kwargs = kwargs self.dist = LightningDistributed() - self.num_processes = len(parallel_device_ids) + self.num_processes = len(parallel_devices) self.mp_queue = None @property def root_device(self): - return torch.device("cuda", self.parallel_device_ids[self.local_rank]) + return self.parallel_devices[self.local_rank] @property def lightning_module(self): @@ -570,6 +573,7 @@ def new_process(self, process_idx, mp_queue, trainer, model, proc_offset): def post_training(self, best_model_path): # clean up memory + # TODO: move this to gpu accelerator torch.cuda.empty_cache() # restore main state with best weights @@ -602,7 +606,9 @@ def init_ddp_connection(self, global_rank: int, world_size: int) -> None: torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) def determine_ddp_device_ids(self): - return [self.root_device] + if self.root_device.type == "cpu": + return None + return [self.root_device.index] def transfer_distrib_spawn_state_on_fit_end(self, results): # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
@@ -655,7 +661,8 @@ def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) def model_to_device(self): - torch.cuda.set_device(self.root_device) + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) self.model.to(self.root_device) def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 02844cb1375bd..86d146783e2f3 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -135,7 +135,7 @@ def use_tpu(self): @property def num_nodes(self): - return self.accelerator_connector.num_gpus + return self.accelerator_connector.num_nodes @property def num_processes(self): From 9f369cc03d9e6a295b8b5382d9f5e7232c8b1e2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 14 Dec 2020 07:21:00 +0100 Subject: [PATCH 047/157] cuda context manager for emptying cache --- pytorch_lightning/accelerators/data_parallel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 8e55596f5952b..d76a7f291aa94 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -447,7 +447,8 @@ def pre_training(self): self.barrier() def post_training(self, best_model_path): - torch.cuda.empty_cache() + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() if "WORLD_SIZE" in os.environ: del os.environ["WORLD_SIZE"] @@ -573,8 +574,8 @@ def new_process(self, process_idx, mp_queue, trainer, model, proc_offset): def post_training(self, best_model_path): # clean up memory - # TODO: move this to gpu accelerator - torch.cuda.empty_cache() + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() # restore main state with best weights best_path = self.mp_queue.get() From a8e830609837a70e4d092f7cd626cbbf01eed8ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 14 Dec 2020 07:23:48 +0100 Subject: [PATCH 048/157] args --- pytorch_lightning/accelerators/data_parallel.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index d76a7f291aa94..8d6e23eac0879 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -481,13 +481,11 @@ def __init__( num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, - proc_offset=0, **kwargs: Dict[str, Any] ): super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks - self.proc_offset = proc_offset self._ddp_kwargs = kwargs self.dist = LightningDistributed() self.num_processes = len(parallel_devices) @@ -518,18 +516,17 @@ def set_world_ranks(self, process_idx): self.world_size = self.num_nodes * self.num_processes def start_training(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, trainer, self.model, self.proc_offset,)) + mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) def start_testing(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(self.mp_queue, trainer, self.model, self.proc_offset,)) + 
mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer, )) - def new_process(self, process_idx, mp_queue, trainer, model, proc_offset): + def new_process(self, process_idx, trainer): # TODO: check if needed seed = os.environ.get("PL_GLOBAL_SEED") if seed is not None: seed_everything(int(seed)) - process_idx = process_idx + proc_offset self.set_world_ranks(process_idx) # set warning rank From 71cbd334fc4db672c770aef811cbd8c088cbbe1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 14 Dec 2020 07:33:43 +0100 Subject: [PATCH 049/157] move "log_gpu_memory" to logger connector --- .../accelerators/accelerator_connector.py | 2 -- .../logger_connector/logger_connector.py | 8 +++++--- pytorch_lightning/trainer/trainer.py | 19 +------------------ 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index f1ebbd5950b6c..75ecf398c1ec7 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -53,7 +53,6 @@ def __init__( auto_select_gpus, gpus, num_nodes, - log_gpu_memory, sync_batchnorm, benchmark, replace_sampler_ddp, @@ -76,7 +75,6 @@ def __init__( self.auto_select_gpus = auto_select_gpus self.gpus = gpus self.num_nodes = num_nodes - self.log_gpu_memory = log_gpu_memory self.sync_batchnorm = sync_batchnorm self.benchmark = benchmark self.replace_sampler_ddp = replace_sampler_ddp diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 8e992f8f12034..887ed2f30979b 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -30,8 +30,10 @@ class LoggerConnector: - def __init__(self, trainer): + + def __init__(self, trainer, log_gpu_memory): self.trainer = trainer + self.log_gpu_memory = log_gpu_memory self._callback_metrics = MetricsHolder() self._evaluation_callback_metrics = MetricsHolder(to_float=True) self._logged_metrics = MetricsHolder() @@ -219,8 +221,8 @@ def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics= and global_step for the rest. 
""" # add gpu memory - if self.trainer._device_type == DeviceType.GPU and self.trainer.log_gpu_memory: - mem_map = memory.get_memory_profile(self.trainer.log_gpu_memory) + if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory: + mem_map = memory.get_memory_profile(self.log_gpu_memory) metrics.update(mem_map) # add norms diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6993d25cb1d94..27ce210fd4630 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -328,7 +328,6 @@ def __init__( auto_select_gpus, gpus, num_nodes, - log_gpu_memory, sync_batchnorm, benchmark, replace_sampler_ddp, @@ -337,7 +336,7 @@ def __init__( amp_backend, amp_level, ) - self.logger_connector = LoggerConnector(self) + self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) # self.precision_connector = PrecisionConnector(self) self.callback_connector = CallbackConnector(self) @@ -383,22 +382,6 @@ def __init__( gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan ) - # init accelerator related flags - # self.accelerator_connector.on_trainer_init( - # num_processes, - # tpu_cores, - # accelerator, - # distributed_backend, - # auto_select_gpus, - # gpus, - # num_nodes, - # log_gpu_memory, - # sync_batchnorm, - # benchmark, - # replace_sampler_ddp, - # deterministic, - # ) - # init train loop related flags # TODO: remove in 1.3.0 if automatic_optimization is None: From 1a9ad4fa173b5c07275cac7bc90947690f242510 Mon Sep 17 00:00:00 2001 From: justusschock Date: Mon, 14 Dec 2020 16:14:19 +0100 Subject: [PATCH 050/157] fix imports --- pytorch_lightning/accelerators/accelerator.py | 2 +- pytorch_lightning/accelerators/accelerator_connector.py | 2 +- pytorch_lightning/trainer/trainer.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 722328dd66325..c6d6221fc11cc 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,7 +1,7 @@ from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities import AMPType, NATIVE_AMP_AVALAIBLE +from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE, AMPType from typing import Any, Union import math diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 75ecf398c1ec7..6aad549d4cdfb 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -21,7 +21,7 @@ from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin -from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE, device_parser +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from 
pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 27ce210fd4630..e15132a5849cb 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -24,8 +24,8 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector +from pytorch_lightning.core.memory import ModelSummary +from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.callbacks import Callback from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.callbacks import Callback, ModelCheckpoint From 7b874cc249f7eb2d421d6785e4aac3389b842bbb Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:47:43 +0100 Subject: [PATCH 051/157] typo --- pytorch_lightning/accelerators/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 6aad549d4cdfb..8abc5db36340b 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -164,7 +164,7 @@ def select_precision_plugin(self): elif self.precision == 16: if self.amp_type == 'native': - if not NATIVE_AMP_AVALAIBLE: + if not NATIVE_AMP_AVAILABLE: rank_zero_warn('You have asked for native AMP but your PyTorch version does not support it.' ' Consider upgrading with `pip install torch>=1.6`.' ' We will attempt to use NVIDIA Apex for this session.') From bc2460aee8395546bb63cb041f4609887e589266 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:48:02 +0100 Subject: [PATCH 052/157] remove todo --- pytorch_lightning/accelerators/accelerator.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index c6d6221fc11cc..3f24d6b01c71d 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -110,7 +110,6 @@ def backward(self, closure_loss, optimizer, opt_idx, should_accumulate, *args, * ) def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): - # TODO: Check out if this can be simplified with new LightningOptimizer! model_ref = self.lightning_module is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) @@ -232,11 +231,30 @@ def precision(self): @property def scaler(self): - if hasattr(self.precision_plugin, 'scaler'): + if hasattr(self.precision_plugin, "scaler"): return self.precision_plugin.scaler return None + @property + def rpc_enabled(self): + return self.training_type_plugin.rpc_enabled + + # TODO: Check where this comes from and why it is needed + def optimizer_state(self, optimizer: Optimizer) -> dict: + """ + Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom + plugins. 
+ Return: + Optimizer state dict + """ + if self.training_type_plugin and hasattr(self.training_type_plugin, "optimizer_state"): + return self.training_type_plugin.optimizer_state(optimizer) + return optimizer.state_dict() + + def on_save(self, checkpoint): + return checkpoint + class NewCPUAccelerator(NewAccelerator): def setup(self, trainer, model): From 506c44632540ade383aa0d2e11b4036d023958a9 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:48:19 +0100 Subject: [PATCH 053/157] add rpc_enabled flag --- pytorch_lightning/accelerators/data_parallel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 8d6e23eac0879..331968ca9ee66 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -119,6 +119,10 @@ def start_testing(self, trainer): # double dispatch to initiate the test loop return trainer.run_test() + @property + def rpc_enabled(self): + return False + class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device): From 19d19d575852aafdd90ab9f00af433269549534c Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:48:30 +0100 Subject: [PATCH 054/157] remove unused self arg --- pytorch_lightning/accelerators/scheduler_properties.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/scheduler_properties.py b/pytorch_lightning/accelerators/scheduler_properties.py index 6835df4499385..37dbdd13c3c58 100644 --- a/pytorch_lightning/accelerators/scheduler_properties.py +++ b/pytorch_lightning/accelerators/scheduler_properties.py @@ -1,7 +1,7 @@ from torch import optim -def reinit_scheduler_properties(self, optimizers: list, schedulers: list): +def reinit_scheduler_properties(optimizers: list, schedulers: list): # Reinitialize optimizer.step properties added by schedulers for scheduler in schedulers: scheduler = scheduler['scheduler'] From dd4d148b42464e076c11ece42fea01beac0f5dde Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:49:02 +0100 Subject: [PATCH 055/157] comment out unnexessary amp part --- pytorch_lightning/core/optimizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index acba35d9ae0ac..03559065725fe 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -129,8 +129,9 @@ def __optimizer_step(self, *args, closure: Optional[Callable] = None, profiler_n with trainer.profiler.profile(profiler_name): xm.optimizer_step(optimizer, optimizer_args={'closure': closure, **kwargs}) - elif trainer.amp_backend is not None: - trainer.precision_connector.backend.optimizer_step(trainer, optimizer, closure) + # elif trainer.amp_backend is not None: + # # TODO: Adapt for new optimizer structure + # trainer.precision_connector.backend.optimizer_step(trainer, optimizer, closure) else: with trainer.profiler.profile(profiler_name): From f2fffc69cd0dcddf2e28c2ad97bb606bdc8d47f7 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:49:23 +0100 Subject: [PATCH 056/157] fix model connector --- pytorch_lightning/trainer/connectors/model_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/model_connector.py b/pytorch_lightning/trainer/connectors/model_connector.py index a4bf9a6e505e6..563b664fffbc4 100644 --- 
a/pytorch_lightning/trainer/connectors/model_connector.py +++ b/pytorch_lightning/trainer/connectors/model_connector.py @@ -44,5 +44,5 @@ def get_model(self): def _get_reference_model(self, model): if self.trainer.accelerator_backend: - return self.trainer.accelerator_backend.get_reference_model(model) + return self.trainer.accelerator_backend.lightning_module return model From c6b3aeb8b17e304f36ee956e5fcc32ae23e97083 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:49:39 +0100 Subject: [PATCH 057/157] fix import --- pytorch_lightning/trainer/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e15132a5849cb..60e5a93b97d4e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,6 +15,7 @@ """Trainer to automate the training.""" import os +from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.accelerators.precision import PrecisionPlugin import warnings from pathlib import Path From 55fc9527ff2bad6f9419f6c9da0a7b28dfbc376f Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 14 Dec 2020 16:49:56 +0100 Subject: [PATCH 058/157] copy properties only once --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 066b0818bde21..bc42de5aed110 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -105,8 +105,8 @@ def on_train_start(self): self.trainer.call_hook("on_train_start") def setup_fit(self, model, train_dataloader, val_dataloaders, datamodule): - # bind logger and other properties - self.trainer.model_connector.copy_trainer_model_properties(model) + # # bind logger and other properties + # self.trainer.model_connector.copy_trainer_model_properties(model) # clean hparams if hasattr(model, "hparams"): From 177a634c8245926b471ddfb0df279d05d7a83a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 03:16:54 +0100 Subject: [PATCH 059/157] add cluster env --- .../accelerators/accelerator_connector.py | 40 ++++++++++++------- .../trainer/connectors/slurm_connector.py | 4 +- pytorch_lightning/trainer/trainer.py | 3 +- tests/backends/test_accelerator_connector.py | 9 +++-- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 8abc5db36340b..21e8a61e333ac 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -13,7 +13,6 @@ # limitations under the License. 
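The cluster-environment wiring that PATCH 059 threads into the accelerator connector assumes an environment object that can resolve rendezvous details (master address, master port, world size) from its scheduler's variables. A minimal sketch of such an object, not part of the patch, with illustrative method names, loosely modelled on the SLURM logic that a later patch in this series removes from SLURMConnector:

import os


class SlurmLikeEnvironment:
    # Illustrative only: resolves rendezvous info from SLURM environment variables.

    def master_address(self) -> str:
        # SLURM exposes the allocated node list; the first node acts as the master.
        node_list = os.environ.get("SLURM_NODELIST", "127.0.0.1")
        return node_list.split(" ")[0]

    def master_port(self) -> int:
        job_id = os.environ.get("SLURM_JOB_ID")
        if job_id is not None:
            # use the last 4 digits of the job id, shifted into the 10k+ port range,
            # so concurrent jobs from the same grid search do not clash
            return 15000 + int(job_id[-4:])
        return 12910

    def world_size(self) -> int:
        return int(os.environ.get("SLURM_NTASKS", "1"))
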
from typing import Union -from pytorch_lightning import accelerators import os import torch @@ -59,7 +58,8 @@ def __init__( deterministic, precision, amp_type, - amp_level + amp_level, + is_slurm_managing_tasks, ): # initialization @@ -82,6 +82,7 @@ def __init__( self.precision = precision self.amp_type = None if amp_type is None else amp_type.lower() self.amp_level = amp_level + self.is_slurm_managing_tasks = is_slurm_managing_tasks # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks @@ -110,12 +111,6 @@ def __init__( # init flags for SLURM+DDP to work self.world_size = 1 self.interactive_ddp_procs = [] - - # link up SLURM - # TODO: this should be taken out of here... but depends too much on DDP - # self.slurm_connector.on_trainer_init(self.num_nodes) - # self.node_rank = self.determine_ddp_node_rank() - # self.local_rank = self.determine_local_rank() self.global_rank = 0 # NVIDIA setup @@ -182,28 +177,26 @@ def select_precision_plugin(self): log.info('Using APEX 16bit precision.') self.amp_type = AMPType.APEX return ApexMixedPrecisionPlugin(self.amp_level) - - - else: raise NotImplementedError('We only support precisions 32 and 16!') def select_training_type_plugin(self): + cluster_environment = self.select_cluster_environment() if self.use_dp and self.distributed_backend == "dp": plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) elif self.use_ddp and self.distributed_backend == "ddp": plugin = DDPPlugin( parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, - cluster_environment=TorchElasticEnvironment(), # TODO: deterimine this using plugin connector? - is_slurm_managing_tasks=False, # TODO: determine this + cluster_environment=cluster_environment, + is_slurm_managing_tasks=self.is_slurm_managing_tasks, ) elif self.use_ddp and self.distributed_backend in ("ddp_spawn", "ddp_spawn_cpu", "ddp_cpu"): plugin = DDPSpawnPlugin( parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, - cluster_environment=TorchElasticEnvironment(), - is_slurm_managing_tasks=False, # TODO: determine this + cluster_environment=cluster_environment, + is_slurm_managing_tasks=self.is_slurm_managing_tasks, ) else: # TODO: cover all other cases @@ -225,6 +218,23 @@ def select_accelerator(self): training_type_plugin=self.select_training_type_plugin(), ) + def select_cluster_environment(self): + # TODO: support the cloud environment set by the plugin connector! 
+ # if self.trainer.plugin_connector.cloud_environment: + # env = self.trainer.plugin_connector.cloud_environment + # elif self.is_slurm_managing_tasks: + if self.is_slurm_managing_tasks: + env = SLURMEnvironment() + elif self._is_using_torchelastic(): + env = TorchElasticEnvironment() + else: + env = TorchElasticEnvironment() + return env + + def _is_using_torchelastic(self): + te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ) + return te_flags_passed + def set_distributed_mode(self): # No distributed backend diff --git a/pytorch_lightning/trainer/connectors/slurm_connector.py b/pytorch_lightning/trainer/connectors/slurm_connector.py index ad860c0b154b2..212e126e4bac3 100644 --- a/pytorch_lightning/trainer/connectors/slurm_connector.py +++ b/pytorch_lightning/trainer/connectors/slurm_connector.py @@ -13,10 +13,8 @@ class SLURMConnector: - def __init__(self, trainer): + def __init__(self, trainer, num_gpu_nodes): self.trainer = trainer - - def on_trainer_init(self, num_gpu_nodes): self.configure_slurm_ddp(num_gpu_nodes) def configure_slurm_ddp(self, num_gpu_nodes): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 60e5a93b97d4e..14eb8e81d95ea 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -322,6 +322,7 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) + self.slurm_connector = SLURMConnector(self, num_nodes) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -336,6 +337,7 @@ def __init__( precision, amp_backend, amp_level, + self.is_slurm_managing_tasks, # set by slurm connector ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) @@ -345,7 +347,6 @@ def __init__( self.training_tricks_connector = TrainingTricksConnector(self) self.profile_connector = ProfilerConnector(self) self.checkpoint_connector = CheckpointConnector(self) - self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index f13830f68d8d6..1dddd48ea0d25 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -17,8 +17,10 @@ import pytest -from pytorch_lightning import accelerators, Trainer -from pytorch_lightning.accelerators import Accelerator +from pytorch_lightning import Trainer, accelerators +from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin +from pytorch_lightning.accelerators.old.accelerator import Accelerator from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning.utilities import DistributedType @@ -28,7 +30,8 @@ def test_accelerator_choice_cpu(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, accelerators.CPUAccelerator) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend.training_type_plugin, SingleDevicePlugin) assert 
isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) model = BoringModel() From 7290e99ae50262242c99eafd0da29e69d37675fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 03:27:31 +0100 Subject: [PATCH 060/157] move slurm configuration --- .../accelerators/accelerator_connector.py | 40 ++++++- .../trainer/connectors/slurm_connector.py | 102 +----------------- pytorch_lightning/trainer/trainer.py | 3 +- 3 files changed, 40 insertions(+), 105 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 21e8a61e333ac..ad012ee1f6ead 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -59,7 +59,6 @@ def __init__( precision, amp_type, amp_level, - is_slurm_managing_tasks, ): # initialization @@ -82,7 +81,7 @@ def __init__( self.precision = precision self.amp_type = None if amp_type is None else amp_type.lower() self.amp_level = amp_level - self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.is_slurm_managing_tasks = False # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks @@ -99,6 +98,7 @@ def __init__( # self.root_device = torch.device("cpu") self.set_distributed_mode() + self.configure_slurm_ddp() # todo: select accelerator based on trainer flags self.accelerator = self.select_accelerator() @@ -347,3 +347,39 @@ def check_horovod(self): def has_horovodrun(): """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" return "OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ + + def configure_slurm_ddp(self): + # extract SLURM flag vars + # whenever we have the correct number of tasks, we let slurm manage processes + # otherwise we launch the required number of processes + if self.use_ddp or self.use_ddp2: + num_requested_gpus = self.num_gpus * self.num_nodes + num_slurm_tasks = 0 + try: + num_slurm_tasks = int(os.environ['SLURM_NTASKS']) + self.is_slurm_managing_tasks = num_slurm_tasks == num_requested_gpus + + # enable slurm cpu + if num_requested_gpus == 0: + self.is_slurm_managing_tasks = num_slurm_tasks == self.num_processes + + # in interactive mode we don't manage tasks + job_name = os.environ['SLURM_JOB_NAME'] + if job_name == 'bash': + self.is_slurm_managing_tasks = False + + except Exception: + # likely not on slurm, so set the slurm managed flag to false + self.is_slurm_managing_tasks = False + + # used for tests only, set this flag to simulate slurm managing a task + try: + should_fake = int(os.environ['FAKE_SLURM_MANAGING_TASKS']) + if should_fake: + self.is_slurm_managing_tasks = True + except Exception: + pass + + # notify user the that slurm is managing tasks + if self.is_slurm_managing_tasks: + rank_zero_info('Multi-processing is handled by Slurm.') diff --git a/pytorch_lightning/trainer/connectors/slurm_connector.py b/pytorch_lightning/trainer/connectors/slurm_connector.py index 212e126e4bac3..02552dd67de26 100644 --- a/pytorch_lightning/trainer/connectors/slurm_connector.py +++ b/pytorch_lightning/trainer/connectors/slurm_connector.py @@ -1,69 +1,14 @@ import os -import re import signal from subprocess import call -import torch -import torch.distributed as torch_distrib - from pytorch_lightning import _logger as log -from pytorch_lightning.utilities import DeviceType, DistributedType -from pytorch_lightning.utilities.distributed 
import rank_zero_info class SLURMConnector: - def __init__(self, trainer, num_gpu_nodes): + def __init__(self, trainer): self.trainer = trainer - self.configure_slurm_ddp(num_gpu_nodes) - - def configure_slurm_ddp(self, num_gpu_nodes): - self.trainer.is_slurm_managing_tasks = False - - # extract SLURM flag vars - # whenever we have the correct number of tasks, we let slurm manage processes - # otherwise we launch the required number of processes - if self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2): - self.trainer.num_requested_gpus = self.trainer.num_gpus * num_gpu_nodes - self.trainer.num_slurm_tasks = 0 - try: - self.trainer.num_slurm_tasks = int(os.environ['SLURM_NTASKS']) - self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_requested_gpus - - # enable slurm cpu - if self.trainer.num_requested_gpus == 0: - self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_processes - - # in interactive mode we don't manage tasks - job_name = os.environ['SLURM_JOB_NAME'] - if job_name == 'bash': - self.trainer.is_slurm_managing_tasks = False - # todo: specify the possible exception - except Exception: - # likely not on slurm, so set the slurm managed flag to false - self.trainer.is_slurm_managing_tasks = False - - # used for tests only, set this flag to simulate slurm managing a task - should_fake = os.environ.get('FAKE_SLURM_MANAGING_TASKS') - if should_fake and int(should_fake): - self.trainer.is_slurm_managing_tasks = True - - # notify user the that slurm is managing tasks - if self.trainer.is_slurm_managing_tasks: - rank_zero_info('Multi-processing is handled by Slurm.') - - # todo: the same function as slurm_environment.py `_resolve_root_node_address` - def resolve_root_node_address(self, root_node): - if '[' in root_node: - name, numbers = root_node.split('[', maxsplit=1) - number = numbers.split(',', maxsplit=1)[0] - if '-' in number: - number = number.split('-')[0] - - number = re.sub('[^0-9]', '', number) - root_node = name + number - - return root_node def register_slurm_signal_handlers(self): # see if we're using slurm (not interactive) @@ -110,48 +55,3 @@ def term_handler(self, signum, frame): # Todo: required argument `signum` is not used # Todo: required argument `frame` is not used log.info("bypassing sigterm") - - # todo: this is the same func as slurm_environment.py `master_port` - def connect_ddp(self, global_rank: int, world_size: int) -> None: - """ - Sets up environment variables necessary for pytorch distributed communications - based on slurm environment. 
- """ - # use slurm job id for the port number - # guarantees unique ports across jobs from same grid search - default_port = os.environ.get("SLURM_JOB_ID") - if default_port: - # use the last 4 numbers in the job id as the id - default_port = default_port[-4:] - # all ports should be in the 10k+ range - default_port = int(default_port) + 15000 - else: - default_port = 12910 - - # if user gave a port number, use that one instead - if "MASTER_PORT" in os.environ: - default_port = os.environ["MASTER_PORT"] - else: - os.environ["MASTER_PORT"] = str(default_port) - log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}") - - # figure out the root node addr - root_node = os.environ.get("SLURM_NODELIST") - if root_node: - root_node = root_node.split(" ")[0] - else: - root_node = "127.0.0.1" - - root_node = self.trainer.slurm_connector.resolve_root_node_address(root_node) - os.environ["MASTER_ADDR"] = root_node - log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") - - torch_backend = "nccl" if self.trainer._device_type == DeviceType.GPU else "gloo" - - if not torch.distributed.is_initialized(): - log.info( - f"initializing ddp (SLURM): GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}" - ) - torch_distrib.init_process_group( - torch_backend, rank=global_rank, world_size=world_size - ) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 14eb8e81d95ea..60e5a93b97d4e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -322,7 +322,6 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.slurm_connector = SLURMConnector(self, num_nodes) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -337,7 +336,6 @@ def __init__( precision, amp_backend, amp_level, - self.is_slurm_managing_tasks, # set by slurm connector ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) @@ -347,6 +345,7 @@ def __init__( self.training_tricks_connector = TrainingTricksConnector(self) self.profile_connector = ProfilerConnector(self) self.checkpoint_connector = CheckpointConnector(self) + self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) From 1b9c095f6d1da8dabf94b51282bbd8586cc75b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 03:38:35 +0100 Subject: [PATCH 061/157] resolve importerrors --- pytorch_lightning/accelerators/accelerator.py | 8 +++++++- tests/core/test_datamodules.py | 2 -- tests/models/test_gpu.py | 5 ++--- tests/models/test_hooks.py | 2 -- tests/models/test_horovod.py | 9 +++++---- tests/models/test_tpu.py | 8 ++++---- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3f24d6b01c71d..242be59c082bf 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -284,4 +284,10 @@ def on_train_start(self): torch.cuda.empty_cache() -# TODO: Add NewTPUAccelerator \ No newline at end of file +# TODO: Complete the TPUAccelerator +class NewTPUAccelerator(NewAccelerator): + def setup(self, trainer, model): + raise NotImplementedError + + def on_train_start(self): + raise NotImplementedError diff --git 
a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index dd7f7e8614f6f..9817e3c85a7e0 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -20,7 +20,6 @@ import torch from pytorch_lightning import LightningDataModule, Trainer -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import TrainerState from tests.base import BoringDataModule, BoringModel @@ -419,7 +418,6 @@ def transfer_batch_to_device(self, data, device): model.transfer_batch_to_device = dm.transfer_batch_to_device - trainer.accelerator_backend = GPUAccelerator(trainer) batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) expected = torch.device('cuda', 0) assert dm.hook_called diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 7cfeb8f0ae53e..4bf854da4b8d8 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -24,7 +24,8 @@ from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import BoringModel +from tests.base import EvalModelTemplate + PRETEND_N_OF_GPUS = 16 @@ -210,7 +211,6 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_single_gpu_batch_parse(): trainer = Trainer(gpus=1) - trainer.accelerator_backend = GPUAccelerator(trainer) # non-transferrable types primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}] @@ -306,7 +306,6 @@ def to(self, *args, **kwargs): def test_non_blocking(): """ Tests that non_blocking=True only gets passed on torch.Tensor.to, but not on other objects. 
""" trainer = Trainer() - trainer.accelerator_backend = GPUAccelerator(trainer) batch = torch.zeros(2, 3) with patch.object(batch, 'to', wraps=batch.to) as mocked: diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 1f25d46f82944..0565ba594179f 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -18,7 +18,6 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.trainer.states import TrainerState from tests.base import BoringModel, EvalModelTemplate @@ -116,7 +115,6 @@ def transfer_batch_to_device(self, data, device): batch = CustomBatch((torch.zeros(5, 28), torch.ones(5, 1, dtype=torch.long))) trainer = Trainer(gpus=1) - trainer.accelerator_backend = GPUAccelerator(trainer) # running .fit() would require us to implement custom data loaders, we mock the model reference instead trainer.get_model = MagicMock(return_value=model) batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 7ac7cd235f392..6b2eaef1f1da8 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,7 +26,8 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator +from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator +from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE @@ -311,12 +312,12 @@ def _compute_batch(): accelerator='horovod', ) - accelerator_backend = trainer.accelerator_connector.select_accelerator() - assert isinstance(accelerator_backend, HorovodAccelerator) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + # TODO: test that we selected the correct training_type_plugin based on horovod flags metric = Accuracy(compute_on_step=True, dist_sync_on_step=True, - dist_sync_fn=accelerator_backend.gather_all_tensors, + dist_sync_fn=trainer.accelerator_backend.gather_all_tensors, threshold=threshold) for i in range(hvd.rank(), num_batches, hvd.size()): diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 5e977eed765d0..45cd9b2154c43 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -19,8 +19,8 @@ from torch.utils.data import DataLoader import tests.base.develop_pipelines as tpipes -from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import TPUAccelerator +from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning.accelerators.accelerator import NewTPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE @@ -250,9 +250,9 @@ def test_broadcast_on_tpu(): """ Checks if an object from the master process is broadcasted to other processes correctly""" def test_broadcast(rank): trainer = Trainer(tpu_cores=8) - backend = TPUAccelerator(trainer) + assert isinstance(trainer.accelerator_backend, NewTPUAccelerator) obj = ("ver_0.5", "logger_name", rank) - result = backend.broadcast(obj) + result = 
trainer.accelerator_backend.broadcast(obj) assert result == ("ver_0.5", "logger_name", 0) xmp.spawn(test_broadcast, nprocs=8, start_method='fork') From e50aea912861256f11cb6f6b727678dae302ca8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 05:16:59 +0100 Subject: [PATCH 062/157] handle distributed_sampler_kwargs --- .../accelerators/data_parallel.py | 34 ++++++++++++++++--- pytorch_lightning/trainer/properties.py | 3 +- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 331968ca9ee66..b5f774f9b7bed 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -195,6 +195,14 @@ def connect(self, model): def is_global_zero(self) -> bool: return self.global_rank == 0 + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=len(self.parallel_devices), + rank=self.global_rank + ) + return distributed_sampler_kwargs + @staticmethod def configure_sync_batchnorm(model: LightningModule) -> LightningModule: """ @@ -272,6 +280,19 @@ def __init__( def root_device(self): return self.parallel_devices[self.local_rank] + @property + def lightning_module(self): + # the model may not be wrapped with DistributedDataParallel if calling this too early + return getattr(self._model, "module", self._model) + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=(self.num_nodes * self.num_processes), + rank=self.global_rank + ) + return distributed_sampler_kwargs + def determine_local_rank(self): if self.is_slurm_managing_tasks: return int(os.environ['SLURM_LOCALID']) @@ -294,11 +315,6 @@ def setup(self, model): # set the task idx self.task_idx = int(os.environ["LOCAL_RANK"]) - @property - def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) - def _call_children_scripts(self): # bookkeeping of spawned processes @@ -504,6 +520,14 @@ def lightning_module(self): # the model may not be wrapped with DistributedDataParallel if calling this too early return getattr(self._model, "module", self._model) + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=(self.num_nodes * self.num_processes), + rank=self.global_rank + ) + return distributed_sampler_kwargs + def setup(self, model): self._model = model diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 86d146783e2f3..97d9885e57f32 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -365,8 +365,9 @@ def require_distributed_sampler(self): @property def distributed_sampler_kwargs(self): if self.accelerator_backend is not None: - return self.accelerator_backend.distributed_sampler_kwargs + return self.training_type_plugin.distributed_sampler_kwargs + # TODO: make sure the cases below are handled by the training_type_plugin if self._device_type == DeviceType.TPU: kwargs = dict(num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) From 2e8f9444f70d9075b25ea2062de8b479ea3a661f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 05:22:34 +0100 Subject: [PATCH 063/157] move emptying cache to accelertor --- pytorch_lightning/accelerators/accelerator.py | 7 +++++++ 
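The distributed_sampler_kwargs property introduced in PATCH 062 above is what downstream dataloader code consumes when it builds a DistributedSampler. Roughly, and only as an illustration (the helper below is not part of the patch), assuming a plain map-style dataset:

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


def build_distributed_dataloader(dataset, trainer, batch_size=32):
    # e.g. dict(num_replicas=num_nodes * num_processes, rank=global_rank),
    # as returned by the active training type plugin
    kwargs = trainer.distributed_sampler_kwargs
    sampler = DistributedSampler(dataset, **kwargs)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)
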
pytorch_lightning/accelerators/data_parallel.py | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 242be59c082bf..a370106773e71 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -283,6 +283,10 @@ def on_train_start(self): with torch.cuda.device(self.root_device): torch.cuda.empty_cache() + def on_train_end(self): + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() # TODO: Complete the TPUAccelerator class NewTPUAccelerator(NewAccelerator): @@ -291,3 +295,6 @@ def setup(self, trainer, model): def on_train_start(self): raise NotImplementedError + + def on_train_end(self): + raise NotImplementedError diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index b5f774f9b7bed..73b77c65cf775 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -467,9 +467,6 @@ def pre_training(self): self.barrier() def post_training(self, best_model_path): - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() - if "WORLD_SIZE" in os.environ: del os.environ["WORLD_SIZE"] @@ -598,10 +595,6 @@ def new_process(self, process_idx, trainer): self.transfer_distrib_spawn_state_on_fit_end(results) def post_training(self, best_model_path): - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() - # restore main state with best weights best_path = self.mp_queue.get() results = self.mp_queue.get() From bcc7a72de742c1435ee2cad63abeea4a6d5cb902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 08:45:15 +0100 Subject: [PATCH 064/157] fix a few tests --- pytorch_lightning/accelerators/base_plugin.py | 2 +- .../accelerators/data_parallel.py | 18 +++++++++--------- pytorch_lightning/trainer/properties.py | 9 +++++++++ pytorch_lightning/trainer/trainer.py | 6 +++--- tests/trainer/test_dataloaders.py | 2 +- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/base_plugin.py index 549d311f7f87d..3ecfb48726f76 100644 --- a/pytorch_lightning/accelerators/base_plugin.py +++ b/pytorch_lightning/accelerators/base_plugin.py @@ -15,7 +15,7 @@ def post_optimizer_step(self, optimizer, optimizer_idx): def pre_training(self): pass - def post_training(self, best_model_path): + def post_training(self): pass @contextlib.contextmanager diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 73b77c65cf775..60f61b65bf8c7 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -64,6 +64,10 @@ def model_to_device(self): def is_global_zero(self): raise NotImplementedError + @abstractmethod + def reduce(self, output, *args, **kwargs): + raise NotImplementedError + @abstractmethod def barrier(self, name: Optional[str] = None): raise NotImplementedError @@ -133,7 +137,7 @@ def __init__(self, device): def on_gpu(self): return self.device.type == "cuda" and torch.cuda.is_available() - def reduce(self, output): + def reduce(self, output, *args, **kwargs): return output @property @@ -170,10 +174,6 @@ def __init__(self, parallel_devices: List[torch.device], cluster_environment=Non self.world_size = 1 
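PATCH 047 and PATCH 063 converge on the same pattern for releasing cached GPU memory: call torch.cuda.empty_cache() inside a torch.cuda.device(...) context so the call is scoped to the plugin's root device rather than whatever CUDA device happens to be current. A standalone sketch of that pattern (the helper name is illustrative, not part of the patch):

import torch


def release_cached_memory(root_device: torch.device) -> None:
    # Only meaningful for CUDA devices; a CPU run has no CUDA cache to release.
    if root_device.type == "cuda":
        # Scope the call to the intended GPU instead of the default current device.
        with torch.cuda.device(root_device):
            torch.cuda.empty_cache()
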
self.cluster_environment = cluster_environment - @abstractmethod - def reduce(self, output): - raise NotImplementedError - @property @abstractmethod def root_device(self): @@ -187,7 +187,7 @@ def on_gpu(self): def setup(self, model): raise NotImplementedError - def connect(self, model): + def connect(self, model, *args, **kwargs): self.setup(model) return self.model @@ -226,7 +226,7 @@ class DataParallelPlugin(ParallelPlugin): def setup(self, model): self._model = LightningDataParallel(model, self.parallel_devices) - def reduce(self, output): + def reduce(self, output, *args, **kwargs): if isinstance(output, Result): output.dp_reduce() @@ -466,7 +466,7 @@ def pre_training(self): self.barrier() - def post_training(self, best_model_path): + def post_training(self): if "WORLD_SIZE" in os.environ: del os.environ["WORLD_SIZE"] @@ -594,7 +594,7 @@ def new_process(self, process_idx, trainer): # persist info in ddp_spawn self.transfer_distrib_spawn_state_on_fit_end(results) - def post_training(self, best_model_path): + def post_training(self): # restore main state with best weights best_path = self.mp_queue.get() results = self.mp_queue.get() diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 97d9885e57f32..0a85a4a298ae3 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -66,6 +66,7 @@ class TrainerProperties(ABC): accelerator_backend: NewAccelerator num_nodes: int num_processes: int + accelerator_connector: BackendConnector @property def accelerator(self): @@ -141,6 +142,14 @@ def num_nodes(self): def num_processes(self): return self.accelerator_connector.num_processes + @property + def root_gpu(self): + return self.accelerator_connector.root_gpu + + @property + def data_parallel_device_ids(self): + return self.accelerator_connector.parallel_device_ids + @property def log_dir(self): if self.checkpoint_callback is not None: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 60e5a93b97d4e..0bae9a788c10c 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -540,7 +540,7 @@ def fit( else: self.training_type_plugin.start_training(self) - results = self.training_type_plugin.post_training(self.checkpoint_callback.best_model_path) + results = self.training_type_plugin.post_training() self.accelerator_backend.teardown() # ---------------------------- @@ -900,8 +900,8 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): f"specify a path for a checkpoint .test(ckpt_path=PATH)" ) return {} - if self.accelerator_backend is not None and not self._device_type == DeviceType.TPU: - self.accelerator_backend.barrier() + if not self._device_type == DeviceType.TPU: + self.training_type_plugin.barrier() ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt["state_dict"]) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index a93a722bba597..42d9072e476d6 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -129,7 +129,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(trainer.model, dataloader) + tpipes.run_prediction(dataloader, model) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) From 259c7f72b4fd6006dd9d117d84fac63fc5f51e3f Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 09:05:25 +0100 Subject: [PATCH 065/157] restoring the result from subprocess --- .../accelerators/data_parallel.py | 27 ++++++++++++------- pytorch_lightning/trainer/trainer.py | 3 ++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 60f61b65bf8c7..4f7984d25c77f 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -43,6 +43,7 @@ class ReduceOp: class TrainingTypePlugin(Plugin, ABC): def __init__(self): self._model = None + self._results = None self.global_rank = 0 @property @@ -76,6 +77,7 @@ def barrier(self, name: Optional[str] = None): def broadcast(self, obj: object, src: int = 0) -> object: raise NotImplementedError + # TODO method this is currently unused def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): if device_ids is None: return @@ -115,17 +117,26 @@ def model(self, new_model): def lightning_module(self): return self._model + @property + def results(self): + """ + The results of the last training/testing run will be cached here. + In distributed training, we make sure to transfer the results to the appropriate master process. + """ + # TODO: improve these docs + return self._results + + @property + def rpc_enabled(self): + return False + def start_training(self, trainer): # double dispatch to initiate the training loop - return trainer.train() + self._results = trainer.train() def start_testing(self, trainer): # double dispatch to initiate the test loop - return trainer.run_test() - - @property - def rpc_enabled(self): - return False + self._results = trainer.run_test() class SingleDevicePlugin(TrainingTypePlugin): @@ -597,12 +608,10 @@ def new_process(self, process_idx, trainer): def post_training(self): # restore main state with best weights best_path = self.mp_queue.get() - results = self.mp_queue.get() last_path = self.mp_queue.get() - + self._results = self.mp_queue.get() # recover the weights of the processes trained in the children self.__recover_child_process_weights(best_path, last_path) - return results def configure_ddp(self): # if unset, default `find_unused_parameters` `True` diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0bae9a788c10c..ce1741ecfbbb6 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -540,8 +540,9 @@ def fit( else: self.training_type_plugin.start_training(self) - results = self.training_type_plugin.post_training() + self.training_type_plugin.post_training() self.accelerator_backend.teardown() + results = self.training_type_plugin.results # ---------------------------- # POST-Training CLEAN UP From dfab52a001f5acb73bcb9c91cea2ec6227a57349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 09:08:34 +0100 Subject: [PATCH 066/157] fix queue.get() order for results --- pytorch_lightning/accelerators/data_parallel.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 4f7984d25c77f..56806f604f53e 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -610,6 +610,7 @@ def post_training(self): best_path = self.mp_queue.get() last_path = self.mp_queue.get() self._results = self.mp_queue.get() + 
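PATCH 065 and PATCH 066 make the spawned DDP workers hand their results back to the parent process through a multiprocessing queue, and the order of get() calls in the parent must mirror the order of put() calls in the worker (best path, then last path, then results). A self-contained sketch of that handoff, not taken from the patch, with placeholder payloads:

import torch.multiprocessing as mp


def worker(rank, queue):
    # Only the rank-0 worker reports back; the put() order here must
    # match the get() order in the parent below.
    if rank == 0:
        queue.put("best_model.ckpt")          # best_model_path
        queue.put("best_model.tmp_end.ckpt")  # last_path
        queue.put({"test_acc": 0.9})          # results


if __name__ == "__main__":
    smp = mp.get_context("spawn")
    queue = smp.SimpleQueue()
    mp.spawn(worker, nprocs=2, args=(queue,))  # blocks until all workers exit
    best_path, last_path, results = queue.get(), queue.get(), queue.get()
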
# recover the weights of the processes trained in the children self.__recover_child_process_weights(best_path, last_path) @@ -644,9 +645,6 @@ def transfer_distrib_spawn_state_on_fit_end(self, results): if self.global_rank == 0 and self.mp_queue is not None: rank_zero_warn('cleaning up ddp environment...') - # todo, pass complete checkpoint as state dictionary - self.mp_queue.put(best_model_path) - self.mp_queue.put(results) # save the last weights last_path = None @@ -654,7 +652,11 @@ def transfer_distrib_spawn_state_on_fit_end(self, results): if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) atomic_save(self.lightning_module.state_dict(), last_path) + + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) self.mp_queue.put(last_path) + self.mp_queue.put(results) def __recover_child_process_weights(self, best_path, last_path): # TODO: is there a better way than accessing callback through model -> trainer -> callback? From 6742488d0210b57105ebc5a64e7f59e60d76e8f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 09:29:52 +0100 Subject: [PATCH 067/157] add missing "block_backward_sync" context manager --- pytorch_lightning/accelerators/data_parallel.py | 15 ++++++++++++++- pytorch_lightning/trainer/training_loop.py | 5 +++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 56806f604f53e..4ccca43cc0902 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod import re +from contextlib import contextmanager + from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin @@ -20,7 +22,6 @@ import numpy as np import torch.distributed as torch_distrib from pytorch_lightning import _logger as log -import contextlib import torch.multiprocessing as mp from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn, rank_zero_info @@ -231,6 +232,18 @@ def configure_sync_batchnorm(model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) return model + @contextmanager + def block_backward_sync(self): + """ + Blocks ddp sync gradients behaviour on backwards pass. 
+ This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + if isinstance(self.model, LightningDistributedDataParallel): + yield self.model.no_sync() + else: + yield None + class DataParallelPlugin(ParallelPlugin): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index bc42de5aed110..65437ebc5e5dd 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -18,6 +18,7 @@ import numpy as np import torch +from pytorch_lightning.accelerators.data_parallel import ParallelPlugin from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary @@ -719,8 +720,8 @@ def block_ddp_sync_behaviour(self): Returns: context manager with sync behaviour off """ - if self.trainer.accelerator_backend is not None and self.automatic_optimization: - yield self.trainer.accelerator_backend.block_ddp_plugin_sync_behaviour() + if isinstance(self.trainer.training_type_plugin, ParallelPlugin) is not None and self.automatic_optimization: + yield self.trainer.training_type_plugin.block_backward_sync() else: yield None From 8c89932458867ee3d48bf1412afc063e0e069307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 09:31:16 +0100 Subject: [PATCH 068/157] add missing "block_backward_sync" context manager --- pytorch_lightning/trainer/training_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 65437ebc5e5dd..7c010ba72c137 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -720,7 +720,7 @@ def block_ddp_sync_behaviour(self): Returns: context manager with sync behaviour off """ - if isinstance(self.trainer.training_type_plugin, ParallelPlugin) is not None and self.automatic_optimization: + if isinstance(self.trainer.training_type_plugin, ParallelPlugin) and self.automatic_optimization: yield self.trainer.training_type_plugin.block_backward_sync() else: yield None From 0186a0fa5e9fe145118bbee055709024fb2336f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 10:06:11 +0100 Subject: [PATCH 069/157] fix sync_batchnorm --- .../accelerators/accelerator_connector.py | 2 ++ pytorch_lightning/accelerators/data_parallel.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index ad012ee1f6ead..91bad5fc5f373 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -190,6 +190,7 @@ def select_training_type_plugin(self): num_nodes=self.num_nodes, cluster_environment=cluster_environment, is_slurm_managing_tasks=self.is_slurm_managing_tasks, + sync_batchnorm=self.sync_batchnorm, ) elif self.use_ddp and self.distributed_backend in ("ddp_spawn", "ddp_spawn_cpu", "ddp_cpu"): plugin = DDPSpawnPlugin( @@ -197,6 +198,7 @@ def select_training_type_plugin(self): num_nodes=self.num_nodes, cluster_environment=cluster_environment, is_slurm_managing_tasks=self.is_slurm_managing_tasks, + sync_batchnorm=self.sync_batchnorm, ) else: # TODO: cover all other cases diff --git 
a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 4ccca43cc0902..b8290ae4b1cd8 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -288,12 +288,14 @@ def __init__( num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, + sync_batchnorm=False, **kwargs: Dict[str, Any], ) -> None: super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.sync_batchnorm = sync_batchnorm self.dist = LightningDistributed() self._ddp_kwargs = kwargs self._has_spawned_children = False @@ -481,7 +483,8 @@ def pre_training(self): self.dist.rank = self.global_rank self.dist.device = self.root_device - self.model = self.configure_sync_batchnorm(self.model) + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) # move the model to the correct device self.model_to_device() @@ -522,11 +525,13 @@ def __init__( num_nodes=1, cluster_environment=None, is_slurm_managing_tasks=False, + sync_batchnorm=False, **kwargs: Dict[str, Any] ): super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.num_nodes = num_nodes self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.sync_batchnorm = sync_batchnorm self._ddp_kwargs = kwargs self.dist = LightningDistributed() self.num_processes = len(parallel_devices) @@ -601,7 +606,8 @@ def new_process(self, process_idx, trainer): self.dist.rank = self.global_rank self.dist.device = self.root_device - self.model = self.configure_sync_batchnorm(self.model) + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) # move the model to the correct device self.model_to_device() From b2ac1f401fc14343d8a037bae58e7386cf9430d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 10:10:49 +0100 Subject: [PATCH 070/157] fix supported gpu-ids for tuple --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 4bf854da4b8d8..5643dce5a6160 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -162,6 +162,7 @@ def test_determine_root_gpu_device(gpus, expected_root_gpu): pytest.param(-1, list(range(PRETEND_N_OF_GPUS)), id="-1 - use all gpus"), pytest.param([0], [0]), pytest.param([1, 3], [1, 3]), + pytest.param((1, 3), [1, 3]), pytest.param('0', [0]), pytest.param('3', [3]), pytest.param('1, 3', [1, 3]), @@ -181,7 +182,6 @@ def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): pytest.param([-1]), pytest.param([None]), pytest.param(['0']), - pytest.param((0, 1)), ]) def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): with pytest.raises(MisconfigurationException): From 07a41ce9226f3c241424dc7429536a91f8d901b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 22 Dec 2020 12:05:33 +0100 Subject: [PATCH 071/157] fix clip gradients and inf recursion --- pytorch_lightning/accelerators/accelerator.py | 13 ++++++++----- pytorch_lightning/accelerators/precision.py | 3 +++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index a370106773e71..d2c040a30d9e9 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ 
b/pytorch_lightning/accelerators/accelerator.py @@ -150,16 +150,19 @@ def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val <= 0: return - self._clip_gradients(optimizer, grad_clip_val) model = self.lightning_module # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX + + # if self.trainer.amp_backend == AMPType.APEX: + # parameters = self.precision_plugin.master_params(optimizer) + # else: + # parameters = model.parameters() + + # TODO # ... or we call master_params() and in the default plugin we return the model.parameters() - if self.trainer.amp_backend == AMPType.APEX: - parameters = self.precision_plugin.master_params(optimizer) - else: - parameters = model.parameters() + parameters = self.precision_plugin.master_params(optimizer) max_norm = grad_clip_val norm_type = float(2.0) diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py index 3ce68c8e1efc6..a2ee98b686bae 100644 --- a/pytorch_lightning/accelerators/precision.py +++ b/pytorch_lightning/accelerators/precision.py @@ -112,6 +112,9 @@ def __init__(self, amp_level): self.backend = AMPType.APEX self.amp_level = amp_level + def master_params(self, optimizer): + return amp.master_params(optimizer) + def connect(self, model, optimizers, lr_schedulers): model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) reinit_scheduler_properties(optimizers, lr_schedulers) From 63b7eafa03c0bdafe8dc0fe6ed54680a3a5c2295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 23 Dec 2020 12:11:32 +0100 Subject: [PATCH 072/157] accelerator selection: added cluster_environment plugin --- .../accelerators/accelerator_connector.py | 60 +++--- .../accelerators/data_parallel.py | 4 +- pytorch_lightning/plugins/plugin_connector.py | 19 +- pytorch_lightning/trainer/properties.py | 4 + pytorch_lightning/trainer/trainer.py | 11 +- tests/backends/test_accelerator_connector.py | 175 ++++++++---------- 6 files changed, 137 insertions(+), 136 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 91bad5fc5f373..935548b9fd6e3 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,7 @@ from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin + DataParallelPlugin, DDP2Plugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only @@ -59,6 +59,7 @@ def __init__( precision, amp_type, amp_level, + cluster_environment, ): # initialization @@ -81,6 +82,7 @@ def __init__( self.precision = precision self.amp_type = None if amp_type is None else amp_type.lower() self.amp_level = amp_level + self.cluster_environment = cluster_environment self.is_slurm_managing_tasks = False # init the default rank if exists @@ -152,6 +154,11 @@ def parallel_devices(self): devices = [torch.device("cpu")] * self.num_processes return devices + @property + def is_using_torchelastic(self): + te_flags_passed = 'WORLD_SIZE' in os.environ 
and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ) + return te_flags_passed + def select_precision_plugin(self): if self.precision == 32: self.amp_type = None @@ -182,26 +189,43 @@ def select_precision_plugin(self): def select_training_type_plugin(self): cluster_environment = self.select_cluster_environment() - if self.use_dp and self.distributed_backend == "dp": - plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) - elif self.use_ddp and self.distributed_backend == "ddp": - plugin = DDPPlugin( + if self.use_ddp2: + plugin = DDP2Plugin( parallel_devices=self.parallel_devices, - num_nodes=self.num_nodes, - cluster_environment=cluster_environment, - is_slurm_managing_tasks=self.is_slurm_managing_tasks, - sync_batchnorm=self.sync_batchnorm, + cluster_environment=cluster_environment ) - elif self.use_ddp and self.distributed_backend in ("ddp_spawn", "ddp_spawn_cpu", "ddp_cpu"): - plugin = DDPSpawnPlugin( + elif self.use_ddp: + use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks + use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic + use_ddp_spawn = self.use_ddp and self.distributed_backend == "ddp_spawn" + use_ddp_cpu_spawn = self.use_ddp and self.distributed_backend == "ddp_cpu" + use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic + use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks + + # ddp script mode uses the same flags as TE + # TODO: decouple from TE + if os.environ.get('PL_IN_DDP_SUBPROCESS', False): + use_torchelastic_ddp = False + + if use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: + ddp_plugin_cls = DDPPlugin + elif use_ddp_spawn or use_ddp_cpu_spawn: + ddp_plugin_cls = DDPSpawnPlugin + else: + ddp_plugin_cls = DDPPlugin + + plugin = ddp_plugin_cls( parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, cluster_environment=cluster_environment, is_slurm_managing_tasks=self.is_slurm_managing_tasks, sync_batchnorm=self.sync_batchnorm, ) + elif self.use_dp: + plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) + elif self.use_horovod: + raise NotImplementedError else: - # TODO: cover all other cases plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) return plugin @@ -221,22 +245,16 @@ def select_accelerator(self): ) def select_cluster_environment(self): - # TODO: support the cloud environment set by the plugin connector! 
- # if self.trainer.plugin_connector.cloud_environment: - # env = self.trainer.plugin_connector.cloud_environment - # elif self.is_slurm_managing_tasks: + if self.cluster_environment is not None: + return self.cluster_environment if self.is_slurm_managing_tasks: env = SLURMEnvironment() - elif self._is_using_torchelastic(): + elif self.is_using_torchelastic: env = TorchElasticEnvironment() else: env = TorchElasticEnvironment() return env - def _is_using_torchelastic(self): - te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ) - return te_flags_passed - def set_distributed_mode(self): # No distributed backend diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index b8290ae4b1cd8..ba28732336430 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -339,7 +339,7 @@ def setup(self, model): self._call_children_scripts() # set the task idx - self.task_idx = int(os.environ["LOCAL_RANK"]) + self.task_idx = self.cluster_environment.local_rank() def _call_children_scripts(self): @@ -721,3 +721,5 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ # TODO: DDP2 (?), HOROVOD DDP AND HPC DDP +class DDP2Plugin(DDPPlugin): + pass diff --git a/pytorch_lightning/plugins/plugin_connector.py b/pytorch_lightning/plugins/plugin_connector.py index ccd128d87a26a..e1071fa24ec04 100644 --- a/pytorch_lightning/plugins/plugin_connector.py +++ b/pytorch_lightning/plugins/plugin_connector.py @@ -26,20 +26,21 @@ class PluginConnector: - def __init__(self, trainer): + def __init__(self, trainer, plugins: Optional[Union[str, list]]): self.trainer = trainer - self.plugins = [] + self.plugins = plugins or [] self.ddp_plugin = DDPPlugin() self.cloud_environment = None - - def on_trainer_init(self, plugins: Optional[Union[str, list]]): - self.plugins = plugins - if self.plugins is None: - self.plugins = [] + self.amp_plugin = NativeAMPPlugin(trainer) + self.apex_plugin = ApexPlugin(trainer) self.plugins = self._convert_str_custom_plugins(self.plugins) - self.plugins = self._append_required_plugins(self.plugins) - self.__attach_ddp() + # TODO: do we need this? 
+ # self.plugins = self._append_required_plugins(self.plugins) self.__attach_cluster() + # TODO: attach training_type_plugin + + def on_trainer_init(self): + self.__attach_ddp() self.__attach_amp() self.__attach_apex() diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 0a85a4a298ae3..bb7559f503b25 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -86,6 +86,10 @@ def distributed_backend(self): def training_type_plugin(self): return self.accelerator.training_type_plugin + @property + def precision_plugin(self): + return self.accelerator.precision_plugin + @property def global_rank(self): return self.accelerator.training_type_plugin.global_rank diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ce1741ecfbbb6..fa1e853153853 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -322,6 +322,7 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) + self.plugin_connector = PluginConnector(self, plugins) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -336,6 +337,7 @@ def __init__( precision, amp_backend, amp_level, + self.plugin_connector.cloud_environment ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) @@ -349,7 +351,6 @@ def __init__( self.tuner = Tuner(self) self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) - self.plugin_connector = PluginConnector(self) # training state self.model = None @@ -431,7 +432,8 @@ def __init__( # self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) # last thing are the plugins which override whatever the trainer used by default - self.plugin_connector.on_trainer_init(plugins) + # TODO: probably not needed anymore after refactor + self.plugin_connector.on_trainer_init() # Callback system self.on_init_end() @@ -517,7 +519,6 @@ def fit( self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) self.train_loop.setup_training(model) - self.training_type_plugin.pre_training() # ---------------------------- # INSPECT THESE FOR MAIN LOOPS # ---------------------------- @@ -531,9 +532,11 @@ def fit( # TRAIN # ---------------------------- # hook - self.call_hook("on_fit_start") + # plugin will setup training (e.g. ddp will launch child processes) + self.training_type_plugin.pre_training() + # double dispatch: let the plugin initiate the training/test loop.
if self.testing: self.training_type_plugin.start_testing(self) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 1dddd48ea0d25..37a1911be38d3 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -16,94 +16,59 @@ from unittest import mock import pytest +import torch -from pytorch_lightning import Trainer, accelerators -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin -from pytorch_lightning.accelerators.old.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewGPUAccelerator, NewAccelerator +from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin +from pytorch_lightning.accelerators.precision import PrecisionPlugin +from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment -from pytorch_lightning.utilities import DistributedType from tests.base.boring_model import BoringModel def test_accelerator_choice_cpu(tmpdir): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) - assert isinstance(trainer.accelerator_backend.training_type_plugin, SingleDevicePlugin) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - - model = BoringModel() trainer = Trainer( fast_dev_run=True, - callbacks=[CB()] ) - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, SingleDevicePlugin) def test_accelerator_choice_ddp_cpu(tmpdir): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSpawnAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=2, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch('torch.cuda.device_count', return_value=2) def test_accelerator_choice_ddp(tmpdir): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() trainer = Trainer( fast_dev_run=True, accelerator='ddp', gpus=1, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, 
TorchElasticEnvironment) @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch('torch.cuda.device_count', return_value=2) def test_accelerator_choice_ddp_spawn(tmpdir): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPSpawnAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() trainer = Trainer( fast_dev_run=True, accelerator='ddp_spawn', gpus=1, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @mock.patch.dict(os.environ, { @@ -117,11 +82,13 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() model = BoringModel() @@ -148,11 +115,13 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp2_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type == DistributedType.DDP2 - assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp2 + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() @@ -178,11 +147,12 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert 
trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() model = BoringModel() @@ -207,11 +177,12 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp2_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type == DistributedType.DDP2 - assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp2 + assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() model = BoringModel() @@ -235,12 +206,12 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp_cpu_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx - + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.task_idx == 10 + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 raise SystemExit() model = BoringModel() @@ -266,9 +237,11 @@ def on_fit_start(self, trainer, pl_module): def test_accelerator_choice_ddp_cpu_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) + assert trainer.use_ddp + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) raise SystemExit() model = BoringModel() @@ -302,9 +275,10 @@ def master_address(self): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, 
accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, CustomCluster) + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, CustomCluster) raise SystemExit() model = BoringModel() @@ -329,29 +303,27 @@ def on_fit_start(self, trainer, pl_module): }) @mock.patch('torch.cuda.device_count', return_value=0) def test_custom_accelerator(tmpdir): - class Accel(Accelerator): - def init_ddp_connection( - self, - global_rank: int, - world_size: int, - is_slurm_managing_tasks: bool = True) -> None: - pass + class Accel(NewAccelerator): + pass - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, Accel) - raise SystemExit() + class Prec(PrecisionPlugin): + pass - model = BoringModel() + class TrainTypePlugin(SingleDevicePlugin): + pass + + accelerator = Accel( + training_type_plugin=TrainTypePlugin(device=torch.device("cpu")), + precision_plugin=Prec(), + ) trainer = Trainer( + accelerator=accelerator, fast_dev_run=True, - accelerator=Accel(), num_processes=2, - callbacks=[CB()] ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, Accel) + assert isinstance(trainer.training_type_plugin, TrainTypePlugin) + assert isinstance(trainer.precision_plugin, Prec) @mock.patch.dict(os.environ, { @@ -365,7 +337,8 @@ def on_fit_start(self, trainer, pl_module): def test_dist_backend_accelerator_mapping(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) + assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) raise SystemExit() model = BoringModel() From f8344c5afe7bcfee3b942c3ba6084878ae0ec829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 23 Dec 2020 13:10:34 +0100 Subject: [PATCH 073/157] fix torchelastic test --- pytorch_lightning/accelerators/accelerator_connector.py | 4 ++++ pytorch_lightning/accelerators/data_parallel.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 935548b9fd6e3..3733fad589921 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -251,7 +251,11 @@ def select_cluster_environment(self): env = SLURMEnvironment() elif self.is_using_torchelastic: env = TorchElasticEnvironment() + # TODO: decouple DDP from TE + # maybe introduce a DefaultEnvironment? + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" else: + # TODO: maybe introduce a DefaultEnvironment? 
env = TorchElasticEnvironment() return env diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index ba28732336430..ab94eea92b3f5 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -2,6 +2,7 @@ import re from contextlib import contextmanager +from pytorch_lightning.cluster_environments import TorchElasticEnvironment from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin @@ -335,6 +336,7 @@ def setup(self, model): self._model = model # start the other scripts + # TODO: make sure this works, in torchelastic we should not launch child processes! if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": self._call_children_scripts() From 34e3c15c18d9fd48c63e114ef651595b71c8ddf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 08:04:33 +0100 Subject: [PATCH 074/157] fix reduce early stopping decision for DDP --- pytorch_lightning/accelerators/accelerator.py | 4 ---- pytorch_lightning/accelerators/data_parallel.py | 12 ++++++++++++ pytorch_lightning/callbacks/early_stopping.py | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index d2c040a30d9e9..9a3824b794089 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -194,10 +194,6 @@ def on_train_epoch_end(self, outputs): def on_train_end(self): pass - # TODO: Check if we can change logic for early stopping to accelerator/trainer completely or have a separate connector (should be self contained) - def early_stopping_should_stop(self, pl_module): - return self.trainer.should_stop - def setup_optimizers(self, trainer, model): if trainer.testing is True: return diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index ab94eea92b3f5..eeb14380402d6 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -107,6 +107,9 @@ def determine_node_rank(self): rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") return int(rank) + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + return should_stop + @property def model(self): return self._model @@ -216,6 +219,12 @@ def distributed_sampler_kwargs(self): ) return distributed_sampler_kwargs + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) + should_stop = self.reduce(should_stop, reduce_op=ReduceOp.SUM) + should_stop = bool(should_stop == self.world_size) + return should_stop + @staticmethod def configure_sync_batchnorm(model: LightningModule) -> LightningModule: """ @@ -278,6 +287,9 @@ def barrier(self, *args, **kwargs): def broadcast(self, obj: object, src: int = 0) -> object: return obj + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + return should_stop + class DDPPlugin(ParallelPlugin): diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index ec44a1eeb416b..d39e600820735 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -188,6 +188,7 @@ def _run_early_stopping_check(self, trainer, pl_module): 
return # short circuit if metric not present current = logs.get(self.monitor) + should_stop = False # when in dev debugging trainer.dev_debugger.track_early_stopping_history(self, current) @@ -204,5 +205,5 @@ def _run_early_stopping_check(self, trainer, pl_module): trainer.should_stop = True # stop every ddp process if any world process decides to stop - should_stop = trainer.accelerator_backend.early_stopping_should_stop(pl_module) + should_stop = trainer.training_type_plugin.reduce_early_stopping_decision(should_stop) trainer.should_stop = should_stop From 27a4cff940efc305b0a573f4b7d2e40c0aae2b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 10:05:56 +0100 Subject: [PATCH 075/157] fix tests: callbacks, conversion to lightning optimizer --- pytorch_lightning/accelerators/accelerator.py | 1 + .../accelerators/data_parallel.py | 8 +++--- pytorch_lightning/trainer/optimizers.py | 5 ++-- pytorch_lightning/trainer/properties.py | 25 +++++++++++++------ pytorch_lightning/trainer/trainer.py | 2 ++ tests/callbacks/test_callbacks.py | 9 +++---- tests/models/test_hooks.py | 4 +-- 7 files changed, 33 insertions(+), 21 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9a3824b794089..8c1bfdc9301cb 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -36,6 +36,7 @@ def setup(self, trainer, model): self.connect_training_type_plugin(self.training_type_plugin, model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) + self.optimizers = trainer.convert_to_lightning_optimizers(self.optimizers) @property def model(self): diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index eeb14380402d6..dcc6e4b139406 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -480,10 +480,10 @@ def pre_training(self): # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table - # TODO: CHeck is_slurm_managing_tasks self.init_ddp_connection(self.global_rank, self.world_size) - # TODO: Move this somewhere else + # TODO: we moved it to the trainer.fit after calling pre_training + # ... need to double check that it is the correct place # self.trainer.call_setup_hook(self.model) # on world_size=0 let everyone know training is starting @@ -603,10 +603,10 @@ def new_process(self, process_idx, trainer): # set up server using proc 0's ip address # try to init for 20 times at max in case ports are taken # where to store ip_table - # TODO: CHeck is_slurm_managing_tasks self.init_ddp_connection(self.global_rank, self.world_size) - # TODO: Move this somewhere else + # TODO: we moved it to the trainer.fit after calling pre_training + # ... 
need to double check that it is the correct place # self.trainer.call_setup_hook(self.model) # on world_size=0 let everyone know training is starting diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 919042516ad50..e56856dfb2b4f 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -81,7 +81,7 @@ def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]: return optimizers, lr_schedulers, optimizer_frequencies - def convert_to_lightning_optimizers(self): + def convert_to_lightning_optimizers(self, optimizers): def _convert_to_lightning_optimizer(trainer, optimizer): if not isinstance(optimizer, LightningOptimizer): optimizer = LightningOptimizer(optimizer) @@ -89,7 +89,8 @@ def _convert_to_lightning_optimizer(trainer, optimizer): return optimizer if self._enable_pl_optimizer: - self.optimizers = [_convert_to_lightning_optimizer(self, opt) for opt in self.optimizers] + optimizers = [_convert_to_lightning_optimizer(self, opt) for opt in optimizers] + return optimizers def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): # Convert each scheduler into dict structure with relevant information diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index bb7559f503b25..e4a78704749fa 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -258,6 +258,10 @@ def match_env_arguments(cls) -> Namespace: def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: return add_argparse_args(cls, parent_parser) + @property + def gpus(self) -> Optional[Union[List[int], str, int]]: + return self.accelerator_connector.gpus + @property def num_gpus(self) -> int: return self.accelerator_connector.num_gpus @@ -357,15 +361,20 @@ def get_model(self): def lightning_module(self): return self.training_type_plugin.lightning_module - def __getstate__(self): - # unwrap optimizer - self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] - return self.__dict__ + @property + def optimizers(self): + return self.accelerator.optimizers - def __setstate__(self, d): - self.__dict__ = d - # wrap optimizers in enable_pl_optimzer is True - self.convert_to_lightning_optimizers() + # TODO: Do we need getstate / setstate? + # def __getstate__(self): + # # unwrap optimizer + # self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] + # return self.__dict__ + # + # def __setstate__(self, d): + # self.__dict__ = d + # # wrap optimizers in enable_pl_optimzer is True + # self.convert_to_lightning_optimizers() @property def require_distributed_sampler(self): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index fa1e853153853..a0d62d2a1104d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -537,6 +537,8 @@ def fit( # plugin will setup training (e.g. ddp will launch child processes) self.training_type_plugin.pre_training() + self.call_setup_hook(self.lightning_module) + # double dispatch: let the plugin initiate the training/test loop. 
if self.testing: self.training_type_plugin.start_testing(self) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 53d6f80d9d7bf..f3e1dabfb6e59 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -56,8 +56,8 @@ def test_trainer_callback_system(torch_save): call.on_init_start(trainer), call.on_init_end(trainer), call.on_before_accelerator_backend_setup(trainer, model), - call.setup(trainer, model, 'fit'), call.on_fit_start(trainer, model), + call.setup(trainer, model, 'fit'), call.on_pretrain_routine_start(trainer, model), call.on_pretrain_routine_end(trainer, model), call.on_sanity_check_start(trainer, model), @@ -110,11 +110,10 @@ def test_trainer_callback_system(torch_save): assert callback_mock.method_calls == [ call.on_init_start(trainer), call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), - call.setup(trainer, model, 'test'), call.on_fit_start(trainer, model), - call.on_pretrain_routine_start(trainer, model), - call.on_pretrain_routine_end(trainer, model), + call.setup(trainer, model, 'test'), + # call.on_pretrain_routine_start(trainer, model), + # call.on_pretrain_routine_end(trainer, model), call.on_test_start(trainer, model), call.on_test_epoch_start(trainer, model), call.on_test_batch_start(trainer, model, ANY, 0, 0), diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 0565ba594179f..72f0790ca3df3 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -356,8 +356,8 @@ def teardown(self, stage: str): expected = [ 'on_fit_start', - 'on_pretrain_routine_start', - 'on_pretrain_routine_end', + # 'on_pretrain_routine_start', + # 'on_pretrain_routine_end', 'on_test_model_eval', 'on_test_start', 'on_test_epoch_start', From df5ac30ba7450123d873abb1ec33deae534d79f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 13:20:05 +0100 Subject: [PATCH 076/157] fix lightning optimizer does not pickle --- pytorch_lightning/trainer/properties.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index e4a78704749fa..f7daa1c44708c 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -365,16 +365,17 @@ def lightning_module(self): def optimizers(self): return self.accelerator.optimizers - # TODO: Do we need getstate / setstate? 
- # def __getstate__(self): - # # unwrap optimizer - # self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] - # return self.__dict__ - # - # def __setstate__(self, d): - # self.__dict__ = d - # # wrap optimizers in enable_pl_optimzer is True - # self.convert_to_lightning_optimizers() + # TODO: refactor this so that it can be done in LightningOptimizer + def __getstate__(self): + # unwrap optimizer + self.accelerator.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] + return self.__dict__ + + # TODO: refactor this so that it can be done in LightningOptimizer + def __setstate__(self, d): + self.__dict__ = d + # wrap optimizers if enable_pl_optimizer is True + self.accelerator.optimizers = self.convert_to_lightning_optimizers(self.optimizers) @property def require_distributed_sampler(self): From dcf917ad6f4c25ce71495c8247144684ccb0c793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 14:22:59 +0100 Subject: [PATCH 077/157] fix setting benchmark and deterministic option --- .../accelerators/accelerator_connector.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 3733fad589921..e89654416bbbe 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -118,6 +118,19 @@ def __init__( # NVIDIA setup # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) + # benchmarking + # TODO: should this be moved to GPU accelerator? + torch.backends.cudnn.benchmark = self.benchmark + + # determinism for cudnn + # TODO: should this be moved to GPU accelerator? 
+ torch.backends.cudnn.deterministic = deterministic + if deterministic: + # fixing non-deterministic part of horovod + # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 + os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0) + + # TODO: move this to TPU accelerator/plugin self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE") self.replace_sampler_ddp = replace_sampler_ddp From 272f088581fa34b07ada2cd03c8ae97cd9d523fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Dec 2020 14:49:13 +0100 Subject: [PATCH 078/157] fix slurm amp test --- .../cluster_environments/slurm_environment.py | 4 ++-- tests/models/test_amp.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 870119414d27b..50da4bc42d5dc 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -32,7 +32,7 @@ def master_address(self): else: root_node = "127.0.0.1" - root_node = self._resolve_root_node_address(root_node) + root_node = self.resolve_root_node_address(root_node) os.environ["MASTER_ADDR"] = root_node log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") return root_node @@ -70,7 +70,7 @@ def world_size(self): def local_rank(self): return int(os.environ['SLURM_LOCALID']) - def _resolve_root_node_address(self, root_node): + def resolve_root_node_address(self, root_node): if '[' in root_node: name, numbers = root_node.split('[', maxsplit=1) number = numbers.split(',', maxsplit=1)[0] diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 55d32cc662701..ed2aa1ac99031 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -20,6 +20,8 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.cluster_environments import SLURMEnvironment +from pytorch_lightning.loggers import WandbLogger from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -138,10 +140,11 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): assert trainer.state == TrainerState.FINISHED, 'amp + ddp model failed to complete' # test root model address - assert trainer.slurm_connector.resolve_root_node_address('abc') == 'abc' - assert trainer.slurm_connector.resolve_root_node_address('abc[23]') == 'abc23' - assert trainer.slurm_connector.resolve_root_node_address('abc[23-24]') == 'abc23' - assert trainer.slurm_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' + assert isinstance(trainer.accelerator_connector.cluster_environment, SLURMEnvironment) + assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc') == 'abc' + assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' + assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23-24]') == 'abc23' + assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) From 45294760f8f52fa10dfcb1673773829fbcc7b382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 
11:26:58 +0100 Subject: [PATCH 079/157] fix prepare_data test and determine node_rank --- .../accelerators/data_parallel.py | 51 +++---------------- .../cluster_environment.py | 7 ++- .../cluster_environments/slurm_environment.py | 3 ++ .../torchelastic_environment.py | 17 ++++++- pytorch_lightning/trainer/properties.py | 5 ++ tests/core/test_datamodules.py | 28 ++++++---- 6 files changed, 52 insertions(+), 59 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index dcc6e4b139406..86ce580fdff79 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -90,23 +90,6 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') - def determine_local_rank(self): - return int(os.environ.get('LOCAL_RANK', 0)) - - def determine_node_rank(self): - # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK. - # otherwise use given node rank or default to node rank 0 - env_vars = ['NODE_RANK', 'GROUP_RANK'] - node_ids = [(k, os.environ.get(k, None)) for k in env_vars] - node_ids = [(k, v) for k, v in node_ids if v is not None] - if len(node_ids) == 0: - return 0 - if len(node_ids) > 1: - log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the first one.") - k, rank = node_ids.pop() - rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") - return int(rank) - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: return should_stop @@ -313,6 +296,7 @@ def __init__( self._ddp_kwargs = kwargs self._has_spawned_children = False self.task_idx = None + self.node_rank = 0 self.num_processes = len(parallel_devices) @property @@ -332,18 +316,6 @@ def distributed_sampler_kwargs(self): ) return distributed_sampler_kwargs - def determine_local_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_LOCALID']) - else: - return super().determine_node_rank() - - def determine_node_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_NODEID']) - else: - return super().determine_node_rank() - def setup(self, model): self._model = model @@ -436,8 +408,8 @@ def _check_can_spawn_children(self): def set_world_ranks(self): self.local_rank = self.task_idx - # TODO: check from where we get node_rank and num_processes - self.global_rank = self.determine_node_rank() * self.num_processes + self.task_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank * self.num_processes + self.local_rank self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): @@ -549,6 +521,7 @@ def __init__( self._ddp_kwargs = kwargs self.dist = LightningDistributed() self.num_processes = len(parallel_devices) + self.node_rank = 0 self.mp_queue = None @property @@ -579,8 +552,8 @@ def setup(self, model): def set_world_ranks(self, process_idx): self.local_rank = process_idx - # check from where we get node_rank, num_processes and num_nodes - self.global_rank = self.determine_node_rank() * self.num_processes + process_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank * self.num_processes + self.local_rank self.world_size = self.num_nodes * self.num_processes def start_training(self, trainer): @@ -704,18 +677,6 @@ def 
__recover_child_process_weights(self, best_path, last_path): ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) self.lightning_module.load_state_dict(ckpt) - def determine_local_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_LOCALID']) - else: - return super().determine_node_rank() - - def determine_node_rank(self): - if self.is_slurm_managing_tasks: - return int(os.environ['SLURM_NODEID']) - else: - return super().determine_node_rank() - def barrier(self, *args, **kwargs): if torch_distrib.is_initialized(): torch_distrib.barrier() diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index 5196e44411082..6de290cd63ee9 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -26,8 +26,11 @@ def master_address(self): def master_port(self): pass - def world_size(self): + def world_size(self) -> int: return self._world_size - def local_rank(self): + def local_rank(self) -> int: + pass + + def node_rank(self) -> int: pass diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 50da4bc42d5dc..9710d654dff0d 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -70,6 +70,9 @@ def world_size(self): def local_rank(self): return int(os.environ['SLURM_LOCALID']) + def node_rank(self): + return int(os.environ['SLURM_NODEID']) + def resolve_root_node_address(self, root_node): if '[' in root_node: name, numbers = root_node.split('[', maxsplit=1) diff --git a/pytorch_lightning/cluster_environments/torchelastic_environment.py b/pytorch_lightning/cluster_environments/torchelastic_environment.py index 5c14ea49b4cd0..89fd4ebb2cee0 100644 --- a/pytorch_lightning/cluster_environments/torchelastic_environment.py +++ b/pytorch_lightning/cluster_environments/torchelastic_environment.py @@ -16,7 +16,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn, rank_zero_info class TorchElasticEnvironment(ClusterEnvironment): @@ -50,3 +50,18 @@ def world_size(self): def local_rank(self): return int(os.environ['LOCAL_RANK']) + + def node_rank(self): + # TODO: use GROUP_RANK and provide a default environment class that uses NODE_RANK + # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK. + # otherwise use given node rank or default to node rank 0 + env_vars = ['NODE_RANK', 'GROUP_RANK'] + node_ids = [(k, os.environ.get(k, None)) for k in env_vars] + node_ids = [(k, v) for k, v in node_ids if v is not None] + if len(node_ids) == 0: + return 0 + if len(node_ids) > 1: + log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. 
Using the first one.") + k, rank = node_ids.pop() + rank_zero_info(f"Using environment variable {k} for node rank ({rank}).") + return int(rank) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index f7daa1c44708c..1982154b1ecf9 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -99,6 +99,11 @@ def local_rank(self): # some training types define a local rank return getattr(self.accelerator.training_type_plugin, "local_rank", 0) + @property + def node_rank(self): + # some training types define a local rank + return getattr(self.accelerator.training_type_plugin, "node_rank", 0) + @property def world_size(self): # some training types define a world size diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 9817e3c85a7e0..45a5c177d58fa 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -13,8 +13,9 @@ # limitations under the License. import pickle from argparse import ArgumentParser +from unittest import mock +from unittest.mock import MagicMock, PropertyMock from typing import Any, Dict -from unittest.mock import MagicMock import pytest import torch @@ -26,7 +27,9 @@ from tests.base.develop_utils import reset_seed -def test_can_prepare_data(tmpdir): +@mock.patch("pytorch_lightning.trainer.trainer.Trainer.node_rank", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.trainer.trainer.Trainer.local_rank", new_callable=PropertyMock) +def test_can_prepare_data(local_rank, node_rank): dm = BoringDataModule() trainer = Trainer() @@ -36,33 +39,36 @@ def test_can_prepare_data(tmpdir): # prepare_data_per_node = True # local rank = 0 (True) trainer.prepare_data_per_node = True - trainer.local_rank = 0 + + local_rank.return_value = 0 + assert trainer.local_rank == 0 assert trainer.data_connector.can_prepare_data() # local rank = 1 (False) - trainer.local_rank = 1 + local_rank.return_value = 1 + assert trainer.local_rank == 1 assert not trainer.data_connector.can_prepare_data() # prepare_data_per_node = False (prepare across all nodes) # global rank = 0 (True) trainer.prepare_data_per_node = False - trainer.node_rank = 0 - trainer.local_rank = 0 + node_rank.return_value = 0 + local_rank.return_value = 0 assert trainer.data_connector.can_prepare_data() # global rank = 1 (False) - trainer.node_rank = 1 - trainer.local_rank = 0 + node_rank.return_value = 1 + local_rank.return_value = 0 assert not trainer.data_connector.can_prepare_data() - trainer.node_rank = 0 - trainer.local_rank = 1 + node_rank.return_value = 0 + local_rank.return_value = 1 assert not trainer.data_connector.can_prepare_data() # 2 dm # prepar per node = True # local rank = 0 (True) trainer.prepare_data_per_node = True - trainer.local_rank = 0 + local_rank.return_value = 0 # is_overridden prepare data = True # has been called From 5319b0fefc916f82f6232339829c044e7d72ecec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 11:40:27 +0100 Subject: [PATCH 080/157] fix retrieving last path when testing --- pytorch_lightning/accelerators/data_parallel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 86ce580fdff79..a71051b5792b5 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -672,8 +672,7 @@ def __recover_child_process_weights(self, best_path, last_path): 
# todo, pass also best score # load last weights - # TODO: How to get self.trainer.testing? - if last_path is not None: # and not self.trainer.testing: + if last_path is not None and not self.lightning_module.trainer.testing: ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) self.lightning_module.load_state_dict(ckpt) From 3b54cfb2128a1b122b038fbf21b2da516c8ae3b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 11:41:52 +0100 Subject: [PATCH 081/157] remove obsolete plugin argument --- pytorch_lightning/accelerators/accelerator_connector.py | 1 - pytorch_lightning/accelerators/data_parallel.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index e89654416bbbe..224eed99b8863 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -231,7 +231,6 @@ def select_training_type_plugin(self): parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, cluster_environment=cluster_environment, - is_slurm_managing_tasks=self.is_slurm_managing_tasks, sync_batchnorm=self.sync_batchnorm, ) elif self.use_dp: diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index a71051b5792b5..7ec9f3b82f0cf 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -283,14 +283,12 @@ def __init__( parallel_devices, num_nodes=1, cluster_environment=None, - is_slurm_managing_tasks=False, sync_batchnorm=False, **kwargs: Dict[str, Any], ) -> None: super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.interactive_ddp_procs = [] self.num_nodes = num_nodes - self.is_slurm_managing_tasks = is_slurm_managing_tasks self.sync_batchnorm = sync_batchnorm self.dist = LightningDistributed() self._ddp_kwargs = kwargs @@ -510,13 +508,11 @@ def __init__( parallel_devices, num_nodes=1, cluster_environment=None, - is_slurm_managing_tasks=False, sync_batchnorm=False, **kwargs: Dict[str, Any] ): super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) self.num_nodes = num_nodes - self.is_slurm_managing_tasks = is_slurm_managing_tasks self.sync_batchnorm = sync_batchnorm self._ddp_kwargs = kwargs self.dist = LightningDistributed() From 6540b8785f530ef728c99181526e1dc9b99ef6fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 12:04:48 +0100 Subject: [PATCH 082/157] fix test: test_trainer_config --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 224eed99b8863..181783d268f2f 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -148,7 +148,8 @@ def tpu_id(self): @property def on_gpu(self): - return self.parallel_device_ids and torch.cuda.is_available() + gpus = self.parallel_device_ids + return gpus is not None and len(gpus) > 0 and torch.cuda.is_available() @property def num_gpus(self) -> int: @@ -335,6 +336,7 @@ def set_distributed_mode(self): rank_zero_warn( "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." 
) + self.parallel_device_ids = None self.use_ddp = True # HOROVOD From 6b450e165485f735b46d2a050eefaeb2ff9de7a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 15:34:51 +0100 Subject: [PATCH 083/157] fix torchscript tests --- pytorch_lightning/core/lightning.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 33d206b6bc49d..7d4fa62286062 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -66,6 +66,8 @@ class LightningModule( "on_gpu", "current_epoch", "global_step", + "global_rank", + "local_rank", ] + DeviceDtypeModuleMixin.__jit_unused_properties__ def __init__(self, *args, **kwargs): From 4ef539f2b7b87aa716daf71586da09d4fb9511e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 16:11:56 +0100 Subject: [PATCH 084/157] fix trainer.model access --- pytorch_lightning/trainer/properties.py | 9 +++++++++ pytorch_lightning/trainer/trainer.py | 9 +-------- tests/base/develop_pipelines.py | 11 ++--------- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 1982154b1ecf9..8c4a64d128635 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -357,6 +357,15 @@ def checkpoint_callbacks(self) -> List[ModelCheckpoint]: def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) + @property + def model(self): + """ + The LightningModule, but possibly wrapped into DataParallel or DistributedDataParallel. + To access the pure LightningModule, use + :meth:`~pytorch_lightning.trainer.trainer.Trainer.lightning_module` instead. + """ + return self.accelerator.model + def get_model(self): # TODO: rename this to lightning_module (see training type plugin) # backward compatible diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a0d62d2a1104d..5ed45df5eaf8b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -353,7 +353,7 @@ def __init__( self.train_loop = TrainLoop(self, multiple_trainloader_mode) # training state - self.model = None + self.weights_summary = weights_summary self.shown_warnings = set() # init callbacks @@ -591,11 +591,6 @@ def pre_training_routine(self): else: raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES)) - # TODO: what the heck is this - # track model now. 
- # if cluster resets state, the model will update with the saved weights - # self.trainer.model = model - # restore training and model before hpc is called self.checkpoint_connector.restore_weights(ref_model) @@ -920,7 +915,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): self.tested_ckpt_path = ckpt_path self.testing = True os.environ["PL_TESTING_MODE"] = "1" - self.model = model results = self.fit(model) self.testing = False del os.environ["PL_TESTING_MODE"] @@ -941,7 +935,6 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval self.testing = True - self.model = model results = self.fit(model) self.testing = False diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index 4949d53fc9a50..71747c21bf989 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -44,11 +44,6 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 for dataloader in test_loaders: run_prediction(pretrained_model, dataloader, min_acc=min_acc) - if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN): - # on hpc this would work fine... but need to hack it for the purpose of the test - trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers() - def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True, min_acc: float = 0.25): @@ -84,10 +79,8 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, if with_hpc: if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2): # on hpc this would work fine... but need to hack it for the purpose of the test - trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers( - pretrained_model - ) + trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \ + trainer.init_optimizers(pretrained_model) # test HPC saving trainer.checkpoint_connector.hpc_save(save_dir, logger) From 1001ccfa581d5301cb9199fe4294d3248581e335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 16:23:59 +0100 Subject: [PATCH 085/157] move properties --- pytorch_lightning/trainer/properties.py | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 8c4a64d128635..62241722ff365 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -379,6 +379,38 @@ def lightning_module(self): def optimizers(self): return self.accelerator.optimizers + @optimizers.setter + def optimizers(self, new_optims): + self.accelerator.optimizers = new_optims + + @property + def lr_schedulers(self): + return self.accelerator.lr_schedulers + + @lr_schedulers.setter + def lr_schedulers(self, new_schedulers): + self.accelerator.lr_schedulers = new_schedulers + + @property + def optimizer_frequencies(self): + return self.accelerator.optimizer_frequencies + + @optimizer_frequencies.setter + def optimizer_frequencies(self, new_freqs): + self.accelerator.optimizer_frequencies = new_freqs + + @property + def amp_backend(self): + return self.accelerator.amp_backend + + @property + def precision(self): + return self.accelerator.precision + + @property + def scaler(self): + return self.accelerator.scaler + # TODO: 
refactor this so that it can be done in LightningOptimizer def __getstate__(self): # unwrap optimizer From 38a1d0fc3bde969b5f4b18c589cfae7e91396dc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 16:44:58 +0100 Subject: [PATCH 086/157] fix test_transfer_batch_hook --- tests/models/test_hooks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 72f0790ca3df3..b2491389135f2 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect +from unittest import mock from unittest.mock import MagicMock import pytest import torch +from unittest.mock import PropertyMock from pytorch_lightning import Trainer from pytorch_lightning.trainer.states import TrainerState @@ -90,7 +92,8 @@ def training_epoch_end(self, outputs): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_transfer_batch_hook(): +@mock.patch("pytorch_lightning.accelerators.accelerator.NewAccelerator.lightning_module", new_callable=PropertyMock) +def test_transfer_batch_hook(model_getter_mock): class CustomBatch: @@ -116,7 +119,7 @@ def transfer_batch_to_device(self, data, device): trainer = Trainer(gpus=1) # running .fit() would require us to implement custom data loaders, we mock the model reference instead - trainer.get_model = MagicMock(return_value=model) + model_getter_mock.return_value = model batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) expected = torch.device('cuda', 0) assert model.hook_called From 46cf7effbf13980d8f3886945c53940d414da676 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 17:16:20 +0100 Subject: [PATCH 087/157] fix auto_select_gpus --- pytorch_lightning/accelerators/accelerator_connector.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 181783d268f2f..efce11ab4bae6 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -20,6 +20,7 @@ from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin, DDP2Plugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin +from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info @@ -91,18 +92,16 @@ def __init__( if "LOCAL_RANK" in os.environ: rank_zero_only.rank = int(os.environ["LOCAL_RANK"]) - # TODO: Move autoselect GPUS to other place # for gpus allow int, string and gpu list - # if auto_select_gpus and isinstance(gpus, int): - # self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus) + if auto_select_gpus and isinstance(gpus, int): + self.gpus = pick_multiple_gpus(gpus) + self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus) self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids) - # self.root_device = torch.device("cpu") 
self.set_distributed_mode() self.configure_slurm_ddp() - # todo: select accelerator based on trainer flags self.accelerator = self.select_accelerator() # override dist backend when using tpus From 258f50e275b904ac755530f942c7ff6fb379cbb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 17:38:48 +0100 Subject: [PATCH 088/157] fix omegaconf test --- pytorch_lightning/utilities/device_parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 9417bc13e8e8b..ce81ef0222b9e 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -14,6 +14,7 @@ from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch +from typing import Union, Any, List, Optional, Tuple, MutableSequence from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -148,7 +149,7 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int], Tuple[int, ...]]) -> Optional[List[int]]: assert gpus is not None - if isinstance(gpus, (list, tuple)): + if isinstance(gpus, (MutableSequence, tuple)): return list(gpus) # must be an int @@ -177,7 +178,7 @@ def _check_data_type(device_ids: Any) -> None: device_ids: gpus/tpu_cores parameter as passed to the Trainer """ if device_ids is not None and \ - (not isinstance(device_ids, (int, str, list, tuple)) or isinstance(device_ids, bool)): + (not isinstance(device_ids, (int, str, MutableSequence, tuple)) or isinstance(device_ids, bool)): raise MisconfigurationException("Device ID's (GPU/TPU) must be int, string or sequence of ints or None.") From a5d69b9a20fc2656eb24cf3f66d9ab747b13e63f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 27 Dec 2020 18:10:27 +0100 Subject: [PATCH 089/157] fix test that needs to simulate slurm ddp --- tests/models/test_amp.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index ed2aa1ac99031..d80077f3855b9 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -109,11 +109,17 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@mock.patch.dict(os.environ, { + "SLURM_NTASKS": "1", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0" +}) def test_amp_gpu_ddp_slurm_managed(tmpdir): """Make sure DDP + AMP work.""" # simulate setting slurm flags tutils.set_random_master_port() - os.environ['SLURM_LOCALID'] = str(0) model = EvalModelTemplate() @@ -133,18 +139,17 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): callbacks=[checkpoint], logger=logger, ) - trainer.is_slurm_managing_tasks = True - trainer.fit(model) + result = trainer.fit(model) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, 'amp + ddp model failed to complete' # test root model address - assert isinstance(trainer.accelerator_connector.cluster_environment, SLURMEnvironment) - assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc') == 'abc' - assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' - assert 
trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23-24]') == 'abc23' - assert trainer.accelerator_connector.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc') == 'abc' + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24]') == 'abc23' + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) From 88a7ed5d31f5f1e5a5a1e1c3edbb3e151aac5a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 29 Dec 2020 21:14:10 +0100 Subject: [PATCH 090/157] add horovod plugin --- pytorch_lightning/accelerators/accelerator.py | 13 +- .../accelerators/accelerator_connector.py | 7 +- .../accelerators/data_parallel.py | 160 +++++++++++++++++- tests/models/test_horovod.py | 2 +- 4 files changed, 170 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8c1bfdc9301cb..465ed3dd237e5 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin +from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin, HorovodPlugin from pytorch_lightning.accelerators.base_plugin import Plugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE, AMPType @@ -106,12 +106,17 @@ def process_dataloader(self, dataloader): return dataloader def backward(self, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs): - return self.precision_plugin.backward( + output = self.precision_plugin.backward( self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs ) - def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): + # TODO: this is a hack, find a better solution for this (hook?) 
+ if isinstance(self.training_type_plugin, HorovodPlugin): + optimizer.synchronize() + + return output + def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_closure): model_ref = self.lightning_module is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) native_amp = ( @@ -119,6 +124,7 @@ def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_cl ) self.precision_plugin.pre_optimizer_step(optimizer, opt_idx) + self.training_type_plugin.pre_optimizer_step(optimizer, opt_idx) # model hook res = model_ref.optimizer_step( @@ -133,6 +139,7 @@ def optimizer_step(self, optimizer, current_epoch, batch_idx, opt_idx, lambda_cl ) self.precision_plugin.post_optimizer_step(optimizer, opt_idx) + self.training_type_plugin.post_optimizer_step(optimizer, opt_idx) return res def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index efce11ab4bae6..825ea25a354fa 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,7 +18,7 @@ from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin + DataParallelPlugin, DDP2Plugin, HorovodPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser @@ -236,7 +236,7 @@ def select_training_type_plugin(self): elif self.use_dp: plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) elif self.use_horovod: - raise NotImplementedError + plugin = HorovodPlugin(parallel_devices=self.parallel_devices) else: plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) return plugin @@ -364,7 +364,10 @@ def _set_horovod_backend(self): hvd.init() if self.on_gpu: # Horovod assigns one local GPU per process + self.parallel_device_ids = list(range(hvd.local_size())) self.root_gpu = hvd.local_rank() + else: + self.num_processes = hvd.local_size() def check_horovod(self): """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py index 7ec9f3b82f0cf..02a748222732e 100644 --- a/pytorch_lightning/accelerators/data_parallel.py +++ b/pytorch_lightning/accelerators/data_parallel.py @@ -1,8 +1,12 @@ from abc import ABC, abstractmethod import re -from contextlib import contextmanager +from contextlib import contextmanager, ExitStack -from pytorch_lightning.cluster_environments import TorchElasticEnvironment +from torch.optim.lr_scheduler import _LRScheduler + +from pytorch_lightning.cluster_environments import TorchElasticEnvironment, ClusterEnvironment +from pytorch_lightning.core.optimizer import LightningOptimizer +from pytorch_lightning.utilities import HOROVOD_AVAILABLE from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.accelerators.base_plugin import Plugin @@ -26,6 +30,9 @@ import torch.multiprocessing as mp from 
pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn, rank_zero_info +if HOROVOD_AVAILABLE: + import horovod.torch as hvd + try: from hydra.utils import to_absolute_path, get_original_cwd from hydra.core.hydra_config import HydraConfig @@ -166,7 +173,11 @@ def broadcast(self, obj: object, src: int = 0) -> object: class ParallelPlugin(TrainingTypePlugin, ABC): - def __init__(self, parallel_devices: List[torch.device], cluster_environment=None): + def __init__( + self, + parallel_devices: List[torch.device], + cluster_environment: Optional[ClusterEnvironment] = None, + ): super().__init__() self.parallel_devices = parallel_devices self.local_rank = 0 @@ -240,6 +251,9 @@ def block_backward_sync(self): class DataParallelPlugin(ParallelPlugin): + def __init__(self, parallel_devices: List[torch.device]): + super().__init__(parallel_devices=parallel_devices, cluster_environment=None) + def setup(self, model): self._model = LightningDataParallel(model, self.parallel_devices) @@ -282,7 +296,7 @@ def __init__( self, parallel_devices, num_nodes=1, - cluster_environment=None, + cluster_environment: ClusterEnvironment = None, sync_batchnorm=False, **kwargs: Dict[str, Any], ) -> None: @@ -507,7 +521,7 @@ def __init__( self, parallel_devices, num_nodes=1, - cluster_environment=None, + cluster_environment: ClusterEnvironment = None, sync_batchnorm=False, **kwargs: Dict[str, Any] ): @@ -690,6 +704,140 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ return output -# TODO: DDP2 (?), HOROVOD DDP AND HPC DDP +# TODO: DDP2 class DDP2Plugin(DDPPlugin): pass + + +class HorovodPlugin(ParallelPlugin): + + def __init__(self, parallel_devices: List[torch.device]): + super().__init__(parallel_devices=parallel_devices, cluster_environment=None) + + @property + def root_device(self): + return self.parallel_devices[self.local_rank] + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=hvd.size(), + rank=hvd.rank() + ) + return distributed_sampler_kwargs + + def setup(self, model): + self._model = model + + self.global_rank = hvd.rank() + self.local_rank = hvd.local_rank() + rank_zero_only.rank = self.global_rank + + self.model_to_device() + + def pre_training(self): + + def _unpack_lightning_optimizer(opt): + return opt._optimizer if isinstance(opt, LightningOptimizer) else opt + + optimizers = self.lightning_module.trainer.optimizers + optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers] + + # Horovod: scale the learning rate by the number of workers to account for + # increased total batch size + for optimizer in optimizers: + for param_group in optimizer.param_groups: + param_group['lr'] *= hvd.size() + + # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR + lr_schedulers = self.lightning_module.trainer.lr_schedulers + for scheduler in lr_schedulers: + scheduler = scheduler['scheduler'] + if isinstance(scheduler, _LRScheduler): + scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs] + + # Horovod: broadcast parameters & optimizer state to ensure consistent initialization + hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0) + for optimizer in optimizers: + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + def _filter_named_parameters(model, optimizer): + opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])]) + return [(name, p) for name, p in model.named_parameters() if p in 
opt_params] + + # Horovod: wrap optimizers to perform gradient aggregation via allreduce + optimizers = [ + hvd.DistributedOptimizer(optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer)) + for optimizer in optimizers + ] + + optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) + self.lightning_module.trainer.optimizers = optimizers + + def start_training(self, trainer): + with ExitStack() as stack: + for optimizer in trainer.optimizers: + # Synchronization will be performed explicitly following backward() + stack.enter_context(optimizer.skip_synchronize()) + + # set up training routine + self._results = trainer.train() + + # Make sure all workers have finished training before returning to the user + hvd.join() + + def start_testing(self, trainer): + with ExitStack() as stack: + # set up training routine + # self.trainer.train_loop.setup_training(self.trainer.model) + self._results = trainer.run_test() + + # Make sure all workers have finished training before returning to the user + hvd.join() + + def barrier(self, *args, **kwargs): + hvd.join() + + def broadcast(self, obj: object, src: int = 0) -> object: + obj = hvd.broadcast_object(obj, src) + return obj + + def model_to_device(self): + if self.on_gpu: + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if group is not None: + raise ValueError( + "Horovod does not support allreduce using a subcommunicator at this time. " + "Unset `group`." + ) + + if reduce_op is None or reduce_op == "sum": + reduce_op = hvd.Sum + elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): + reduce_op = hvd.Average + else: + raise ValueError(f"unrecognized `reduce_op`: {reduce_op}") + + # sync all processes before reduction + hvd.join() + return hvd.allreduce(output, op=reduce_op) + + def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): + if group is not None: + raise ValueError( + "Horovod does not support allgather using a subcommunicator at this time. " + "Unset `group`." 
+ ) + + if len(result.shape) == 0: + # Convert scalars to single dimension tensors + result = result.reshape(1) + + # sync and gather all + hvd.join() + gathered = hvd.allgather(result) + gathered_result = list(gathered.split(1, dim=0)) + return gathered_result diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 6b2eaef1f1da8..623f329035533 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -317,7 +317,7 @@ def _compute_batch(): metric = Accuracy(compute_on_step=True, dist_sync_on_step=True, - dist_sync_fn=trainer.accelerator_backend.gather_all_tensors, + dist_sync_fn=trainer.training_type_plugin.gather_all_tensors, threshold=threshold) for i in range(hvd.rank(), num_batches, hvd.size()): From 40daa41def2f77a5760470b0fed813397e58e629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 29 Dec 2020 21:33:32 +0100 Subject: [PATCH 091/157] fix test with named arguments --- tests/core/test_lightning_module.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 9d45310a1de54..f2936c7f19d55 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -117,15 +117,15 @@ def configure_optimizers(self): optimizer_2 = Adam(self.layer.parameters(), lr=0.1) return [optimizer, optimizer_2] - def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, - on_tpu=False, using_native_amp=False, using_lbfgs=False): - # warm up lr - if self.trainer.global_step < 500: - lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) - for pg in optimizer.param_groups: - pg['lr'] = lr_scale * 0.01 - - optimizer.step(closure=closure) + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, + on_tpu=False, using_native_amp=False, using_lbfgs=False): + # warm up lr + if self.trainer.global_step < 500: + lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) 
+ for pg in optimizer.param_groups: + pg['lr'] = lr_scale * 0.01 + + optimizer.step(closure=optimizer_closure) model = TestModel() model.training_epoch_end = None From 96fc074d017c478aa5c578e0da70464d6dc9c683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 30 Dec 2020 00:12:23 +0100 Subject: [PATCH 092/157] clean up whitespace --- pytorch_lightning/accelerators/accelerator.py | 13 ++++++------- .../accelerators/accelerator_connector.py | 2 -- pytorch_lightning/trainer/trainer.py | 9 --------- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 465ed3dd237e5..07777d982b2d6 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,8 +1,10 @@ -from pytorch_lightning.accelerators.data_parallel import ParallelPlugin, TrainingTypePlugin, HorovodPlugin -from pytorch_lightning.accelerators.base_plugin import Plugin +import os + +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.data_parallel import TrainingTypePlugin, HorovodPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE, AMPType -from typing import Any, Union +from pytorch_lightning.utilities import AMPType +from typing import Any import math import torch @@ -159,8 +161,6 @@ def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val <= 0: return - model = self.lightning_module - # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX # if self.trainer.amp_backend == AMPType.APEX: @@ -215,7 +215,6 @@ def connect_training_type_plugin(self, plugin: TrainingTypePlugin, model: Lightn def connect_precision_plugin(self, plugin: PrecisionPlugin): model, optimizers, schedulers = plugin.connect(self.model, self.optimizers, self.lr_schedulers) - self.model = model self.optimizers = optimizers self.schedulers = schedulers diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 825ea25a354fa..7addf4bdd72c2 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union import os import torch @@ -272,7 +271,6 @@ def select_cluster_environment(self): return env def set_distributed_mode(self): - # No distributed backend if self.distributed_backend is None: # horovod multi GPU diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5ed45df5eaf8b..382b6e3c5ae8e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -520,14 +520,6 @@ def fit( self.accelerator_backend.setup(self, model) self.train_loop.setup_training(model) - # ---------------------------- - # INSPECT THESE FOR MAIN LOOPS - # ---------------------------- - # assign training and eval functions... 
inspect these to see the train and eval loops :) - # self.accelerator_backend.train_loop = self.train - # self.accelerator_backend.validation_loop = self.run_evaluation - # self.accelerator_backend.test_loop = self.run_evaluation - # ---------------------------- # TRAIN # ---------------------------- @@ -562,7 +554,6 @@ def fit( # return 1 when finished # used for testing or when we need to know that training succeeded - if self._state != TrainerState.INTERRUPTED: self._state = TrainerState.FINISHED return results or 1 From 210831ab6bd86d661d16e296a3ee107dcd0c9b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 30 Dec 2020 00:21:16 +0100 Subject: [PATCH 093/157] fix datamodules test --- tests/core/test_datamodules.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 45a5c177d58fa..7796c9c074d6e 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -23,6 +23,7 @@ from pytorch_lightning import LightningDataModule, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import TrainerState +from pytorch_lightning.utilities.model_helpers import is_overridden from tests.base import BoringDataModule, BoringModel from tests.base.develop_utils import reset_seed @@ -397,7 +398,8 @@ def test_full_loop_dp(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_dm_transfer_batch_to_device(tmpdir): +@mock.patch("pytorch_lightning.accelerators.accelerator.NewAccelerator.lightning_module", new_callable=PropertyMock) +def test_dm_transfer_batch_to_device(get_module_mock): class CustomBatch: def __init__(self, data): self.samples = data[0] @@ -420,9 +422,9 @@ def transfer_batch_to_device(self, data, device): trainer = Trainer(gpus=1) # running .fit() would require us to implement custom data loaders, we mock the model reference instead - trainer.get_model = MagicMock(return_value=model) - - model.transfer_batch_to_device = dm.transfer_batch_to_device + get_module_mock.return_value = model + if is_overridden('transfer_batch_to_device', dm): + model.transfer_batch_to_device = dm.transfer_batch_to_device batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) expected = torch.device('cuda', 0) From 98b6dd4569806a2fd45462888da795813d51f3fc Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 17:31:11 +0100 Subject: [PATCH 094/157] remove old accelerators --- .../accelerators/old/accelerator.py | 259 --------------- .../accelerators/old/ddp2_accelerator.py | 268 ---------------- .../old/ddp_cpu_hpc_accelerator.py | 48 --- .../old/ddp_cpu_spawn_accelerator.py | 297 ------------------ .../accelerators/old/dp_accelerator.py | 189 ----------- .../accelerators/old/gpu_accelerator.py | 108 ------- 6 files changed, 1169 deletions(-) delete mode 100644 pytorch_lightning/accelerators/old/accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/ddp2_accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/dp_accelerator.py delete mode 100644 pytorch_lightning/accelerators/old/gpu_accelerator.py diff --git a/pytorch_lightning/accelerators/old/accelerator.py b/pytorch_lightning/accelerators/old/accelerator.py deleted file mode 100644 index 
b16e0125054bb..0000000000000 --- a/pytorch_lightning/accelerators/old/accelerator.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import math -from enum import Enum -from pytorch_lightning.core.lightning import LightningModule -from typing import Any, Optional, Union - -import torch - -from pytorch_lightning.utilities import AMPType, rank_zero_warn -from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.parsing import AttributeDict -import torch.distributed as torch_distrib -from pytorch_lightning import _logger as log - -try: - from apex import amp -except ImportError: - amp = None - -if torch.distributed.is_available(): - from torch.distributed import ReduceOp -else: - - class ReduceOp: - SUM = None - - -EPSILON = 1e-6 -EPSILON_FP16 = 1e-5 - - -class Accelerator(object): - def __init__(self, trainer=None, cluster_environment=None, ddp_plugin=None): - self.trainer = trainer - self.nickname = None - self.cluster_environment = cluster_environment - self.dist = AttributeDict(rank=0, device=None) - self.ddp_plugin = ddp_plugin - - if trainer is not None: - self.train_loop = self.trainer.train - self.validation_loop = self.trainer.run_evaluation - self.test_loop = self.trainer.run_evaluation - - def setup(self, model): - pass - - def teardown(self): - # Ensure if necessary all processes are finished - self.barrier() - - def barrier(self, name: Optional[str] = None): - pass - - def broadcast(self, obj, src=0): - return obj - - def train_or_test(self): - if self.trainer.testing: - results = self.trainer.run_test() - else: - results = self.trainer.train() - return results - - def batch_to_device(self, batch: Any, device: torch.device): - model = self.trainer.get_model() - if model is not None: - return model.transfer_batch_to_device(batch, device) - return move_data_to_device(batch, device) - - def training_step_end(self, output): - return output - - def test_step_end(self, output): - return output - - def validation_step_end(self, output): - return output - - def process_dataloader(self, dataloader): - return dataloader - - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): - if self.trainer.precision == 16: - closure_loss = self.trainer.precision_connector.backend.backward( - closure_loss, optimizer, opt_idx, *args, **kwargs - ) - else: - # do backward pass - model = self.trainer.get_model() - model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) - - # once backward has been applied, release graph - closure_loss = closure_loss.detach() - return closure_loss - - def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): - model_ref = self.trainer.get_model() - is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = self.trainer.amp_backend == AMPType.NATIVE - - # native amp + lbfgs is a no go right now - if native_amp and is_lbfgs: 
- raise MisconfigurationException( - "native PyTorch amp and lbfgs are not compatible." - " To request, please file a Github issue in PyTorch and tag @mcarilli" - ) - - # model hook - model_ref.optimizer_step( - epoch=self.trainer.current_epoch, - batch_idx=batch_idx, - optimizer=optimizer, - optimizer_idx=opt_idx, - optimizer_closure=lambda_closure, - on_tpu=False, # TPUAccelerator class sets this as True - using_native_amp=native_amp, - using_lbfgs=is_lbfgs, - ) - - # scale when native amp - if native_amp: - self.trainer.scaler.update() - - def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): - model_ref = self.trainer.get_model() - model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) - - def clip_gradients(self, optimizer, clip_val=None): - # TODO: separate TPU case from here - self._clip_gradients(optimizer, clip_val) - - def _clip_gradients(self, optimizer, clip_val=None): - # use the trainer's clip val if none passed - grad_clip_val = self.trainer.gradient_clip_val - if clip_val is not None: - grad_clip_val = clip_val - grad_clip_val = float(grad_clip_val) - - # this code is a modification of torch.nn.utils.clip_grad_norm_ - # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md - if grad_clip_val <= 0: - return - - model = self.trainer.get_model() - if self.trainer.amp_backend == AMPType.APEX: - parameters = amp.master_params(optimizer) - else: - parameters = model.parameters() - - max_norm = grad_clip_val - norm_type = float(2.0) - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - - if norm_type == math.inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - else: - device = parameters[0].device - out = torch.empty(len(parameters), device=device) - for i, p in enumerate(parameters): - torch.norm(p.grad.data.to(device), norm_type, out=out[i]) - total_norm = torch.norm(out, norm_type) - - eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON - clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) - clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) - for p in parameters: - p.grad.data.mul_(clip_coef.to(p.grad.data.device)) - - def on_train_epoch_end(self, outputs): - pass - - def on_train_end(self): - pass - - def early_stopping_should_stop(self, pl_module): - return self.trainer.should_stop - - def setup_optimizers(self, model): - if self.trainer.testing is True: - return - - optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) - self.trainer.optimizers = optimizers - self.trainer.lr_schedulers = lr_schedulers - self.trainer.optimizer_frequencies = optimizer_frequencies - - def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: - os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.trainer.on_gpu else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - - def sync_tensor( - self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None 
- ) -> torch.Tensor: - """ - Function to reduce a tensor from several distributed processes to one aggregated tensor. - - Args: - tensor: the tensor to sync and reduce - group: the process group to gather results from. Defaults to all processes (world) - reduce_op: the reduction operation. Defaults to sum. - Can also be a string of 'avg', 'mean' to calculate the mean during reduction. - - Return: - reduced value - """ - raise NotImplementedError() - - def __getstate__(self): - return { - "trainer": self.trainer, - "nickname": self.nickname, - "cluster_environment": self.cluster_environment, - "dist": self.dist, - "ddp_plugin": self.ddp_plugin, - } - - def __setstate__(self, d): - self.trainer = d["trainer"] - self.nickname = d["nickname"] - self.cluster_environment = d["cluster_environment"] - self.dist = d["dist"] - self.ddp_plugin = d["ddp_plugin"] - - -# TODO: allow user to compare with string even internaly we shall use these Enum to prevent typos... -class BackendType(Enum): - DP = "dp" - DDP = "ddp" - DDP2 = "ddp2" - DDP_SPAWN = "ddp_spawn" - # decuple distrib and device - DDP_CPU = "ddp_cpu" - HOROVOD = "horovod" - # this is rather device - TPU = "tpu" diff --git a/pytorch_lightning/accelerators/old/ddp2_accelerator.py b/pytorch_lightning/accelerators/old/ddp2_accelerator.py deleted file mode 100644 index a5e8d720ce186..0000000000000 --- a/pytorch_lightning/accelerators/old/ddp2_accelerator.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -from typing import Any, List, Optional, Union - -import torch -import torch.distributed as torch_distrib -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.core.step_result import Result -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available - - -class DDP2Accelerator(Accelerator): - - def __init__(self, - trainer, - cluster_environment: Optional[ClusterEnvironment] = None, - ddp_plugin: Optional[DDPPlugin] = None): - """ - Runs training using DDP2 strategy on a cluster - - Example:: - - # default - trainer = Trainer(accelerator=DDP2Accelerator()) - - """ - super().__init__(trainer, cluster_environment, ddp_plugin) - self.task_idx = None - self.dist = LightningDistributed() - self.nickname = 'ddp2' - - def setup(self, model): - self.trainer.model = model - self.task_idx = self.cluster_environment.local_rank() - - def train(self): - model = self.trainer.model - return self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) - - def training_step(self, args): - return self._step(args) - - def validation_step(self, args): - return self._step(args) - - def test_step(self, args): - return self._step(args) - - def _step(self, args): - args = self.ddp_plugin.on_before_forward(self.trainer.get_model(), *args) - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def training_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - return output - - def validation_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - return output - - def test_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - return output - - def set_world_ranks(self, process_idx): - # Todo: required argument `process_idx` is not used - self.trainer.local_rank = self.trainer.node_rank - self.trainer.global_rank = self.trainer.node_rank - self.trainer.world_size = self.trainer.num_nodes - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def init_device(self, process_idx): - self.trainer.root_gpu = process_idx - torch.cuda.set_device(self.trainer.root_gpu) - - def model_to_device(self, model): - model.cuda(self.trainer.root_gpu) - - def get_device_ids(self): - device_ids = self.trainer.data_parallel_device_ids - return device_ids - - def ddp_train(self, process_idx, mp_queue, model): - """ - Entry point for ddp - - Args: - process_idx: current process rank - mp_queue: multiprocessing queue - model: pointer to current :class:`LightningModule` - - Returns: - Dict with evaluation results - - """ - # Todo: required argument `mp_queue` is not used - # show progressbar only on progress_rank 0 - if 
(self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # Initialize cuda device - self.init_device(process_idx) - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - if isinstance(self.ddp_plugin, RPCPlugin): - if not self.ddp_plugin.is_main_rpc_process: - self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer) - self.ddp_plugin.exit_rpc_process() - if self.ddp_plugin.return_after_exit_rpc_process: - return - else: - self.ddp_plugin.on_main_rpc_connection(self.trainer) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend}') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - self.ddp_plugin.on_after_setup_optimizers(self.trainer) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - self.trainer.convert_to_lightning_optimizers() - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - return results - - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. 
- - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return sync_ddp_if_available(tensor, group, reduce_op) - - def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): - """ - Function to gather a tensor from several distributed processes - - Args: - tensor: tensor of shape (batch, ...) - group: the process group to gather results from. Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op - - Return: - A tensor of shape (world_size, batch, ...) - """ - return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) - - def get_reference_model(self, model) -> LightningModule: - return self.ddp_plugin.get_model_from_plugin(model) - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=self.trainer.num_nodes, - rank=self.trainer.global_rank - ) - if self.ddp_plugin is not None: - distributed_sampler_kwargs = self.ddp_plugin.distributed_sampler_kwargs(distributed_sampler_kwargs) - return distributed_sampler_kwargs - - @property - def require_distributed_sampler(self): - return True diff --git a/pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py deleted file mode 100644 index 7db8e3defdb21..0000000000000 --- a/pytorch_lightning/accelerators/old/ddp_cpu_hpc_accelerator.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -from typing import Optional - -from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin - - -class DDPCPUHPCAccelerator(DDPHPCAccelerator): - - def __init__(self, - trainer, - cluster_environment: Optional[ClusterEnvironment] = None, - ddp_plugin: Optional[DDPPlugin] = None): - """ - Runs training using DDP (with CPUs) strategy on a cluster - - Example:: - - # default - trainer = Trainer(accelerator=DDPCPUHPCAccelerator()) - - """ - super().__init__(trainer, cluster_environment, ddp_plugin) - self.nickname = 'ddp_cpu' - - def model_to_device(self, model, process_idx): - # Todo: required argument `process_idx` is not used - model.cpu() - - def get_device_ids(self): - device_ids = None - return device_ids - - def init_device(self, process_idx): - pass diff --git a/pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py deleted file mode 100644 index b15b9e8062257..0000000000000 --- a/pytorch_lightning/accelerators/old/ddp_cpu_spawn_accelerator.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -import os -from typing import Any, List, Optional, Union - -import torch -import torch.distributed as torch_distrib -import torch.multiprocessing as mp -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import ( - all_gather_ddp_if_available, - find_free_network_port, - rank_zero_only, - rank_zero_warn, - sync_ddp_if_available, -) - - -class DDPCPUSpawnAccelerator(Accelerator): - - def __init__(self, - trainer, - nprocs: int, - cluster_environment: Optional[ClusterEnvironment] = None, - ddp_plugin: Optional[DDPPlugin] = None): - """ - Runs training using DDP (on a single machine or manually on multiple machines), using mp.spawn - - Example:: - - # default - trainer = Trainer(accelerator=DDPCPUSpawnAccelerator()) - - """ - super().__init__(trainer, cluster_environment, ddp_plugin) - self.mp_queue = None - self.nprocs = nprocs - self.dist = LightningDistributed() - self.nickname = 'ddp_cpu' - - def setup(self, model): - os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) - - # pass in a state q - smp = mp.get_context('spawn') - self.mp_queue = smp.SimpleQueue() - - self.trainer.model = model - - def train(self): - model = self.trainer.model - - # train in children process - mp.spawn(self.ddp_train, nprocs=self.nprocs, args=(self.mp_queue, model,)) - - # restore main state with best weights - best_path = self.mp_queue.get() - results = self.mp_queue.get() - - # recover the weights of the processes trained in the children - self.__recover_child_process_weights(model, best_path) - return results - - def ddp_train(self, process_idx, mp_queue, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - """ - # show progressbar only on progress_rank 0 - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - if isinstance(self.ddp_plugin, RPCPlugin): - if not self.ddp_plugin.is_main_rpc_process: - self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer) - self.ddp_plugin.exit_rpc_process() - if self.ddp_plugin.return_after_exit_rpc_process: - return - else: - self.ddp_plugin.on_main_rpc_connection(self.trainer) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not 
torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend}') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - self.ddp_plugin.on_after_setup_optimizers(self.trainer) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - self.trainer.convert_to_lightning_optimizers() - - # DDP spawn already spawned off each process... no need to do anything - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # get original model - model = self.trainer.get_model() - - # persist info in ddp_spawn - self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - - # clean up memory - torch.cuda.empty_cache() - - def training_step(self, args): - return self._step(args) - - def validation_step(self, args): - return self._step(args) - - def test_step(self, args): - return self._step(args) - - def _step(self, args): - args = self.ddp_plugin.on_before_forward(self.trainer.get_model(), *args) - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def early_stopping_should_stop(self, pl_module): - stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) - torch_distrib.all_reduce(stop, op=torch_distrib.reduce_op.SUM) - torch_distrib.barrier() - should_stop = stop == self.trainer.world_size - return should_stop - - def set_world_ranks(self, process_idx): - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - def model_to_device(self, model, process_idx): - # Todo: required argument `process_idx` is not used - model.cpu() - - def get_device_ids(self): - device_ids = None - return device_ids - - def __recover_child_process_weights(self, model, best_path): - # transfer back the best path to the trainer - if self.trainer.checkpoint_callback: - self.trainer.checkpoint_callback.best_model_path = best_path - - self.trainer.model = model - - def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): - # Todo: required argument `model` is not used - # track the best model path - best_model_path = None - if self.trainer.checkpoint_callback is not None: - best_model_path = self.trainer.checkpoint_callback.best_model_path - - if self.trainer.global_rank == 0 and mp_queue is not None: - rank_zero_warn('cleaning up ddp environment...') - # todo, pass complete checkpoint as state dictionary - mp_queue.put(best_model_path) 
- mp_queue.put(results) - - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return sync_ddp_if_available(tensor, group, reduce_op) - - def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): - """ - Function to gather a tensor from several distributed processes - - Args: - tensor: tensor of shape (batch, ...) - group: the process group to gather results from. Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op - - Return: - A tensor of shape (world_size, batch, ...) - """ - return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) - - def get_reference_model(self, model) -> LightningModule: - return self.ddp_plugin.get_model_from_plugin(model) - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=self.trainer.num_nodes * self.trainer.num_processes, - rank=self.trainer.global_rank - ) - if self.ddp_plugin is not None: - distributed_sampler_kwargs = self.ddp_plugin.distributed_sampler_kwargs(distributed_sampler_kwargs) - return distributed_sampler_kwargs - - @property - def require_distributed_sampler(self): - return True diff --git a/pytorch_lightning/accelerators/old/dp_accelerator.py b/pytorch_lightning/accelerators/old/dp_accelerator.py deleted file mode 100644 index 847d156d4f11d..0000000000000 --- a/pytorch_lightning/accelerators/old/dp_accelerator.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
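# A short sketch (plain torch only, illustrative names) of how the
# `distributed_sampler_kwargs` computed above are consumed: roughly, when
# `require_distributed_sampler` is True the trainer builds a DistributedSampler
# from these kwargs, so every rank iterates over a disjoint 1/num_replicas shard.
import torch
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset


def build_distributed_loader(dataset, num_nodes, num_processes, global_rank, batch_size=32):
    sampler_kwargs = dict(
        num_replicas=num_nodes * num_processes,  # same formula as distributed_sampler_kwargs
        rank=global_rank,
    )
    sampler = DistributedSampler(dataset, **sampler_kwargs)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)


# example: rank 0 of a 1-node / 2-process job sees half of the 128 samples
dataset = TensorDataset(torch.randn(128, 32))
loader = build_distributed_loader(dataset, num_nodes=1, num_processes=2, global_rank=0)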
-from typing import Optional - -import torch -from torch import optim - -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.core.step_result import Result -from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDataParallel -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.exceptions import MisconfigurationException - - -class DataParallelAccelerator(Accelerator): - - def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None): - """ - Runs training using DP via manual start (not HPC cluster) - - Example:: - - # default - trainer = Trainer(accelerator=DataParallelAccelerator()) - - """ - super().__init__(trainer, cluster_environment) - self.model_autocast_original_forward = None - self.dist = LightningDistributed() - self.nickname = 'dp' - - def setup(self, model): - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # put model on correct device - model.cuda(self.trainer.root_gpu) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # init torch data parallel - model = self.__init_torch_data_parallel(model) - - # hack forward to do autocast for the user - self.model_autocast_original_forward = model.forward - - # init half precision - if self.trainer.amp_backend: - model = self.__init_half_precision(model) - - self.trainer.convert_to_lightning_optimizers() - - self.trainer.model = model - - def __init_torch_data_parallel(self, model): - # create list of device ids - device_ids = self.trainer.data_parallel_device_ids - if isinstance(device_ids, int): - device_ids = list(range(device_ids)) - - # set dp device - torch.cuda.set_device(self.trainer.root_gpu) - model = LightningDataParallel(model, device_ids=device_ids) - return model - - def __init_half_precision(self, model): - if self.trainer.amp_backend == AMPType.NATIVE: - self.__init_native_amp(model) - else: - model = self.__init_nvidia_apex(model) - return model - - def __init_native_amp(self, model): - model.forward = torch.cuda.amp.autocast()(model.forward) - - def __init_nvidia_apex(self, model): - # check for this bug (amp + dp + !01 doesn't work) - # https://github.com/NVIDIA/apex/issues/227 - if self.trainer.amp_level == 'O2': - raise MisconfigurationException( - f'Amp level {self.trainer.amp_level} with DataParallel is not supported.' - f' See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.' 
- f' We recommend you switch to ddp if you want to use amp') - else: - model = self.trainer.precision_connector.connect(model) - - return model - - def train(self): - model = self.trainer.model - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - return results - - def teardown(self): - # replace the original fwd function - self.trainer.model.forward = self.model_autocast_original_forward - self.barrier() - - def _step(self, args): - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def training_step(self, args): - return self._step(args) - - def validation_step(self, args): - return self._step(args) - - def test_step(self, args): - return self._step(args) - - def training_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - elif isinstance(output, torch.Tensor): - output = output.mean() - return output - - def validation_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - elif isinstance(output, torch.Tensor): - output = output.mean() - return output - - def test_step_end(self, output): - if isinstance(output, Result): - output.dp_reduce() - elif isinstance(output, torch.Tensor): - output = output.mean() - return output - - def reinit_scheduler_properties(self, optimizers: list, schedulers: list): - """ - Reinitialize optimizer.step properties added by schedulers - """ - for scheduler in schedulers: - scheduler = scheduler['scheduler'] - - for optimizer in optimizers: - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - is_regular_scheduler = optim.lr_scheduler._LRScheduler - is_lr_reduce_on_plateau = optim.lr_scheduler.ReduceLROnPlateau - if is_regular_scheduler or is_lr_reduce_on_plateau: - idx = i - state = scheduler.state_dict() - else: - state = None - - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) - if state is not None: - scheduler.load_state_dict(state) - - def get_reference_model(self, model) -> LightningModule: - if isinstance(model, LightningDataParallel): - return model.module - return model - - @property - def require_distributed_sampler(self): - return False diff --git a/pytorch_lightning/accelerators/old/gpu_accelerator.py b/pytorch_lightning/accelerators/old/gpu_accelerator.py deleted file mode 100644 index 2fe3b26679f5c..0000000000000 --- a/pytorch_lightning/accelerators/old/gpu_accelerator.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
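# A minimal sketch (plain torch, no Lightning internals) of why the DP
# `*_step_end` hooks above call `dp_reduce()` / `.mean()`: with DataParallel
# every replica returns its own scalar, so the gathered output holds one entry
# per GPU and must be reduced to a single value before backward/logging. The
# dict branch below is an illustrative stand-in for `Result.dp_reduce()`.
import torch


def dp_training_step_end(output):
    # `output` is what DataParallel gathered from the replicas: either a tensor
    # of shape (num_gpus,) or a dict containing such tensors.
    if isinstance(output, torch.Tensor):
        return output.mean()
    if isinstance(output, dict):
        return {k: v.mean() if isinstance(v, torch.Tensor) else v for k, v in output.items()}
    return output


# e.g. two replicas returned losses 0.50 and 0.70 -> reduced loss 0.60
reduced = dp_training_step_end(torch.tensor([0.50, 0.70]))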
-from typing import Any, Callable, Optional, Union - -import torch - -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.utilities import AMPType - - -class GPUAccelerator(Accelerator): - amp_backend: AMPType - - def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None): - """ - Runs training using a single GPU - - Example:: - - # default - trainer = Trainer(accelerator=GPUAccelerator()) - - """ - super().__init__(trainer, cluster_environment) - self.dist = LightningDistributed() - self.nickname = None - - def setup(self, model): - - # call setup - self.trainer.call_setup_hook(model) - - torch.cuda.set_device(self.trainer.root_gpu) - model.cuda(self.trainer.root_gpu) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - self.trainer.convert_to_lightning_optimizers() - - self.trainer.model = model - - def train(self): - model = self.trainer.model - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - return results - - def _step(self, model_step: Callable, args): - args[0] = self.to_device(args[0]) - - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = model_step(*args) - else: - output = model_step(*args) - - return output - - def training_step(self, args): - return self._step(self.trainer.model.training_step, args) - - def validation_step(self, args): - return self._step(self.trainer.model.validation_step, args) - - def test_step(self, args): - return self._step(self.trainer.model.test_step, args) - - def to_device(self, batch): - gpu_id = 0 - if isinstance(self.trainer.data_parallel_device_ids, list): - gpu_id = self.trainer.data_parallel_device_ids[0] - - # Don't copy the batch since there is a single gpu that the batch could - # be referenced from and if there are multiple optimizers the batch will - # wind up copying it to the same device repeatedly. 
- return self.batch_to_device(batch, gpu_id) - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return tensor - - @property - def require_distributed_sampler(self): - return False From dfcbba6241376f4f7b8c17bae4d37e4218089ec8 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 17:31:19 +0100 Subject: [PATCH 095/157] fix naming --- pytorch_lightning/accelerators/accelerator.py | 58 +---------- .../accelerators/accelerator_connector.py | 10 +- pytorch_lightning/accelerators/cpu.py | 14 +++ pytorch_lightning/accelerators/gpu.py | 25 +++++ pytorch_lightning/accelerators/tpu.py | 13 +++ pytorch_lightning/trainer/data_loading.py | 4 +- pytorch_lightning/trainer/properties.py | 4 +- pytorch_lightning/trainer/trainer.py | 4 +- test.py | 97 +++++++++++++++++++ tests/backends/test_accelerator_connector.py | 32 +++--- tests/core/test_datamodules.py | 2 +- tests/models/test_hooks.py | 30 +++--- tests/models/test_horovod.py | 4 +- tests/models/test_tpu.py | 4 +- 14 files changed, 197 insertions(+), 104 deletions(-) create mode 100644 pytorch_lightning/accelerators/cpu.py create mode 100644 pytorch_lightning/accelerators/gpu.py create mode 100644 pytorch_lightning/accelerators/tpu.py create mode 100644 test.py diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 07777d982b2d6..81eb112206d28 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -21,7 +21,7 @@ from pytorch_lightning.utilities.apply_func import move_data_to_device -class NewAccelerator(object): +class Accelerator(object): def __init__( self, precision_plugin: PrecisionPlugin, @@ -161,15 +161,6 @@ def _clip_gradients(self, optimizer, grad_clip_val): if grad_clip_val <= 0: return - # TODO: Change this. Probably to isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.APEX - - # if self.trainer.amp_backend == AMPType.APEX: - # parameters = self.precision_plugin.master_params(optimizer) - # else: - # parameters = model.parameters() - - # TODO - # ... or we call master_params() and in the default plugin we return the model.parameters() parameters = self.precision_plugin.master_params(optimizer) max_norm = grad_clip_val @@ -246,7 +237,6 @@ def scaler(self): def rpc_enabled(self): return self.training_type_plugin.rpc_enabled - # TODO: Check where this comes from and why it is needed def optimizer_state(self, optimizer: Optimizer) -> dict: """ Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom @@ -259,48 +249,4 @@ def optimizer_state(self, optimizer: Optimizer) -> dict: return optimizer.state_dict() def on_save(self, checkpoint): - return checkpoint - - -class NewCPUAccelerator(NewAccelerator): - def setup(self, trainer, model): - if isinstance(self.precision_plugin, MixedPrecisionPlugin): - MisconfigurationException("amp + cpu is not supported. 
Please use a GPU option") - - if "cpu" not in str(self.root_device): - raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead") - - return super().setup(trainer, model) - - -class NewGPUAccelerator(NewAccelerator): - def setup(self, trainer, model): - if "cuda" not in str(self.root_device): - raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") - torch.cuda.set_device(self.root_device) - model.to(self.root_device) - - return super().setup(trainer, model) - - def on_train_start(self): - # clear cache before training - # use context because of: - # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() - - def on_train_end(self): - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() - -# TODO: Complete the TPUAccelerator -class NewTPUAccelerator(NewAccelerator): - def setup(self, trainer, model): - raise NotImplementedError - - def on_train_start(self): - raise NotImplementedError - - def on_train_end(self): - raise NotImplementedError + return checkpoint \ No newline at end of file diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 7addf4bdd72c2..e03e51cbba6ed 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -15,7 +15,9 @@ import os import torch -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewAccelerator, NewGPUAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.cpu import CPUAccelerator +from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin, DDP2Plugin, HorovodPlugin from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin @@ -241,14 +243,14 @@ def select_training_type_plugin(self): return plugin def select_accelerator(self): - if isinstance(self.distributed_backend, NewAccelerator): + if isinstance(self.distributed_backend, Accelerator): # custom accelerator from user return self.distributed_backend if self.on_gpu: - acc_cls = NewGPUAccelerator + acc_cls = GPUAccelerator else: - acc_cls = NewCPUAccelerator + acc_cls = CPUAccelerator return acc_cls( precision_plugin=self.select_precision_plugin(), diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py new file mode 100644 index 0000000000000..e9f49e20a464f --- /dev/null +++ b/pytorch_lightning/accelerators/cpu.py @@ -0,0 +1,14 @@ +from pytorch_lightning.accelerators.precision import MixedPrecisionPlugin +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class CPUAccelerator(Accelerator): + def setup(self, trainer, model): + if isinstance(self.precision_plugin, MixedPrecisionPlugin): + MisconfigurationException("amp + cpu is not supported. 
Please use a GPU option") + + if "cpu" not in str(self.root_device): + raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead") + + return super().setup(trainer, model) \ No newline at end of file diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py new file mode 100644 index 0000000000000..7b2cbe3627e0b --- /dev/null +++ b/pytorch_lightning/accelerators/gpu.py @@ -0,0 +1,25 @@ +import torch +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.accelerators.accelerator import Accelerator + + +class GPUAccelerator(Accelerator): + def setup(self, trainer, model): + if "cuda" not in str(self.root_device): + raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + torch.cuda.set_device(self.root_device) + model.to(self.root_device) + + return super().setup(trainer, model) + + def on_train_start(self): + # clear cache before training + # use context because of: + # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() + + def on_train_end(self): + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() \ No newline at end of file diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py new file mode 100644 index 0000000000000..bf922b1c2df8e --- /dev/null +++ b/pytorch_lightning/accelerators/tpu.py @@ -0,0 +1,13 @@ +# TODO: Complete the TPUAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator + + +class TPUAccelerator(Accelerator): + def setup(self, trainer, model): + raise NotImplementedError + + def on_train_start(self): + raise NotImplementedError + + def on_train_end(self): + raise NotImplementedError diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index cc5fc492b3a6a..4c77f353c0688 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -21,7 +21,7 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from pytorch_lightning.accelerators.accelerator import NewAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.core import LightningModule from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import rank_zero_warn @@ -51,7 +51,7 @@ class TrainerDataLoadingMixin(ABC): limit_val_batches: Union[int, float] limit_test_batches: Union[int, float] replace_sampler_ddp: bool - accelerator_backend: NewAccelerator + accelerator_backend: Accelerator num_nodes: int num_processes: int distributed_backend: Optional[str] diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 62241722ff365..494e91a298843 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -17,7 +17,7 @@ from argparse import ArgumentParser, Namespace from typing import cast, List, Optional, Type, TypeVar, Union -from pytorch_lightning.accelerators.accelerator import NewAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.callbacks import Callback, ProgressBarBase, ModelCheckpoint, EarlyStopping from 
pytorch_lightning.core.lightning import LightningModule @@ -63,7 +63,7 @@ class TrainerProperties(ABC): limit_val_batches: int _default_root_dir: str _weights_save_path: str - accelerator_backend: NewAccelerator + accelerator_backend: Accelerator num_nodes: int num_processes: int accelerator_connector: BackendConnector diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 382b6e3c5ae8e..4d0718c5e2b48 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -79,7 +79,7 @@ from pytorch_lightning.utilities.model_utils import is_overridden from pytorch_lightning.trainer.properties import TrainerProperties from pytorch_lightning.plugins.plugin_connector import PluginConnector -from pytorch_lightning.accelerators.accelerator import NewAccelerator +from pytorch_lightning.accelerators.accelerator import Accelerator # warnings to ignore in trainer warnings.filterwarnings( @@ -129,7 +129,7 @@ def __init__( val_check_interval: Union[int, float] = 1.0, flush_logs_every_n_steps: int = 100, log_every_n_steps: int = 50, - accelerator: Optional[Union[str, NewAccelerator]] = None, + accelerator: Optional[Union[str, Accelerator]] = None, sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = "top", diff --git a/test.py b/test.py new file mode 100644 index 0000000000000..959436c179c21 --- /dev/null +++ b/test.py @@ -0,0 +1,97 @@ +import torch +import pytorch_lightning as pl + +class RandomDataset(torch.utils.data.Dataset): + def __init__(self, size, length): + self.len = length + self.data = torch.randn(length, size) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + +class BoringModel(pl.LightningModule): + + def __init__(self): + """ + Testing PL Module + + Use as follows: + - subclass + - modify the behavior for what you want + + class TestModel(BaseTestModel): + def training_step(...): + # do your own thing + + or: + + model = BaseTestModel() + model.training_epoch_end = None + + """ + super().__init__() + self.layer = torch.nn.Linear(32, 2) + + def forward(self, x): + return self.layer(x) + + def loss(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) + + def step(self, x): + x = self(x) + out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) + return out + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"loss": loss} + + def training_step_end(self, training_step_outputs): + return training_step_outputs + + def training_epoch_end(self, outputs) -> None: + torch.stack([x["loss"] for x in outputs]).mean() + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"x": loss} + + # def validation_epoch_end(self, outputs) -> None: + # torch.stack([x['x'] for x in outputs]).mean() + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"y": loss} + + def test_epoch_end(self, outputs) -> None: + torch.stack([x["y"] for x in outputs]).mean() + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + def train_dataloader(self): + return 
torch.utils.data.DataLoader(RandomDataset(32, 64)) + + def val_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + def test_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + @property + def automatic_optimization(self): + return True + +if __name__ == '__main__': + pl.Trainer(gpus=[1,], max_epochs=20, amp_backend='native').fit(BoringModel(), torch.utils.data.DataLoader(RandomDataset(32, 500))) \ No newline at end of file diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 37a1911be38d3..b6f27f32a85fc 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -18,10 +18,12 @@ import pytest import torch -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator, NewGPUAccelerator, NewAccelerator +from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.cpu import CPUAccelerator +from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin from pytorch_lightning.accelerators.precision import PrecisionPlugin -from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from tests.base.boring_model import BoringModel @@ -31,7 +33,7 @@ def test_accelerator_choice_cpu(tmpdir): trainer = Trainer( fast_dev_run=True, ) - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, SingleDevicePlugin) @@ -40,7 +42,7 @@ def test_accelerator_choice_ddp_cpu(tmpdir): fast_dev_run=True, accelerator='ddp_cpu', ) - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -53,7 +55,7 @@ def test_accelerator_choice_ddp(tmpdir): accelerator='ddp', gpus=1, ) - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -66,7 +68,7 @@ def test_accelerator_choice_ddp_spawn(tmpdir): accelerator='ddp_spawn', gpus=1, ) - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -84,7 +86,7 @@ class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp assert trainer.accelerator_connector.is_slurm_managing_tasks - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -117,7 +119,7 @@ class CB(Callback): 
def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp2 assert trainer.accelerator_connector.is_slurm_managing_tasks - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDP2Plugin) assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -148,7 +150,7 @@ def test_accelerator_choice_ddp_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -178,7 +180,7 @@ def test_accelerator_choice_ddp2_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp2 - assert isinstance(trainer.accelerator_backend, NewGPUAccelerator) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDP2Plugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -207,7 +209,7 @@ def test_accelerator_choice_ddp_cpu_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) assert trainer.training_type_plugin.task_idx == 10 @@ -239,7 +241,7 @@ class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp assert trainer.accelerator_connector.is_slurm_managing_tasks - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) raise SystemExit() @@ -276,7 +278,7 @@ def master_address(self): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, CustomCluster) raise SystemExit() @@ -303,7 +305,7 @@ def on_fit_start(self, trainer, pl_module): }) @mock.patch('torch.cuda.device_count', return_value=0) def test_custom_accelerator(tmpdir): - class Accel(NewAccelerator): + class Accel(Accelerator): pass class Prec(PrecisionPlugin): @@ -337,7 +339,7 @@ class TrainTypePlugin(SingleDevicePlugin): def test_dist_backend_accelerator_mapping(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) raise SystemExit() diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 
7796c9c074d6e..c28e1bdb8d658 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -398,7 +398,7 @@ def test_full_loop_dp(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -@mock.patch("pytorch_lightning.accelerators.accelerator.NewAccelerator.lightning_module", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.accelerators.accelerator.Accelerator.lightning_module", new_callable=PropertyMock) def test_dm_transfer_batch_to_device(get_module_mock): class CustomBatch: def __init__(self, data): diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index b2491389135f2..cfcd680cb0080 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -55,20 +55,19 @@ def test_training_epoch_end_metrics_collection(tmpdir): num_epochs = 3 class CurrentModel(EvalModelTemplate): - def training_step(self, *args, **kwargs): output = super().training_step(*args, **kwargs) - output['progress_bar'].update({'step_metric': torch.tensor(-1)}) - output['progress_bar'].update({'shared_metric': 100}) + output["progress_bar"].update({"step_metric": torch.tensor(-1)}) + output["progress_bar"].update({"shared_metric": 100}) return output def training_epoch_end(self, outputs): epoch = self.current_epoch # both scalar tensors and Python numbers are accepted return { - 'progress_bar': { - f'epoch_metric_{epoch}': torch.tensor(epoch), # add a new metric key every epoch - 'shared_metric': 111, + "progress_bar": { + f"epoch_metric_{epoch}": torch.tensor(epoch), # add a new metric key every epoch + "shared_metric": 111, } } @@ -83,20 +82,18 @@ def training_epoch_end(self, outputs): metrics = trainer.progress_bar_dict # metrics added in training step should be unchanged by epoch end method - assert metrics['step_metric'] == -1 + assert metrics["step_metric"] == -1 # a metric shared in both methods gets overwritten by epoch_end - assert metrics['shared_metric'] == 111 + assert metrics["shared_metric"] == 111 # metrics are kept after each epoch for i in range(num_epochs): - assert metrics[f'epoch_metric_{i}'] == i + assert metrics[f"epoch_metric_{i}"] == i @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -@mock.patch("pytorch_lightning.accelerators.accelerator.NewAccelerator.lightning_module", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.accelerators.accelerator.Accelerator.lightning_module", new_callable=PropertyMock) def test_transfer_batch_hook(model_getter_mock): - class CustomBatch: - def __init__(self, data): self.samples = data[0] self.targets = data[1] @@ -120,16 +117,13 @@ def transfer_batch_to_device(self, data, device): trainer = Trainer(gpus=1) # running .fit() would require us to implement custom data loaders, we mock the model reference instead model_getter_mock.return_value = model - batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) - expected = torch.device('cuda', 0) + batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device("cuda:0")) + expected = torch.device("cuda", 0) assert model.hook_called assert batch_gpu.samples.device == batch_gpu.targets.device == expected -@pytest.mark.parametrize( - 'max_epochs,batch_idx_', - [(2, 5), (3, 8), (4, 12)] -) +@pytest.mark.parametrize("max_epochs,batch_idx_", [(2, 5), (3, 8), (4, 12)]) def test_on_train_batch_start_hook(max_epochs, batch_idx_): class CurrentModel(EvalModelTemplate): def on_train_batch_start(self, batch, batch_idx, dataloader_idx): diff 
--git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 623f329035533..ca56a987aab98 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,7 +26,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.accelerator import NewCPUAccelerator +from pytorch_lightning.accelerators.accelerator import CPUAccelerator from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState @@ -312,7 +312,7 @@ def _compute_batch(): accelerator='horovod', ) - assert isinstance(trainer.accelerator_backend, NewCPUAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) # TODO: test that we selected the correct training_type_plugin based on horovod flags metric = Accuracy(compute_on_step=True, diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 45cd9b2154c43..8278ef60dc6bd 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -20,7 +20,7 @@ import tests.base.develop_pipelines as tpipes from pytorch_lightning import Trainer, seed_everything -from pytorch_lightning.accelerators.accelerator import NewTPUAccelerator +from pytorch_lightning.accelerators.accelerator import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE @@ -250,7 +250,7 @@ def test_broadcast_on_tpu(): """ Checks if an object from the master process is broadcasted to other processes correctly""" def test_broadcast(rank): trainer = Trainer(tpu_cores=8) - assert isinstance(trainer.accelerator_backend, NewTPUAccelerator) + assert isinstance(trainer.accelerator_backend, TPUAccelerator) obj = ("ver_0.5", "logger_name", rank) result = trainer.accelerator_backend.broadcast(obj) assert result == ("ver_0.5", "logger_name", 0) From 348a1b04efd006a1694b3415ca28d166e0862f68 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 18:14:25 +0100 Subject: [PATCH 096/157] move old plugins --- pytorch_lightning/plugins/__init__.py | 1 + pytorch_lightning/plugins/old/__init__.py | 0 pytorch_lightning/plugins/{ => old}/apex.py | 0 .../plugins/{ => old}/ddp_plugin.py | 0 .../{ => old}/ddp_sequential_plugin.py | 0 .../plugins/{ => old}/native_amp.py | 0 pytorch_lightning/plugins/{ => old}/plugin.py | 0 .../plugins/{ => old}/plugin_connector.py | 0 .../plugins/{ => old}/precision_plugin.py | 0 .../plugins/{ => old}/rpc_plugin.py | 0 .../{ => old}/sharded_native_amp_plugin.py | 0 .../plugins/{ => old}/sharded_plugin.py | 0 pytorch_lightning/trainer/optimizers.py | 21 ------------------- 13 files changed, 1 insertion(+), 21 deletions(-) create mode 100644 pytorch_lightning/plugins/old/__init__.py rename pytorch_lightning/plugins/{ => old}/apex.py (100%) rename pytorch_lightning/plugins/{ => old}/ddp_plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/ddp_sequential_plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/native_amp.py (100%) rename pytorch_lightning/plugins/{ => old}/plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/plugin_connector.py (100%) rename pytorch_lightning/plugins/{ => old}/precision_plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/rpc_plugin.py (100%) rename pytorch_lightning/plugins/{ => old}/sharded_native_amp_plugin.py 
(100%) rename pytorch_lightning/plugins/{ => old}/sharded_plugin.py (100%) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index e69de29bb2d1d..b416a9f56aebe 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -0,0 +1 @@ +from pytorch_lightning.accelerators.plugins import * \ No newline at end of file diff --git a/pytorch_lightning/plugins/old/__init__.py b/pytorch_lightning/plugins/old/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/old/apex.py similarity index 100% rename from pytorch_lightning/plugins/apex.py rename to pytorch_lightning/plugins/old/apex.py diff --git a/pytorch_lightning/plugins/ddp_plugin.py b/pytorch_lightning/plugins/old/ddp_plugin.py similarity index 100% rename from pytorch_lightning/plugins/ddp_plugin.py rename to pytorch_lightning/plugins/old/ddp_plugin.py diff --git a/pytorch_lightning/plugins/ddp_sequential_plugin.py b/pytorch_lightning/plugins/old/ddp_sequential_plugin.py similarity index 100% rename from pytorch_lightning/plugins/ddp_sequential_plugin.py rename to pytorch_lightning/plugins/old/ddp_sequential_plugin.py diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/old/native_amp.py similarity index 100% rename from pytorch_lightning/plugins/native_amp.py rename to pytorch_lightning/plugins/old/native_amp.py diff --git a/pytorch_lightning/plugins/plugin.py b/pytorch_lightning/plugins/old/plugin.py similarity index 100% rename from pytorch_lightning/plugins/plugin.py rename to pytorch_lightning/plugins/old/plugin.py diff --git a/pytorch_lightning/plugins/plugin_connector.py b/pytorch_lightning/plugins/old/plugin_connector.py similarity index 100% rename from pytorch_lightning/plugins/plugin_connector.py rename to pytorch_lightning/plugins/old/plugin_connector.py diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/old/precision_plugin.py similarity index 100% rename from pytorch_lightning/plugins/precision_plugin.py rename to pytorch_lightning/plugins/old/precision_plugin.py diff --git a/pytorch_lightning/plugins/rpc_plugin.py b/pytorch_lightning/plugins/old/rpc_plugin.py similarity index 100% rename from pytorch_lightning/plugins/rpc_plugin.py rename to pytorch_lightning/plugins/old/rpc_plugin.py diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py similarity index 100% rename from pytorch_lightning/plugins/sharded_native_amp_plugin.py rename to pytorch_lightning/plugins/old/sharded_native_amp_plugin.py diff --git a/pytorch_lightning/plugins/sharded_plugin.py b/pytorch_lightning/plugins/old/sharded_plugin.py similarity index 100% rename from pytorch_lightning/plugins/sharded_plugin.py rename to pytorch_lightning/plugins/old/sharded_plugin.py diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index e56856dfb2b4f..33a7836ab974a 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -140,27 +140,6 @@ def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): raise ValueError(f'The provided lr scheduler "{scheduler}" is invalid') return lr_schedulers - def reinit_scheduler_properties(self, optimizers: list, schedulers: list): - # Reinitialize optimizer.step properties added by schedulers - for scheduler in schedulers: - scheduler = 
scheduler['scheduler'] - - for optimizer in optimizers: - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): - idx = i - state = scheduler.state_dict() - else: - state = None - - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) - if state is not None: - scheduler.load_state_dict(state) - - class _MockOptimizer(Optimizer): """The `_MockOptimizer` will be used inplace of an optimizer in the event that `None` is returned from `configure_optimizers`. From 14f2f6e9a8cd4438a305f6be1ae05320b370e8fd Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 18:14:47 +0100 Subject: [PATCH 097/157] move to plugins --- pytorch_lightning/accelerators/{ => plugins}/base_plugin.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pytorch_lightning/accelerators/{ => plugins}/base_plugin.py (100%) diff --git a/pytorch_lightning/accelerators/base_plugin.py b/pytorch_lightning/accelerators/plugins/base_plugin.py similarity index 100% rename from pytorch_lightning/accelerators/base_plugin.py rename to pytorch_lightning/accelerators/plugins/base_plugin.py From 2f779c618f2cc8bad00f2df978f971eb9ff08f1b Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 18:15:18 +0100 Subject: [PATCH 098/157] create precision subpackage --- .../plugins/precision/__init__.py | 4 + .../plugins/precision/apex_amp.py | 115 +++++++++++ .../accelerators/plugins/precision/mixed.py | 7 + .../plugins/precision/native_amp.py | 48 +++++ .../plugins/precision/precision_plugin.py | 45 +++++ pytorch_lightning/accelerators/precision.py | 189 ------------------ 6 files changed, 219 insertions(+), 189 deletions(-) create mode 100644 pytorch_lightning/accelerators/plugins/precision/__init__.py create mode 100644 pytorch_lightning/accelerators/plugins/precision/apex_amp.py create mode 100644 pytorch_lightning/accelerators/plugins/precision/mixed.py create mode 100644 pytorch_lightning/accelerators/plugins/precision/native_amp.py create mode 100644 pytorch_lightning/accelerators/plugins/precision/precision_plugin.py delete mode 100644 pytorch_lightning/accelerators/precision.py diff --git a/pytorch_lightning/accelerators/plugins/precision/__init__.py b/pytorch_lightning/accelerators/plugins/precision/__init__.py new file mode 100644 index 0000000000000..4f30fe58910f4 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/__init__.py @@ -0,0 +1,4 @@ +from pytorch_lightning.accelerators.plugins.precision.apex_amp import ApexMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py new file mode 100644 index 0000000000000..9bb749bf18dbb --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py @@ -0,0 +1,115 @@ +from contextlib import contextmanager +from typing import List, Tuple +import torch +from torch.optim import Optimizer +from pytorch_lightning.core import LightningModule +from pytorch_lightning.utilities import 
AMPType, APEX_AVAILABLE, rank_zero_warn +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin + +if APEX_AVAILABLE: + from apex import amp + +class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): + def __init__(self, amp_level): + self.backend = AMPType.APEX + self.amp_level = amp_level + + def master_params(self, optimizer): + return amp.master_params(optimizer) + + def connect(self, model, optimizers, lr_schedulers): + model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) + self.reinit_scheduler_properties(optimizers, lr_schedulers) + return model, optimizers, lr_schedulers + + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): + closure_loss = amp.scale_loss(closure_loss, optimizer) + + # enter apex context + context = closure_loss + closure_loss = closure_loss.__enter__() + + # do backward pass + # TODO: not entirely sure, why we need this + if model is not None and isinstance(model, LightningModule): + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # exit amp context + a, b, c = None, None, None + error = context.__exit__(a, b, c) + if error: + rank_zero_warn(a, b, c) + raise Exception("apex unscale error") + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + return closure_loss + + def configure_apex( + self, + amp: object, + model: LightningModule, + optimizers: List[Optimizer], + amp_level: str, + ) -> Tuple[LightningModule, List[Optimizer]]: + r""" + Override to init AMP your own way. + Must return a model and list of optimizers. + + Args: + amp: pointer to amp library object. + model: pointer to current :class:`LightningModule`. + optimizers: list of optimizers passed in :meth:`configure_optimizers`. + amp_level: AMP mode chosen ('O1', 'O2', etc...) + + Return: + Apex wrapped model and optimizers + + Examples: + .. code-block:: python + + # Default implementation used by Trainer. 
+ def configure_apex(self, amp, model, optimizers, amp_level): + model, optimizers = amp.initialize( + model, optimizers, opt_level=amp_level, + ) + + return model, optimizers + """ + model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) + return model, optimizers + + @staticmethod + def reinit_scheduler_properties(optimizers: list, schedulers: list): + # Reinitialize optimizer.step properties added by schedulers + for scheduler in schedulers: + scheduler = scheduler['scheduler'] + + for optimizer in optimizers: + state = None + idx = 0 + + # check that we dont mix users optimizers and schedulers + if scheduler.optimizer == optimizer: + # Find the mro belonging to the base lr scheduler class + for i, mro in enumerate(scheduler.__class__.__mro__): + if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): + idx = i + state = scheduler.state_dict() + else: + state = None + + scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) + if state is not None: + scheduler.load_state_dict(state) \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/mixed.py b/pytorch_lightning/accelerators/plugins/precision/mixed.py new file mode 100644 index 0000000000000..1eb1ea18ebc23 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/mixed.py @@ -0,0 +1,7 @@ +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin + +class MixedPrecisionPlugin(PrecisionPlugin): + EPSILON = 1e-5 + backend: AMPType + precision = "mixed" \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/native_amp.py b/pytorch_lightning/accelerators/plugins/precision/native_amp.py new file mode 100644 index 0000000000000..f233a43dfdd53 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/native_amp.py @@ -0,0 +1,48 @@ +from contextlib import contextmanager +import torch +from pytorch_lightning.core import LightningModule +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin + + +class NativeMixedPrecisionPlugin(MixedPrecisionPlugin): + def __init__(self): + self.backend = AMPType.NATIVE + self.scaler = torch.cuda.amp.GradScaler() + + def pre_optimizer_step(self, optimizer, optimizer_idx): + if isinstance(optimizer, torch.optim.LBFGS): + raise MisconfigurationException( + f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." 
+ " To request, please file a Github issue in PyTorch and tag @mcarilli" + ) + + def post_optimizer_step(self, optimizer, optimizer_idx): + self.scaler.update() + + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): + closure_loss = self.scaler.scale(closure_loss) + + automatic_optimization = model.automatic_optimization + + closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) + + # unscale gradient to allow analyze within `on_after_backward` + if not should_accumulate and automatic_optimization: + self.scaler.unscale_(optimizer) + + return closure_loss + + @contextmanager + def train_step_context(self): + yield torch.cuda.amp.autocast() \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py new file mode 100644 index 0000000000000..048a645de250a --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py @@ -0,0 +1,45 @@ +import torch +from pytorch_lightning.core import LightningModule +from pytorch_lightning.accelerators.plugins.base_plugin import Plugin + + +class PrecisionPlugin(Plugin): + EPSILON = 1e-6 + precision = 32 + + def pre_optimizer_step(self, optimizer, optimizer_idx): + pass + + def post_optimizer_step(self, optimizer, optimizer_idx): + pass + + def master_params(self, optimizer): + for group in optimizer.param_groups: + for p in group["params"]: + yield p + + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + return model, optimizers, lr_schedulers + + def backward( + self, + model: LightningModule, + closure_loss: torch.Tensor, + optimizer: torch.optim.Optimizer, + opt_idx: int, + should_accumulate: bool, + *args, + **kwargs, + ): + automatic_optimization = model.automatic_optimization + + # do backward pass + if automatic_optimization: + model.backward(closure_loss, optimizer, opt_idx) + else: + closure_loss.backward(*args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + + return closure_loss \ No newline at end of file diff --git a/pytorch_lightning/accelerators/precision.py b/pytorch_lightning/accelerators/precision.py deleted file mode 100644 index a2ee98b686bae..0000000000000 --- a/pytorch_lightning/accelerators/precision.py +++ /dev/null @@ -1,189 +0,0 @@ -from contextlib import contextmanager -from pytorch_lightning.accelerators.base_plugin import Plugin -from pytorch_lightning.accelerators.scheduler_properties import reinit_scheduler_properties -from pytorch_lightning.core.lightning import LightningModule -from typing import List, Tuple -import torch -from torch.optim import Optimizer - -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities import AMPType, rank_zero_warn - -try: - from apex import amp -except ImportError: - amp = None - - -class PrecisionPlugin(Plugin): - EPSILON = 1e-6 - precision = 32 - - def pre_optimizer_step(self, optimizer, optimizer_idx): - pass - - def post_optimizer_step(self, optimizer, optimizer_idx): - pass - - def master_params(self, optimizer): - for group in optimizer.param_groups: - for p in group["params"]: - yield p - - def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): - return model, optimizers, lr_schedulers - - def backward( - self, - 
model: LightningModule, - closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, - opt_idx: int, - should_accumulate: bool, - *args, - **kwargs, - ): - # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) - automatic_optimization = model.automatic_optimization - - # do backward pass - if automatic_optimization: - model.backward(closure_loss, optimizer, opt_idx) - else: - closure_loss.backward(*args, **kwargs) - - # once backward has been applied, release graph - closure_loss = closure_loss.detach() - - return closure_loss - - -class MixedPrecisionPlugin(PrecisionPlugin): - EPSILON = 1e-5 - backend: AMPType - precision = "mixed" - - -class NativeMixedPrecisionPlugin(MixedPrecisionPlugin): - def __init__(self): - self.backend = AMPType.NATIVE - self.scaler = torch.cuda.amp.GradScaler() - - def pre_optimizer_step(self, optimizer, optimizer_idx): - if isinstance(optimizer, torch.optim.LBFGS): - raise MisconfigurationException( - f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." - " To request, please file a Github issue in PyTorch and tag @mcarilli" - ) - - def post_optimizer_step(self, optimizer, optimizer_idx): - self.scaler.update() - - def backward( - self, - model: LightningModule, - closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, - opt_idx: int, - should_accumulate: bool, - *args, - **kwargs, - ): - closure_loss = self.scaler.scale(closure_loss) - - # TODO: Check where we can get automatic_optimization from (probably when setting up the model after https://github.com/PyTorchLightning/pytorch-lightning/issues/4317) - automatic_optimization = model.automatic_optimization - - closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) - - # unscale gradient to allow analyze within `on_after_backward` - if not should_accumulate and automatic_optimization: - self.scaler.unscale_(optimizer) - - return closure_loss - - @contextmanager - def train_step_context(self): - yield torch.cuda.amp.autocast() - - -class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): - def __init__(self, amp_level): - self.backend = AMPType.APEX - self.amp_level = amp_level - - def master_params(self, optimizer): - return amp.master_params(optimizer) - - def connect(self, model, optimizers, lr_schedulers): - model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) - reinit_scheduler_properties(optimizers, lr_schedulers) - return model, optimizers, lr_schedulers - - def backward( - self, - model: LightningModule, - closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, - opt_idx: int, - should_accumulate: bool, - *args, - **kwargs, - ): - closure_loss = amp.scale_loss(closure_loss, optimizer) - - # enter apex context - context = closure_loss - closure_loss = closure_loss.__enter__() - - # do backward pass - # TODO: not entirely sure, why we need this - if model is not None and isinstance(model, LightningModule): - model.backward(closure_loss, optimizer, opt_idx) - else: - closure_loss.backward(*args, **kwargs) - - # exit amp context - a, b, c = None, None, None - error = context.__exit__(a, b, c) - if error: - rank_zero_warn(a, b, c) - raise Exception("apex unscale error") - - # once backward has been applied, release graph - closure_loss = closure_loss.detach() - return closure_loss - - def configure_apex( - self, - amp: object, - model: LightningModule, - optimizers: 
List[Optimizer], - amp_level: str, - ) -> Tuple[LightningModule, List[Optimizer]]: - r""" - Override to init AMP your own way. - Must return a model and list of optimizers. - - Args: - amp: pointer to amp library object. - model: pointer to current :class:`LightningModule`. - optimizers: list of optimizers passed in :meth:`configure_optimizers`. - amp_level: AMP mode chosen ('O1', 'O2', etc...) - - Return: - Apex wrapped model and optimizers - - Examples: - .. code-block:: python - - # Default implementation used by Trainer. - def configure_apex(self, amp, model, optimizers, amp_level): - model, optimizers = amp.initialize( - model, optimizers, opt_level=amp_level, - ) - - return model, optimizers - """ - model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) - return model, optimizers \ No newline at end of file From 58536f673aaf1b352babccbfde3fc7cbb5eb9038 Mon Sep 17 00:00:00 2001 From: justusschock Date: Wed, 6 Jan 2021 18:15:33 +0100 Subject: [PATCH 099/157] create training_type subpackage --- .../accelerators/data_parallel.py | 843 ------------------ .../accelerators/plugins/__init__.py | 3 + .../plugins/training_type/__init__.py | 8 + .../accelerators/plugins/training_type/ddp.py | 244 +++++ .../plugins/training_type/ddp2.py | 5 + .../plugins/training_type/ddp_spawn.py | 213 +++++ .../accelerators/plugins/training_type/dp.py | 44 + .../plugins/training_type/horovod.py | 148 +++ .../plugins/training_type/parallel.py | 91 ++ .../plugins/training_type/single_device.py | 40 + .../training_type/training_type_plugin.py | 93 ++ .../accelerators/scheduler_properties.py | 25 - 12 files changed, 889 insertions(+), 868 deletions(-) delete mode 100644 pytorch_lightning/accelerators/data_parallel.py create mode 100644 pytorch_lightning/accelerators/plugins/__init__.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/__init__.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/ddp.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/ddp2.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/dp.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/horovod.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/parallel.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/single_device.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py delete mode 100644 pytorch_lightning/accelerators/scheduler_properties.py diff --git a/pytorch_lightning/accelerators/data_parallel.py b/pytorch_lightning/accelerators/data_parallel.py deleted file mode 100644 index 02a748222732e..0000000000000 --- a/pytorch_lightning/accelerators/data_parallel.py +++ /dev/null @@ -1,843 +0,0 @@ -from abc import ABC, abstractmethod -import re -from contextlib import contextmanager, ExitStack - -from torch.optim.lr_scheduler import _LRScheduler - -from pytorch_lightning.cluster_environments import TorchElasticEnvironment, ClusterEnvironment -from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.utilities import HOROVOD_AVAILABLE -from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load -from pytorch_lightning.accelerators.base_plugin import Plugin - -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities.seed import seed_everything -from 
pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.distributed.dist import LightningDistributed -import torch -import os -from pytorch_lightning.core.step_result import Result -from typing import Any, Dict, List, Optional, Union -from pytorch_lightning.overrides.data_parallel import LightningDataParallel, LightningDistributedDataParallel -import sys -from os.path import abspath -from time import sleep -import subprocess -from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only -import numpy as np -import torch.distributed as torch_distrib -from pytorch_lightning import _logger as log -import torch.multiprocessing as mp -from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn, rank_zero_info - -if HOROVOD_AVAILABLE: - import horovod.torch as hvd - -try: - from hydra.utils import to_absolute_path, get_original_cwd - from hydra.core.hydra_config import HydraConfig -except ImportError: - HYDRA_AVAILABLE = False -else: - HYDRA_AVAILABLE = True - -if torch.distributed.is_available(): - from torch.distributed import ReduceOp -else: - - class ReduceOp: - SUM = None - - -class TrainingTypePlugin(Plugin, ABC): - def __init__(self): - self._model = None - self._results = None - self.global_rank = 0 - - @property - @abstractmethod - def on_gpu(self): - raise NotImplementedError - - @property - @abstractmethod - def root_device(self) -> torch.device: - raise NotImplementedError - - @abstractmethod - def model_to_device(self): - raise NotImplementedError - - @property - @abstractmethod - def is_global_zero(self): - raise NotImplementedError - - @abstractmethod - def reduce(self, output, *args, **kwargs): - raise NotImplementedError - - @abstractmethod - def barrier(self, name: Optional[str] = None): - raise NotImplementedError - - @abstractmethod - def broadcast(self, obj: object, src: int = 0) -> object: - raise NotImplementedError - - # TODO method this is currently unused - def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): - if device_ids is None: - return - - # set the correct cuda visible devices (using pci order) - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) - devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) - log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') - - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - return should_stop - - @property - def model(self): - return self._model - - @model.setter - def model(self, new_model): - self._model = new_model - - @property - def lightning_module(self): - return self._model - - @property - def results(self): - """ - The results of the last training/testing run will be cached here. - In distributed training, we make sure to transfer the results to the appropriate master process. 
- """ - # TODO: improve these docs - return self._results - - @property - def rpc_enabled(self): - return False - - def start_training(self, trainer): - # double dispatch to initiate the training loop - self._results = trainer.train() - - def start_testing(self, trainer): - # double dispatch to initiate the test loop - self._results = trainer.run_test() - - -class SingleDevicePlugin(TrainingTypePlugin): - def __init__(self, device): - super().__init__() - self.device: torch.device = device - - @property - def on_gpu(self): - return self.device.type == "cuda" and torch.cuda.is_available() - - def reduce(self, output, *args, **kwargs): - return output - - @property - def root_device(self): - return self.device - - def model_to_device(self): - if self.on_gpu: - torch.cuda.set_device(self.root_device) - - self._model.to(self.root_device) - - def connect(self, model: torch.nn.Module): - self._model = model - self.model_to_device() - return self.model - - @property - def is_global_zero(self): - return True - - def barrier(self, *args, **kwargs): - pass - - def broadcast(self, obj: object, src: int = 0) -> object: - return obj - - -class ParallelPlugin(TrainingTypePlugin, ABC): - def __init__( - self, - parallel_devices: List[torch.device], - cluster_environment: Optional[ClusterEnvironment] = None, - ): - super().__init__() - self.parallel_devices = parallel_devices - self.local_rank = 0 - self.world_size = 1 - self.cluster_environment = cluster_environment - - @property - @abstractmethod - def root_device(self): - raise NotImplementedError - - @property - def on_gpu(self): - return self.root_device.type == "cuda" and torch.cuda.is_available() - - @abstractmethod - def setup(self, model): - raise NotImplementedError - - def connect(self, model, *args, **kwargs): - self.setup(model) - return self.model - - @property - def is_global_zero(self) -> bool: - return self.global_rank == 0 - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=len(self.parallel_devices), - rank=self.global_rank - ) - return distributed_sampler_kwargs - - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) - should_stop = self.reduce(should_stop, reduce_op=ReduceOp.SUM) - should_stop = bool(should_stop == self.world_size) - return should_stop - - @staticmethod - def configure_sync_batchnorm(model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) - return model - - @contextmanager - def block_backward_sync(self): - """ - Blocks ddp sync gradients behaviour on backwards pass. 
- This is useful for skipping sync when accumulating gradients, reducing communication overhead - Returns: context manager with sync behaviour off - """ - if isinstance(self.model, LightningDistributedDataParallel): - yield self.model.no_sync() - else: - yield None - - -class DataParallelPlugin(ParallelPlugin): - - def __init__(self, parallel_devices: List[torch.device]): - super().__init__(parallel_devices=parallel_devices, cluster_environment=None) - - def setup(self, model): - self._model = LightningDataParallel(model, self.parallel_devices) - - def reduce(self, output, *args, **kwargs): - if isinstance(output, Result): - output.dp_reduce() - - elif isinstance(output, torch.Tensor): - output = output.mean() - - return output - - @property - def root_device(self): - return self.parallel_devices[0] - - @property - def lightning_module(self): - return self._model.module - - def model_to_device(self): - # no need to do anything when model is wrapped in torch.nn.DataParallel - pass - - def barrier(self, *args, **kwargs): - pass - - def broadcast(self, obj: object, src: int = 0) -> object: - return obj - - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - return should_stop - - -class DDPPlugin(ParallelPlugin): - - distributed_backend = "ddp" - - def __init__( - self, - parallel_devices, - num_nodes=1, - cluster_environment: ClusterEnvironment = None, - sync_batchnorm=False, - **kwargs: Dict[str, Any], - ) -> None: - super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) - self.interactive_ddp_procs = [] - self.num_nodes = num_nodes - self.sync_batchnorm = sync_batchnorm - self.dist = LightningDistributed() - self._ddp_kwargs = kwargs - self._has_spawned_children = False - self.task_idx = None - self.node_rank = 0 - self.num_processes = len(parallel_devices) - - @property - def root_device(self): - return self.parallel_devices[self.local_rank] - - @property - def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=(self.num_nodes * self.num_processes), - rank=self.global_rank - ) - return distributed_sampler_kwargs - - def setup(self, model): - self._model = model - - # start the other scripts - # TODO: make sure this works, in torchelastic we should not launch child processes! 
- if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": - self._call_children_scripts() - - # set the task idx - self.task_idx = self.cluster_environment.local_rank() - - def _call_children_scripts(self): - - # bookkeeping of spawned processes - assert self.global_rank == 0 - self._check_can_spawn_children() - self._has_spawned_children = True - - # DDP Environment variables - os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") - os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) - - # allow the user to pass the node rank - node_rank = "0" - node_rank = os.environ.get("NODE_RANK", node_rank) - node_rank = os.environ.get("GROUP_RANK", node_rank) - os.environ["NODE_RANK"] = node_rank - os.environ["LOCAL_RANK"] = "0" - - # when user is using hydra find the absolute path - path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path - - # pull out the commands used to run the script and resolve the abs file path - command = sys.argv - try: - full_path = path_lib(command[0]) - except Exception as e: - full_path = abspath(command[0]) - - command[0] = full_path - # use the same python interpreter and actually running - command = [sys.executable] + command - - # the visible devices tell us how many GPUs we want to use. - # when the trainer script was called the device has already been scoped by the time - # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone - # but forward the GPUs selected via environment variables - if self.parallel_devices is None: - raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") - - os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices]) - os.environ["PL_IN_DDP_SUBPROCESS"] = "1" - - if self.lightning_module.logger is not None: - os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) - - num_gpus = len(self.parallel_devices) - os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" - - self.interactive_ddp_procs = [] - - for local_rank in range(1, self.num_processes): - env_copy = os.environ.copy() - env_copy["LOCAL_RANK"] = f"{local_rank}" - - # remove env var if global seed not set - if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: - del env_copy["PL_GLOBAL_SEED"] - - # start process - # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = None - if HYDRA_AVAILABLE: - if HydraConfig.initialized(): - cwd = get_original_cwd() - proc = subprocess.Popen(command, env=env_copy, cwd=cwd) - self.interactive_ddp_procs.append(proc) - - # starting all processes at once can cause issues - # with dataloaders delay between 1-10 seconds - delay = np.random.uniform(1, 5, 1)[0] - sleep(delay) - - def _check_can_spawn_children(self): - if self._has_spawned_children: - raise RuntimeError( - "You tried to run `.fit` or `.test` multiple times in the same script." - " This is not supported in DDP mode, switch to `distributed_backend='ddp_spawn'` instead." 
- ) - - def set_world_ranks(self): - self.local_rank = self.task_idx - self.node_rank = self.cluster_environment.node_rank() - self.global_rank = self.node_rank * self.num_processes + self.local_rank - self.world_size = self.num_nodes * self.num_processes - - def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self._model = LightningDistributedDataParallel( - self.model, - device_ids=self.determine_ddp_device_ids(), - **self._ddp_kwargs, - ) - - def determine_ddp_device_ids(self): - if self.root_device.type == "cpu": - return None - return [self.root_device.index] - - def init_ddp_connection(self, global_rank: int, world_size: int) -> None: - # TODO: From where to get cluster environment? - os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.on_gpu else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - - def pre_training(self): - # TODO: check if needed - seed = os.environ.get("PL_GLOBAL_SEED") - if seed is not None: - seed_everything(int(seed)) - - # determine which process we are and world size - self.set_world_ranks() - - # set warning rank - rank_zero_only.rank = self.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - self.init_ddp_connection(self.global_rank, self.world_size) - - # TODO: we moved it to the trainer.fit after calling pre_training - # ... need to double check that it is the correct place - # self.trainer.call_setup_hook(self.model) - - # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.distributed_backend}") - log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") - log.info("-" * 100) - - # set the ranks and devices - self.dist.rank = self.global_rank - self.dist.device = self.root_device - - if self.sync_batchnorm: - self.model = self.configure_sync_batchnorm(self.model) - - # move the model to the correct device - self.model_to_device() - - self.configure_ddp() - - self.barrier() - - def post_training(self): - if "WORLD_SIZE" in os.environ: - del os.environ["WORLD_SIZE"] - - def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def broadcast(self, obj: object, src: int = 0) -> object: - return self.dist.broadcast(obj) - - def model_to_device(self): - if self.root_device.type == "cuda": - torch.cuda.set_device(self.root_device) - self.model.to(self.root_device) - - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): - if isinstance(output, torch.Tensor): - output = sync_ddp_if_available(output, group, reduce_op) - return output - - -class DDPSpawnPlugin(ParallelPlugin): - - distributed_backend = "ddp_spawn" - - def __init__( - self, - parallel_devices, - num_nodes=1, - cluster_environment: ClusterEnvironment = None, - sync_batchnorm=False, - **kwargs: Dict[str, Any] - ): - super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) - self.num_nodes = num_nodes - self.sync_batchnorm = sync_batchnorm - self._ddp_kwargs = kwargs - self.dist = LightningDistributed() - self.num_processes = len(parallel_devices) - self.node_rank = 0 - self.mp_queue = None - - @property - def root_device(self): - return self.parallel_devices[self.local_rank] - - @property - def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=(self.num_nodes * self.num_processes), - rank=self.global_rank - ) - return distributed_sampler_kwargs - - def setup(self, model): - self._model = model - - os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) - - # pass in a state q - smp = mp.get_context('spawn') - self.mp_queue = smp.SimpleQueue() - - def set_world_ranks(self, process_idx): - self.local_rank = process_idx - self.node_rank = self.cluster_environment.node_rank() - self.global_rank = self.node_rank * self.num_processes + self.local_rank - self.world_size = self.num_nodes * self.num_processes - - def start_training(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) - - def start_testing(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer, )) - - def new_process(self, process_idx, trainer): - # TODO: check if needed - seed = os.environ.get("PL_GLOBAL_SEED") - if seed is not None: - seed_everything(int(seed)) - - self.set_world_ranks(process_idx) - - # set warning rank - rank_zero_only.rank = self.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - self.init_ddp_connection(self.global_rank, self.world_size) - - # TODO: we moved it to the trainer.fit after calling pre_training - # ... 
need to double check that it is the correct place - # self.trainer.call_setup_hook(self.model) - - # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): - log.info("-" * 100) - log.info(f"distributed_backend={self.distributed_backend}") - log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") - log.info("-" * 100) - - # set the ranks and devices - self.dist.rank = self.global_rank - self.dist.device = self.root_device - - if self.sync_batchnorm: - self.model = self.configure_sync_batchnorm(self.model) - - # move the model to the correct device - self.model_to_device() - - self.configure_ddp() - - self.barrier() - - if trainer.testing: - results = trainer.run_test() - else: - results = trainer.train() - - # persist info in ddp_spawn - self.transfer_distrib_spawn_state_on_fit_end(results) - - def post_training(self): - # restore main state with best weights - best_path = self.mp_queue.get() - last_path = self.mp_queue.get() - self._results = self.mp_queue.get() - - # recover the weights of the processes trained in the children - self.__recover_child_process_weights(best_path, last_path) - - def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( - self.model, - device_ids=self.determine_ddp_device_ids(), - **self._ddp_kwargs, - ) - - def init_ddp_connection(self, global_rank: int, world_size: int) -> None: - # TODO: this code is duplicated in DDP and DDPSpawn, make this a function - os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) - os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) - torch_backend = "nccl" if self.on_gpu else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - - def determine_ddp_device_ids(self): - if self.root_device.type == "cpu": - return None - return [self.root_device.index] - - def transfer_distrib_spawn_state_on_fit_end(self, results): - # TODO: is there a better way than accessing callback through model -> trainer -> callback? - best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path - - if self.global_rank == 0 and self.mp_queue is not None: - rank_zero_warn('cleaning up ddp environment...') - - # save the last weights - last_path = None - # TODO: is there a better way than accessing trainer through model -> trainer? - if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: - last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) - atomic_save(self.lightning_module.state_dict(), last_path) - - # todo, pass complete checkpoint as state dictionary - self.mp_queue.put(best_model_path) - self.mp_queue.put(last_path) - self.mp_queue.put(results) - - def __recover_child_process_weights(self, best_path, last_path): - # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
- # transfer back the best path to the trainer - if self.lightning_module.trainer.checkpoint_callback: - self.lightning_module.trainer.checkpoint_callback.best_model_path = best_path - # todo, pass also best score - - # load last weights - if last_path is not None and not self.lightning_module.trainer.testing: - ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) - self.lightning_module.load_state_dict(ckpt) - - def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def broadcast(self, obj: object, src: int = 0) -> object: - return self.dist.broadcast(obj) - - def model_to_device(self): - if self.root_device.type == "cuda": - torch.cuda.set_device(self.root_device) - self.model.to(self.root_device) - - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): - if isinstance(output, torch.Tensor): - output = sync_ddp_if_available(output, group, reduce_op) - return output - - -# TODO: DDP2 -class DDP2Plugin(DDPPlugin): - pass - - -class HorovodPlugin(ParallelPlugin): - - def __init__(self, parallel_devices: List[torch.device]): - super().__init__(parallel_devices=parallel_devices, cluster_environment=None) - - @property - def root_device(self): - return self.parallel_devices[self.local_rank] - - @property - def distributed_sampler_kwargs(self): - distributed_sampler_kwargs = dict( - num_replicas=hvd.size(), - rank=hvd.rank() - ) - return distributed_sampler_kwargs - - def setup(self, model): - self._model = model - - self.global_rank = hvd.rank() - self.local_rank = hvd.local_rank() - rank_zero_only.rank = self.global_rank - - self.model_to_device() - - def pre_training(self): - - def _unpack_lightning_optimizer(opt): - return opt._optimizer if isinstance(opt, LightningOptimizer) else opt - - optimizers = self.lightning_module.trainer.optimizers - optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers] - - # Horovod: scale the learning rate by the number of workers to account for - # increased total batch size - for optimizer in optimizers: - for param_group in optimizer.param_groups: - param_group['lr'] *= hvd.size() - - # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR - lr_schedulers = self.lightning_module.trainer.lr_schedulers - for scheduler in lr_schedulers: - scheduler = scheduler['scheduler'] - if isinstance(scheduler, _LRScheduler): - scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs] - - # Horovod: broadcast parameters & optimizer state to ensure consistent initialization - hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0) - for optimizer in optimizers: - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - - def _filter_named_parameters(model, optimizer): - opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])]) - return [(name, p) for name, p in model.named_parameters() if p in opt_params] - - # Horovod: wrap optimizers to perform gradient aggregation via allreduce - optimizers = [ - hvd.DistributedOptimizer(optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer)) - for optimizer in optimizers - ] - - optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) - self.lightning_module.trainer.optimizers = optimizers - - def start_training(self, trainer): - with ExitStack() as stack: - for optimizer in trainer.optimizers: - # Synchronization will be performed explicitly following backward() 
- stack.enter_context(optimizer.skip_synchronize()) - - # set up training routine - self._results = trainer.train() - - # Make sure all workers have finished training before returning to the user - hvd.join() - - def start_testing(self, trainer): - with ExitStack() as stack: - # set up training routine - # self.trainer.train_loop.setup_training(self.trainer.model) - self._results = trainer.run_test() - - # Make sure all workers have finished training before returning to the user - hvd.join() - - def barrier(self, *args, **kwargs): - hvd.join() - - def broadcast(self, obj: object, src: int = 0) -> object: - obj = hvd.broadcast_object(obj, src) - return obj - - def model_to_device(self): - if self.on_gpu: - torch.cuda.set_device(self.root_device) - self.model.to(self.root_device) - - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): - if group is not None: - raise ValueError( - "Horovod does not support allreduce using a subcommunicator at this time. " - "Unset `group`." - ) - - if reduce_op is None or reduce_op == "sum": - reduce_op = hvd.Sum - elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): - reduce_op = hvd.Average - else: - raise ValueError(f"unrecognized `reduce_op`: {reduce_op}") - - # sync all processes before reduction - hvd.join() - return hvd.allreduce(output, op=reduce_op) - - def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): - if group is not None: - raise ValueError( - "Horovod does not support allgather using a subcommunicator at this time. " - "Unset `group`." - ) - - if len(result.shape) == 0: - # Convert scalars to single dimension tensors - result = result.reshape(1) - - # sync and gather all - hvd.join() - gathered = hvd.allgather(result) - gathered_result = list(gathered.split(1, dim=0)) - return gathered_result diff --git a/pytorch_lightning/accelerators/plugins/__init__.py b/pytorch_lightning/accelerators/plugins/__init__.py new file mode 100644 index 0000000000000..119284ef33c76 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/__init__.py @@ -0,0 +1,3 @@ +from pytorch_lightning.accelerators.plugins.base_plugin import Plugin +from pytorch_lightning.accelerators.plugins.precision import * +from pytorch_lightning.accelerators.plugins.training_type import * diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py new file mode 100644 index 0000000000000..532ea418a40bd --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -0,0 +1,8 @@ +from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin +from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin +from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/accelerators/plugins/training_type/ddp.py new file mode 100644 index 0000000000000..ec275f227016a --- 
/dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp.py @@ -0,0 +1,244 @@
+import os
+import sys
+import subprocess
+from time import sleep
+import numpy as np
+from typing import Any, Dict, Optional, Union
+
+import torch
+import torch.distributed as torch_distrib
+
+from pytorch_lightning import _logger as log
+from pytorch_lightning.utilities import HYDRA_AVAILABLE
+from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment
+from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin
+from pytorch_lightning.distributed.dist import LightningDistributed
+from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
+from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only, sync_ddp_if_available
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.seed import seed_everything
+
+if HYDRA_AVAILABLE:
+    from hydra.utils import to_absolute_path, get_original_cwd
+    from hydra.core.hydra_config import HydraConfig
+
+if torch.distributed.is_available():
+    from torch.distributed import ReduceOp
+else:
+
+    class ReduceOp:
+        SUM = None
+
+
+class DDPPlugin(ParallelPlugin):
+
+    distributed_backend = "ddp"
+
+    def __init__(
+        self,
+        parallel_devices,
+        num_nodes=1,
+        cluster_environment: ClusterEnvironment = None,
+        sync_batchnorm=False,
+        **kwargs: Dict[str, Any],
+    ) -> None:
+        super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment)
+        self.interactive_ddp_procs = []
+        self.num_nodes = num_nodes
+        self.sync_batchnorm = sync_batchnorm
+        self.dist = LightningDistributed()
+        self._ddp_kwargs = kwargs
+        self._has_spawned_children = False
+        self.task_idx = None
+        self.node_rank = 0
+        self.num_processes = len(parallel_devices)
+
+    @property
+    def root_device(self):
+        return self.parallel_devices[self.local_rank]
+
+    @property
+    def lightning_module(self):
+        # the model may not be wrapped with DistributedDataParallel if calling this too early
+        return getattr(self._model, "module", self._model)
+
+    @property
+    def distributed_sampler_kwargs(self):
+        distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)
+        return distributed_sampler_kwargs
+
+    def setup(self, model):
+        self._model = model
+
+        # start the other scripts
+        # TODO: make sure this works, in torchelastic we should not launch child processes!
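+        # the launching process (which does not have PL_IN_DDP_SUBPROCESS set) spawns the
+        # child scripts below; the children inherit PL_IN_DDP_SUBPROCESS=1 and skip this branch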
+ if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": + self._call_children_scripts() + + # set the task idx + self.task_idx = self.cluster_environment.local_rank() + + def _call_children_scripts(self): + + # bookkeeping of spawned processes + assert self.global_rank == 0 + self._check_can_spawn_children() + self._has_spawned_children = True + + # DDP Environment variables + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1") + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) + + # allow the user to pass the node rank + node_rank = "0" + node_rank = os.environ.get("NODE_RANK", node_rank) + node_rank = os.environ.get("GROUP_RANK", node_rank) + os.environ["NODE_RANK"] = node_rank + os.environ["LOCAL_RANK"] = "0" + + # when user is using hydra find the absolute path + path_lib = os.path.abspath if not HYDRA_AVAILABLE else to_absolute_path + + # pull out the commands used to run the script and resolve the abs file path + command = sys.argv + try: + full_path = path_lib(command[0]) + except Exception as e: + full_path = os.path.abspath(command[0]) + + command[0] = full_path + # use the same python interpreter and actually running + command = [sys.executable] + command + + # the visible devices tell us how many GPUs we want to use. + # when the trainer script was called the device has already been scoped by the time + # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone + # but forward the GPUs selected via environment variables + if self.parallel_devices is None: + raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)") + + os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices]) + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" + + if self.lightning_module.logger is not None: + os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version) + + num_gpus = len(self.parallel_devices) + os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}" + + self.interactive_ddp_procs = [] + + for local_rank in range(1, self.num_processes): + env_copy = os.environ.copy() + env_copy["LOCAL_RANK"] = f"{local_rank}" + + # remove env var if global seed not set + if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: + del env_copy["PL_GLOBAL_SEED"] + + # start process + # if hydra is available and initialized, make sure to set the cwd correctly + cwd: Optional[str] = None + if HYDRA_AVAILABLE: + if HydraConfig.initialized(): + cwd = get_original_cwd() + proc = subprocess.Popen(command, env=env_copy, cwd=cwd) + self.interactive_ddp_procs.append(proc) + + # starting all processes at once can cause issues + # with dataloaders delay between 1-10 seconds + delay = np.random.uniform(1, 5, 1)[0] + sleep(delay) + + def _check_can_spawn_children(self): + if self._has_spawned_children: + raise RuntimeError( + "You tried to run `.fit` or `.test` multiple times in the same script." + " This is not supported in DDP mode, switch to `distributed_backend='ddp_spawn'` instead." 
+ ) + + def set_world_ranks(self): + self.local_rank = self.task_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank * self.num_processes + self.local_rank + self.world_size = self.num_nodes * self.num_processes + + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self._model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def determine_ddp_device_ids(self): + if self.root_device.type == "cpu": + return None + return [self.root_device.index] + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + # TODO: From where to get cluster environment? + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def pre_training(self): + # TODO: check if needed + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + # determine which process we are and world size + self.set_world_ranks() + + # set warning rank + rank_zero_only.rank = self.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + self.init_ddp_connection(self.global_rank, self.world_size) + + # TODO: we moved it to the trainer.fit after calling pre_training + # ... need to double check that it is the correct place + # self.trainer.call_setup_hook(self.model) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + # set the ranks and devices + self.dist.rank = self.global_rank + self.dist.device = self.root_device + + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + self.configure_ddp() + + self.barrier() + + def post_training(self): + if "WORLD_SIZE" in os.environ: + del os.environ["WORLD_SIZE"] + + def barrier(self, *args, **kwargs): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def broadcast(self, obj: object, src: int = 0) -> object: + return self.dist.broadcast(obj) + + def model_to_device(self): + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if isinstance(output, torch.Tensor): + output = sync_ddp_if_available(output, group, reduce_op) + return output diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py b/pytorch_lightning/accelerators/plugins/training_type/ddp2.py new file mode 100644 index 0000000000000..078dfe6cd6ec1 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp2.py @@ -0,0 +1,5 @@ +from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin + +# TODO: DDP2 +class DDP2Plugin(DDPPlugin): + pass \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py new file mode 100644 index 0000000000000..e2c61bfe6e3fd --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py @@ -0,0 +1,213 @@ +import re +import os +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from typing import Any, Dict, Optional, Union +import torch + +import torch.multiprocessing as mp +import torch.distributed as torch_distrib + +from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load +from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn +from pytorch_lightning.utilities.seed import seed_everything + +from pytorch_lightning import _logger as log + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + + +class DDPSpawnPlugin(ParallelPlugin): + + distributed_backend = "ddp_spawn" + + def __init__( + self, + parallel_devices, + num_nodes=1, + cluster_environment: ClusterEnvironment = None, + sync_batchnorm=False, + **kwargs: Dict[str, Any], + ): + super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) + self.num_nodes = num_nodes + self.sync_batchnorm = sync_batchnorm + self._ddp_kwargs = kwargs + self.dist = LightningDistributed() + self.num_processes = len(parallel_devices) + self.node_rank = 0 + self.mp_queue = None + + @property + def root_device(self): + return self.parallel_devices[self.local_rank] + + @property + def lightning_module(self): + # the model may not be wrapped with DistributedDataParallel if 
calling this too early + return getattr(self._model, "module", self._model) + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) + return distributed_sampler_kwargs + + def setup(self, model): + self._model = model + + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", str(find_free_network_port())) + + # pass in a state q + smp = mp.get_context("spawn") + self.mp_queue = smp.SimpleQueue() + + def set_world_ranks(self, process_idx): + self.local_rank = process_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank * self.num_processes + self.local_rank + self.world_size = self.num_nodes * self.num_processes + + def start_training(self, trainer): + mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) + + def start_testing(self, trainer): + mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) + + def new_process(self, process_idx, trainer): + # TODO: check if needed + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + self.init_ddp_connection(self.global_rank, self.world_size) + + # TODO: we moved it to the trainer.fit after calling pre_training + # ... need to double check that it is the correct place + # self.trainer.call_setup_hook(self.model) + + # on world_size=0 let everyone know training is starting + if self.is_global_zero and not torch.distributed.is_initialized(): + log.info("-" * 100) + log.info(f"distributed_backend={self.distributed_backend}") + log.info(f"All DDP processes registered. 
Starting ddp with {self.world_size} processes") + log.info("-" * 100) + + # set the ranks and devices + self.dist.rank = self.global_rank + self.dist.device = self.root_device + + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) + + # move the model to the correct device + self.model_to_device() + + self.configure_ddp() + + self.barrier() + + if trainer.testing: + results = trainer.run_test() + else: + results = trainer.train() + + # persist info in ddp_spawn + self.transfer_distrib_spawn_state_on_fit_end(results) + + def post_training(self): + # restore main state with best weights + best_path = self.mp_queue.get() + last_path = self.mp_queue.get() + self._results = self.mp_queue.get() + + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(best_path, last_path) + + def configure_ddp(self): + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + self.model = LightningDistributedDataParallel( + self.model, + device_ids=self.determine_ddp_device_ids(), + **self._ddp_kwargs, + ) + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + # TODO: this code is duplicated in DDP and DDPSpawn, make this a function + os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) + os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) + os.environ["WORLD_SIZE"] = str(self.cluster_environment.world_size()) + torch_backend = "nccl" if self.on_gpu else "gloo" + + if not torch.distributed.is_initialized(): + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + + def determine_ddp_device_ids(self): + if self.root_device.type == "cpu": + return None + return [self.root_device.index] + + def transfer_distrib_spawn_state_on_fit_end(self, results): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? + best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path + + if self.global_rank == 0 and self.mp_queue is not None: + rank_zero_warn("cleaning up ddp environment...") + + # save the last weights + last_path = None + # TODO: is there a better way than accessing trainer through model -> trainer? + if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path) + atomic_save(self.lightning_module.state_dict(), last_path) + + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) + self.mp_queue.put(last_path) + self.mp_queue.put(results) + + def __recover_child_process_weights(self, best_path, last_path): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? 
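+        # the spawned workers live in separate processes, so the main process only sees the
+        # checkpoint paths and results handed back through the SimpleQueue in post_training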
+ # transfer back the best path to the trainer + if self.lightning_module.trainer.checkpoint_callback: + self.lightning_module.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also best score + + # load last weights + if last_path is not None and not self.lightning_module.trainer.testing: + ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) + self.lightning_module.load_state_dict(ckpt) + + def barrier(self, *args, **kwargs): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def broadcast(self, obj: object, src: int = 0) -> object: + return self.dist.broadcast(obj) + + def model_to_device(self): + if self.root_device.type == "cuda": + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if isinstance(output, torch.Tensor): + output = sync_ddp_if_available(output, group, reduce_op) + return output \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/dp.py b/pytorch_lightning/accelerators/plugins/training_type/dp.py new file mode 100644 index 0000000000000..0c50d077633af --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/dp.py @@ -0,0 +1,44 @@ +from typing import List + +import torch +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.core.step_result import Result +from pytorch_lightning.overrides.data_parallel import LightningDataParallel + +class DataParallelPlugin(ParallelPlugin): + + def __init__(self, parallel_devices: List[torch.device]): + super().__init__(parallel_devices=parallel_devices, cluster_environment=None) + + def setup(self, model): + self._model = LightningDataParallel(model, self.parallel_devices) + + def reduce(self, output, *args, **kwargs): + if isinstance(output, Result): + output.dp_reduce() + + elif isinstance(output, torch.Tensor): + output = output.mean() + + return output + + @property + def root_device(self): + return self.parallel_devices[0] + + @property + def lightning_module(self): + return self._model.module + + def model_to_device(self): + # no need to do anything when model is wrapped in torch.nn.DataParallel + pass + + def barrier(self, *args, **kwargs): + pass + + def broadcast(self, obj: object, src: int = 0) -> object: + return obj + + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + return should_stop \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/horovod.py b/pytorch_lightning/accelerators/plugins/training_type/horovod.py new file mode 100644 index 0000000000000..72e14c1a6a790 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/horovod.py @@ -0,0 +1,148 @@ +from contextlib import ExitStack +from pytorch_lightning.utilities.distributed import rank_zero_only +from typing import Any, List, Optional, Union + +import torch +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.utilities import HOROVOD_AVAILABLE +from pytorch_lightning.core.optimizer import LightningOptimizer +from torch.optim.lr_scheduler import _LRScheduler + +if HOROVOD_AVAILABLE: + import horovod.torch as hvd + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + + +class HorovodPlugin(ParallelPlugin): + def __init__(self, parallel_devices: List[torch.device]): + 
super().__init__(parallel_devices=parallel_devices, cluster_environment=None) + + @property + def root_device(self): + return self.parallel_devices[self.local_rank] + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict(num_replicas=hvd.size(), rank=hvd.rank()) + return distributed_sampler_kwargs + + def setup(self, model): + self._model = model + + self.global_rank = hvd.rank() + self.local_rank = hvd.local_rank() + rank_zero_only.rank = self.global_rank + + self.model_to_device() + + def pre_training(self): + def _unpack_lightning_optimizer(opt): + return opt._optimizer if isinstance(opt, LightningOptimizer) else opt + + optimizers = self.lightning_module.trainer.optimizers + optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers] + + # Horovod: scale the learning rate by the number of workers to account for + # increased total batch size + for optimizer in optimizers: + for param_group in optimizer.param_groups: + param_group["lr"] *= hvd.size() + + # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR + lr_schedulers = self.lightning_module.trainer.lr_schedulers + for scheduler in lr_schedulers: + scheduler = scheduler["scheduler"] + if isinstance(scheduler, _LRScheduler): + scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs] + + # Horovod: broadcast parameters & optimizer state to ensure consistent initialization + hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0) + for optimizer in optimizers: + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + def _filter_named_parameters(model, optimizer): + opt_params = set([p for group in optimizer.param_groups for p in group.get("params", [])]) + return [(name, p) for name, p in model.named_parameters() if p in opt_params] + + # Horovod: wrap optimizers to perform gradient aggregation via allreduce + optimizers = [ + hvd.DistributedOptimizer( + optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer) + ) + for optimizer in optimizers + ] + + optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) + self.lightning_module.trainer.optimizers = optimizers + + def start_training(self, trainer): + with ExitStack() as stack: + for optimizer in trainer.optimizers: + # Synchronization will be performed explicitly following backward() + stack.enter_context(optimizer.skip_synchronize()) + + # set up training routine + self._results = trainer.train() + + # Make sure all workers have finished training before returning to the user + hvd.join() + + def start_testing(self, trainer): + with ExitStack() as stack: + # set up training routine + # self.trainer.train_loop.setup_training(self.trainer.model) + self._results = trainer.run_test() + + # Make sure all workers have finished training before returning to the user + hvd.join() + + def barrier(self, *args, **kwargs): + hvd.join() + + def broadcast(self, obj: object, src: int = 0) -> object: + obj = hvd.broadcast_object(obj, src) + return obj + + def model_to_device(self): + if self.on_gpu: + torch.cuda.set_device(self.root_device) + self.model.to(self.root_device) + + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + if group is not None: + raise ValueError( + "Horovod does not support allreduce using a subcommunicator at this time. " "Unset `group`." 
+ ) + + if reduce_op is None or reduce_op == "sum": + reduce_op = hvd.Sum + elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): + reduce_op = hvd.Average + else: + raise ValueError(f"unrecognized `reduce_op`: {reduce_op}") + + # sync all processes before reduction + hvd.join() + return hvd.allreduce(output, op=reduce_op) + + def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): + if group is not None: + raise ValueError( + "Horovod does not support allgather using a subcommunicator at this time. " "Unset `group`." + ) + + if len(result.shape) == 0: + # Convert scalars to single dimension tensors + result = result.reshape(1) + + # sync and gather all + hvd.join() + gathered = hvd.allgather(result) + gathered_result = list(gathered.split(1, dim=0)) + return gathered_result diff --git a/pytorch_lightning/accelerators/plugins/training_type/parallel.py b/pytorch_lightning/accelerators/plugins/training_type/parallel.py new file mode 100644 index 0000000000000..fd366f677b55f --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/parallel.py @@ -0,0 +1,91 @@ +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import List, Optional +import torch +from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core import LightningModule +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class ReduceOp: + SUM = None + +class ParallelPlugin(TrainingTypePlugin, ABC): + def __init__( + self, + parallel_devices: List[torch.device], + cluster_environment: Optional[ClusterEnvironment] = None, + ): + super().__init__() + self.parallel_devices = parallel_devices + self.local_rank = 0 + self.world_size = 1 + self.cluster_environment = cluster_environment + + @property + @abstractmethod + def root_device(self): + raise NotImplementedError + + @property + def on_gpu(self): + return self.root_device.type == "cuda" and torch.cuda.is_available() + + @abstractmethod + def setup(self, model): + raise NotImplementedError + + def connect(self, model, *args, **kwargs): + self.setup(model) + return self.model + + @property + def is_global_zero(self) -> bool: + return self.global_rank == 0 + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=len(self.parallel_devices), + rank=self.global_rank + ) + return distributed_sampler_kwargs + + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) + should_stop = self.reduce(should_stop, reduce_op=ReduceOp.SUM) + should_stop = bool(should_stop == self.world_size) + return should_stop + + @staticmethod + def configure_sync_batchnorm(model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. 
+ + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + return model + + @contextmanager + def block_backward_sync(self): + """ + Blocks ddp sync gradients behaviour on backwards pass. + This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + if isinstance(self.model, LightningDistributedDataParallel): + yield self.model.no_sync() + else: + yield None \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/single_device.py b/pytorch_lightning/accelerators/plugins/training_type/single_device.py new file mode 100644 index 0000000000000..2e674ef87fbb4 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/single_device.py @@ -0,0 +1,40 @@ +import torch +from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin + + +class SingleDevicePlugin(TrainingTypePlugin): + def __init__(self, device): + super().__init__() + self.device: torch.device = device + + @property + def on_gpu(self): + return self.device.type == "cuda" and torch.cuda.is_available() + + def reduce(self, output, *args, **kwargs): + return output + + @property + def root_device(self): + return self.device + + def model_to_device(self): + if self.on_gpu: + torch.cuda.set_device(self.root_device) + + self._model.to(self.root_device) + + def connect(self, model: torch.nn.Module): + self._model = model + self.model_to_device() + return self.model + + @property + def is_global_zero(self): + return True + + def barrier(self, *args, **kwargs): + pass + + def broadcast(self, obj: object, src: int = 0) -> object: + return obj \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py b/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py new file mode 100644 index 0000000000000..94d4dbf9d3409 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py @@ -0,0 +1,93 @@ +import os + +from abc import ABC, abstractmethod +from typing import Optional +import torch + +from pytorch_lightning.accelerators.plugins.base_plugin import Plugin + +from pytorch_lightning import _logger as log + +class TrainingTypePlugin(Plugin, ABC): + def __init__(self): + self._model = None + self._results = None + self.global_rank = 0 + + @property + @abstractmethod + def on_gpu(self): + raise NotImplementedError + + @property + @abstractmethod + def root_device(self) -> torch.device: + raise NotImplementedError + + @abstractmethod + def model_to_device(self): + raise NotImplementedError + + @property + @abstractmethod + def is_global_zero(self): + raise NotImplementedError + + @abstractmethod + def reduce(self, output, *args, **kwargs): + raise NotImplementedError + + @abstractmethod + def barrier(self, name: Optional[str] = None): + raise NotImplementedError + + @abstractmethod + def broadcast(self, obj: object, src: int = 0) -> object: + raise NotImplementedError + + # TODO method this is currently unused + def set_nvidia_flags(self, is_slurm_managing_tasks, device_ids): + if device_ids is None: + return + + # set the correct cuda visible devices (using pci order) + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) + devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) + 
log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]') + + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + return should_stop + + @property + def model(self): + return self._model + + @model.setter + def model(self, new_model): + self._model = new_model + + @property + def lightning_module(self): + return self._model + + @property + def results(self): + """ + The results of the last training/testing run will be cached here. + In distributed training, we make sure to transfer the results to the appropriate master process. + """ + # TODO: improve these docs + return self._results + + @property + def rpc_enabled(self): + return False + + def start_training(self, trainer): + # double dispatch to initiate the training loop + self._results = trainer.train() + + def start_testing(self, trainer): + # double dispatch to initiate the test loop + self._results = trainer.run_test() diff --git a/pytorch_lightning/accelerators/scheduler_properties.py b/pytorch_lightning/accelerators/scheduler_properties.py deleted file mode 100644 index 37dbdd13c3c58..0000000000000 --- a/pytorch_lightning/accelerators/scheduler_properties.py +++ /dev/null @@ -1,25 +0,0 @@ -from torch import optim - - -def reinit_scheduler_properties(optimizers: list, schedulers: list): - # Reinitialize optimizer.step properties added by schedulers - for scheduler in schedulers: - scheduler = scheduler['scheduler'] - - for optimizer in optimizers: - state = None - idx = 0 - - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): - idx = i - state = scheduler.state_dict() - else: - state = None - - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) - if state is not None: - scheduler.load_state_dict(state) \ No newline at end of file From ee53c90fd06fef04cfec7f22feb73cd9e720d5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 7 Jan 2021 21:21:02 +0100 Subject: [PATCH 100/157] fix all new import errors --- pytorch_lightning/accelerators/__init__.py | 4 ++++ pytorch_lightning/accelerators/accelerator.py | 8 ++------ .../accelerators/accelerator_connector.py | 16 +++++++-------- pytorch_lightning/accelerators/cpu.py | 2 +- .../plugins/precision/apex_amp.py | 4 ++-- .../accelerators/plugins/training_type/ddp.py | 20 +++++++++++++------ .../plugins/training_type/horovod.py | 4 ++-- .../cluster_environment.py | 2 +- pytorch_lightning/plugins/old/apex.py | 2 +- pytorch_lightning/plugins/old/ddp_plugin.py | 2 +- .../plugins/old/ddp_sequential_plugin.py | 2 +- pytorch_lightning/plugins/old/native_amp.py | 2 +- .../plugins/old/plugin_connector.py | 10 +++++----- .../plugins/old/precision_plugin.py | 2 +- pytorch_lightning/plugins/old/rpc_plugin.py | 2 +- .../plugins/old/sharded_native_amp_plugin.py | 2 +- .../plugins/old/sharded_plugin.py | 4 ++-- .../trainer/connectors/precision_connector.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 5 +---- pytorch_lightning/trainer/training_loop.py | 2 +- tests/backends/test_accelerator_connector.py | 4 ++-- tests/models/test_gpu.py | 4 +--- tests/models/test_horovod.py | 3 +-- tests/models/test_tpu.py | 2 +- tests/plugins/test_plugin_properties.py | 2 +- 25 files changed, 58 insertions(+), 56 deletions(-) diff --git a/pytorch_lightning/accelerators/__init__.py 
b/pytorch_lightning/accelerators/__init__.py index e69de29bb2d1d..2ec118303d153 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -0,0 +1,4 @@ +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.cpu import CPUAccelerator +from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.tpu import TPUAccelerator diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 81eb112206d28..f9b18304316ef 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,8 +1,4 @@ -import os - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.data_parallel import TrainingTypePlugin, HorovodPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.accelerators.plugins import TrainingTypePlugin, HorovodPlugin from pytorch_lightning.utilities import AMPType from typing import Any import math @@ -11,7 +7,7 @@ from torch.optim import Optimizer from pytorch_lightning.core import LightningModule -from pytorch_lightning.accelerators.precision import ( +from pytorch_lightning.accelerators.plugins.precision import ( ApexMixedPrecisionPlugin, MixedPrecisionPlugin, NativeMixedPrecisionPlugin, diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index e03e51cbba6ed..e3467e4be3617 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -18,11 +18,11 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ +from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin, DDP2Plugin, HorovodPlugin -from pytorch_lightning.accelerators.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin +from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus -from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, device_parser +from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -40,9 +40,9 @@ try: import horovod.torch as hvd except (ModuleNotFoundError, ImportError): - HOROVOD_AVAILABLE = False + _HOROVOD_AVAILABLE = False else: - HOROVOD_AVAILABLE = True + _HOROVOD_AVAILABLE = True class BackendConnector(object): @@ -180,7 +180,7 @@ def select_precision_plugin(self): elif self.precision == 16: if self.amp_type == 'native': - if not NATIVE_AMP_AVAILABLE: + if not _NATIVE_AMP_AVAILABLE: rank_zero_warn('You have asked for native AMP but your PyTorch version does not support it.' ' Consider upgrading with `pip install torch>=1.6`.' 
' We will attempt to use NVIDIA Apex for this session.') @@ -191,7 +191,7 @@ def select_precision_plugin(self): return NativeMixedPrecisionPlugin() if self.amp_type =='apex': - if not APEX_AVAILABLE: + if not _APEX_AVAILABLE: rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') else: @@ -371,7 +371,7 @@ def _set_horovod_backend(self): def check_horovod(self): """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" - if not HOROVOD_AVAILABLE: + if not _HOROVOD_AVAILABLE: raise MisconfigurationException( 'Requested `distributed_backend="horovod"`, but Horovod is not installed.' "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]" diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index e9f49e20a464f..820fab6d7d0f8 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.precision import MixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins import MixedPrecisionPlugin from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py index 9bb749bf18dbb..08b4fe7906732 100644 --- a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py @@ -3,10 +3,10 @@ import torch from torch.optim import Optimizer from pytorch_lightning.core import LightningModule -from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities import AMPType, _APEX_AVAILABLE, rank_zero_warn from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin -if APEX_AVAILABLE: +if _APEX_AVAILABLE: from apex import amp class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/accelerators/plugins/training_type/ddp.py index ec275f227016a..4e865a959ae73 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp.py @@ -3,13 +3,14 @@ import subprocess from time import sleep import numpy as np -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Union import torch import torch.distributed as torch_distrib from pytorch_lightning import _logger as log -from pytorch_lightning.utilities import HYDRA_AVAILABLE +from pytorch_lightning.distributed import LightningDistributed +from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel @@ -17,10 +18,17 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything -if HYDRA_AVAILABLE: +if _HYDRA_AVAILABLE: from hydra.utils import to_absolute_path, get_original_cwd from hydra.core.hydra_config import HydraConfig +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + + class 
ReduceOp: + SUM = None + class DDPPlugin(ParallelPlugin): @@ -38,7 +46,7 @@ def __init__( self.interactive_ddp_procs = [] self.num_nodes = num_nodes self.sync_batchnorm = sync_batchnorm - self.dist = LightningDistributedDataParallel() + self.dist = LightningDistributed() self._ddp_kwargs = kwargs self._has_spawned_children = False self.task_idx = None @@ -89,7 +97,7 @@ def _call_children_scripts(self): os.environ["LOCAL_RANK"] = "0" # when user is using hydra find the absolute path - path_lib = os.path.abspath if not HYDRA_AVAILABLE else to_absolute_path + path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path # pull out the commands used to run the script and resolve the abs file path command = sys.argv @@ -131,7 +139,7 @@ def _call_children_scripts(self): # start process # if hydra is available and initialized, make sure to set the cwd correctly cwd: Optional[str] = None - if HYDRA_AVAILABLE: + if _HYDRA_AVAILABLE: if HydraConfig.initialized(): cwd = get_original_cwd() proc = subprocess.Popen(command, env=env_copy, cwd=cwd) diff --git a/pytorch_lightning/accelerators/plugins/training_type/horovod.py b/pytorch_lightning/accelerators/plugins/training_type/horovod.py index 72e14c1a6a790..fee77f762fde1 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/horovod.py +++ b/pytorch_lightning/accelerators/plugins/training_type/horovod.py @@ -4,11 +4,11 @@ import torch from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin -from pytorch_lightning.utilities import HOROVOD_AVAILABLE +from pytorch_lightning.utilities import _HOROVOD_AVAILABLE from pytorch_lightning.core.optimizer import LightningOptimizer from torch.optim.lr_scheduler import _LRScheduler -if HOROVOD_AVAILABLE: +if _HOROVOD_AVAILABLE: import horovod.torch as hvd if torch.distributed.is_available(): diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index 6de290cd63ee9..8652d701dbf83 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
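The `_HOROVOD_AVAILABLE`, `_HYDRA_AVAILABLE`, `_NATIVE_AMP_AVAILABLE` and `_APEX_AVAILABLE` flags renamed in the hunks above all follow the same guarded-import shape: the optional package is imported once at module import time, the flag records whether that import succeeded, and call sites branch on the flag instead of importing again. A minimal, self-contained sketch of the pattern, using Horovod as in the plugin above; the module-level `check_horovod` function and the plain `RuntimeError` are illustrative stand-ins for the connector's `check_horovod` method and its `MisconfigurationException`:

    # Guarded optional import: the flag is assigned exactly once, at import time.
    try:
        import horovod.torch as hvd  # noqa: F401
    except (ModuleNotFoundError, ImportError):
        _HOROVOD_AVAILABLE = False
    else:
        _HOROVOD_AVAILABLE = True


    def check_horovod() -> None:
        # Call sites test the flag rather than re-importing the package, so the
        # module stays importable on machines without the optional dependency.
        if not _HOROVOD_AVAILABLE:
            raise RuntimeError(
                "Requested the Horovod backend, but Horovod is not installed. "
                "Install with: HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]"
            )
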
-from pytorch_lightning.plugins.plugin import LightningPlugin +from pytorch_lightning.plugins.old.plugin import LightningPlugin class ClusterEnvironment(LightningPlugin): diff --git a/pytorch_lightning/plugins/old/apex.py b/pytorch_lightning/plugins/old/apex.py index f80461e5d4fe5..d917924eb0960 100644 --- a/pytorch_lightning/plugins/old/apex.py +++ b/pytorch_lightning/plugins/old/apex.py @@ -17,7 +17,7 @@ from torch.optim.optimizer import Optimizer from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.old.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType from pytorch_lightning.utilities.distributed import rank_zero_warn diff --git a/pytorch_lightning/plugins/old/ddp_plugin.py b/pytorch_lightning/plugins/old/ddp_plugin.py index f0da9e5ff1a2d..360479de5a665 100644 --- a/pytorch_lightning/plugins/old/ddp_plugin.py +++ b/pytorch_lightning/plugins/old/ddp_plugin.py @@ -22,7 +22,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.data_parallel import LightningDistributedModule, prepare_for_backward -from pytorch_lightning.plugins.plugin import LightningPlugin +from pytorch_lightning.plugins.old.plugin import LightningPlugin from pytorch_lightning.utilities import DeviceType diff --git a/pytorch_lightning/plugins/old/ddp_sequential_plugin.py b/pytorch_lightning/plugins/old/ddp_sequential_plugin.py index 82250d1ed9fdd..dc39d648d2f13 100644 --- a/pytorch_lightning/plugins/old/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/old/ddp_sequential_plugin.py @@ -21,7 +21,7 @@ from pytorch_lightning import LightningModule from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.old.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/plugins/old/native_amp.py b/pytorch_lightning/plugins/old/native_amp.py index 4df5d128476a4..832d6acc672b4 100644 --- a/pytorch_lightning/plugins/old/native_amp.py +++ b/pytorch_lightning/plugins/old/native_amp.py @@ -16,7 +16,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.old.precision_plugin import PrecisionPlugin class NativeAMPPlugin(PrecisionPlugin): diff --git a/pytorch_lightning/plugins/old/plugin_connector.py b/pytorch_lightning/plugins/old/plugin_connector.py index e1071fa24ec04..77dae1229743e 100644 --- a/pytorch_lightning/plugins/old/plugin_connector.py +++ b/pytorch_lightning/plugins/old/plugin_connector.py @@ -15,11 +15,11 @@ from typing import List, Optional, Union from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.plugins.apex import ApexPlugin -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin -from pytorch_lightning.plugins.plugin import LightningPlugin -from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin +from pytorch_lightning.plugins.old.apex import ApexPlugin +from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin +from 
pytorch_lightning.plugins.old.plugin import LightningPlugin +from pytorch_lightning.plugins.old.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import AMPType, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/plugins/old/precision_plugin.py b/pytorch_lightning/plugins/old/precision_plugin.py index aaac3ede3c623..69d8e3670678d 100644 --- a/pytorch_lightning/plugins/old/precision_plugin.py +++ b/pytorch_lightning/plugins/old/precision_plugin.py @@ -15,7 +15,7 @@ from torch.optim import Optimizer -from pytorch_lightning.plugins.plugin import LightningPlugin +from pytorch_lightning.plugins.old.plugin import LightningPlugin class PrecisionPlugin(LightningPlugin): diff --git a/pytorch_lightning/plugins/old/rpc_plugin.py b/pytorch_lightning/plugins/old/rpc_plugin.py index fd3825a343463..4445b1d35970e 100644 --- a/pytorch_lightning/plugins/old/rpc_plugin.py +++ b/pytorch_lightning/plugins/old/rpc_plugin.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE DEFAULT_RPC_TIMEOUT_SEC = 60. diff --git a/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py index 5ddd29521203d..c29821dcd8a8d 100644 --- a/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py @@ -15,7 +15,7 @@ from torch.optim import Optimizer -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/old/sharded_plugin.py b/pytorch_lightning/plugins/old/sharded_plugin.py index ec1500ca7abf4..19e0859587585 100644 --- a/pytorch_lightning/plugins/old/sharded_plugin.py +++ b/pytorch_lightning/plugins/old/sharded_plugin.py @@ -15,8 +15,8 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.sharded_native_amp_plugin import ShardedNativeAMPPlugin +from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.old.sharded_native_amp_plugin import ShardedNativeAMPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, AMPType, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/trainer/connectors/precision_connector.py b/pytorch_lightning/trainer/connectors/precision_connector.py index 78f1635fb7f4d..af8db214eff9d 100644 --- a/pytorch_lightning/trainer/connectors/precision_connector.py +++ b/pytorch_lightning/trainer/connectors/precision_connector.py @@ -13,8 +13,8 @@ # limitations under the License. 
from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.apex import ApexPlugin -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.old.apex import ApexPlugin +from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, AMPType, rank_zero_warn diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4d0718c5e2b48..5bf2fdcea7991 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -16,7 +16,6 @@ import os from pytorch_lightning.core.memory import ModelSummary -from pytorch_lightning.accelerators.precision import PrecisionPlugin import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union @@ -25,7 +24,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.core.memory import ModelSummary +from pytorch_lightning.plugins.old.plugin_connector import PluginConnector from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.callbacks import Callback from pytorch_lightning.accelerators.accelerator_connector import BackendConnector @@ -34,7 +33,6 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.loggers import LightningLoggerBase -from pytorch_lightning.plugins.plugin_connector import PluginConnector from pytorch_lightning.profiler import BaseProfiler from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import ConfigValidator @@ -78,7 +76,6 @@ from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.model_utils import is_overridden from pytorch_lightning.trainer.properties import TrainerProperties -from pytorch_lightning.plugins.plugin_connector import PluginConnector from pytorch_lightning.accelerators.accelerator import Accelerator # warnings to ignore in trainer diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 7c010ba72c137..b3510f0f400fe 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -18,7 +18,7 @@ import numpy as np import torch -from pytorch_lightning.accelerators.data_parallel import ParallelPlugin +from pytorch_lightning.accelerators.plugins import ParallelPlugin from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index b6f27f32a85fc..92950274e49cd 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -22,8 +22,8 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.data_parallel import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin -from pytorch_lightning.accelerators.precision import PrecisionPlugin +from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin +from 
pytorch_lightning.accelerators.plugins import PrecisionPlugin from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from tests.base.boring_model import BoringModel diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 5643dce5a6160..bcc3709d129cf 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -21,11 +21,9 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate - +from tests.base import BoringModel PRETEND_N_OF_GPUS = 16 diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index ca56a987aab98..62782921ef85c 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,8 +26,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.accelerator import CPUAccelerator -from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult +from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 8278ef60dc6bd..20e9473b3a910 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -20,7 +20,7 @@ import tests.base.develop_pipelines as tpipes from pytorch_lightning import Trainer, seed_everything -from pytorch_lightning.accelerators.accelerator import TPUAccelerator +from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE diff --git a/tests/plugins/test_plugin_properties.py b/tests/plugins/test_plugin_properties.py index 5466bd07cd03a..ef87a79d4bb5c 100644 --- a/tests/plugins/test_plugin_properties.py +++ b/tests/plugins/test_plugin_properties.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
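The test imports above work because of the re-exports added to pytorch_lightning/accelerators/__init__.py at the start of this patch: the package now publishes Accelerator, CPUAccelerator, GPUAccelerator and TPUAccelerator at its top level, so the concrete module path and the package path resolve to the same class. A small sketch of that equivalence, assuming only that the patched package is importable; the alias names are illustrative:

    from pytorch_lightning.accelerators.cpu import CPUAccelerator as FromModule
    from pytorch_lightning.accelerators import CPUAccelerator as FromPackage

    # The package __init__ simply re-imports the class, so both names are bound
    # to the same object; tests can use whichever spelling reads better
    # (test_horovod.py uses the module path, test_tpu.py the package path).
    assert FromModule is FromPackage
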
from pytorch_lightning import Trainer -from pytorch_lightning.plugins.plugin_connector import LightningCustomPlugins, PluginConnector +from pytorch_lightning.plugins.old.plugin_connector import LightningCustomPlugins, PluginConnector def test_available_plugins_trainer(): From 894e604f7b3fcc8284035c6efefc5ec722346dc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 7 Jan 2021 21:27:36 +0100 Subject: [PATCH 101/157] fix wrong arguments order passed to test --- tests/trainer/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 42d9072e476d6..b3105e97e18c1 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -129,7 +129,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(dataloader, model) + tpipes.run_prediction(trained_model=model, dataloader=dataloader) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) From 2bdc836b24b095cec757dd36bd73491b0d6fdd7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Jan 2021 04:52:51 +0100 Subject: [PATCH 102/157] fix LR finder --- pytorch_lightning/trainer/properties.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 494e91a298843..2e7e122730472 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -15,7 +15,7 @@ import os from abc import ABC from argparse import ArgumentParser, Namespace -from typing import cast, List, Optional, Type, TypeVar, Union +from typing import cast, List, Optional, Type, TypeVar, Union, Any from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import BackendConnector @@ -358,7 +358,7 @@ def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) @property - def model(self): + def model(self) -> Any: """ The LightningModule, but possibly wrapped into DataParallel or DistributedDataParallel. To access the pure LightningModule, use @@ -366,6 +366,18 @@ def model(self): """ return self.accelerator.model + @model.setter + def model(self, model: Any): + """ + Setter for the model, pass-through to accelerator and plugin where the model reference is stored. + Used by the Tuner to reset the state of Trainer and Accelerator. + + Args: + model: The LightningModule, possibly wrapped into DataParallel or DistributedDataParallel, depending + on the backend. 
+ """ + self.accelerator.model = model + def get_model(self): # TODO: rename this to lightning_module (see training type plugin) # backward compatible From 48b9882e52768e079c15f399556e4f58a6675029 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 11 Jan 2021 14:04:05 +0000 Subject: [PATCH 103/157] Added sharded training type and amp plugin --- .../plugins/precision/__init__.py | 1 + .../plugins/precision/sharded_native_amp.py | 37 +++++++++++++++++++ .../plugins/training_type/__init__.py | 1 + .../plugins/training_type/sharded.py | 36 ++++++++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py create mode 100644 pytorch_lightning/accelerators/plugins/training_type/sharded.py diff --git a/pytorch_lightning/accelerators/plugins/precision/__init__.py b/pytorch_lightning/accelerators/plugins/precision/__init__.py index 4f30fe58910f4..e4c6f2076e14b 100644 --- a/pytorch_lightning/accelerators/plugins/precision/__init__.py +++ b/pytorch_lightning/accelerators/plugins/precision/__init__.py @@ -1,4 +1,5 @@ from pytorch_lightning.accelerators.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py new file mode 100644 index 0000000000000..fb332f0572fd6 --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py @@ -0,0 +1,37 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Union, cast + +from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE +from torch.optim import Optimizer + +from pytorch_lightning.accelerators.plugins import NativeMixedPrecisionPlugin + +if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: + from fairscale.optim import OSS + from fairscale.optim.grad_scaler import ShardedGradScaler + + +class ShardedNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): + + def __init__(self): + super().__init__() + self.scaler = ShardedGradScaler() + + def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + # todo: accelerator needs to rely on precision plugin to clip gradients. 
+ max_norm = grad_clip_val + norm_type = float(2.0) + optimizer = cast(OSS, optimizer) + optimizer.clip_grad_norm(max_norm, norm_type=norm_type) diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py index 532ea418a40bd..d9955969480f7 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -2,6 +2,7 @@ from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded import ShardedPlugin from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/accelerators/plugins/training_type/sharded.py new file mode 100644 index 0000000000000..83aa2f317b07b --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded.py @@ -0,0 +1,36 @@ +from pytorch_lightning.accelerators.plugins import DDPPlugin +from pytorch_lightning.core.optimizer import is_lightning_optimizer +from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE + +if _FAIRSCALE_AVAILABLE: + from fairscale.optim import OSS + + from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel + + +class ShardedPlugin(DDPPlugin): + def configure_ddp(self): + self._model = LightningShardedDataParallel( + self.model, + sharded_optimizer=self.lightning_module.trainer.optimizers + ) + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + super().init_ddp_connection(global_rank, world_size) + self._reinit_optimizers_with_oss() + + def _reinit_optimizers_with_oss(self): + optimizers = self.lightning_module.trainer.optimizers + for x, optimizer in enumerate(optimizers): + if is_lightning_optimizer(optimizer): + optimizer = optimizer._optimizer + if not isinstance(optimizer, OSS): + optim_class = type(optimizer) + zero_optimizer = OSS( + params=optimizer.param_groups, + optim=optim_class, + **optimizer.defaults + ) + optimizers[x] = zero_optimizer + del optimizer + self.lightning_module.trainer.convert_to_lightning_optimizers() From 38452b643ad9bf0444503b3d43a46ff9bfbf2c7e Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 11 Jan 2021 17:08:35 +0000 Subject: [PATCH 104/157] Move clip grad to precision plugin --- pytorch_lightning/accelerators/accelerator.py | 38 +---------------- .../plugins/precision/precision_plugin.py | 42 ++++++++++++++++++- .../plugins/precision/sharded_native_amp.py | 7 +--- 3 files changed, 44 insertions(+), 43 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index f9b18304316ef..3a6c0e8f6bfbe 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -145,43 +145,7 @@ def optimizer_zero_grad(self, current_epoch, batch_idx, optimizer, opt_idx): model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val): - # TODO: separate TPU case from 
here - self._clip_gradients(optimizer, clip_val) - - def _clip_gradients(self, optimizer, grad_clip_val): - if grad_clip_val is None: - return - - grad_clip_val = float(grad_clip_val) - - if grad_clip_val <= 0: - return - - parameters = self.precision_plugin.master_params(optimizer) - - max_norm = grad_clip_val - norm_type = float(2.0) - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - - device = parameters[0].device - - if norm_type == math.inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - else: - out = torch.empty(len(parameters), device=device) - for i, p in enumerate(parameters): - torch.norm(p.grad.data.to(device), norm_type, out=out[i]) - total_norm = torch.norm(out, norm_type) - - eps = self.precision_plugin.EPSILON - - clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) - clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) - for p in parameters: - p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + self.precision_plugin.clip_gradients(optimizer, clip_val) def on_train_epoch_end(self, outputs): pass diff --git a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py index 048a645de250a..6098edfde60b4 100644 --- a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py @@ -1,4 +1,9 @@ +import math +from typing import Union + import torch +from torch.optim import Optimizer + from pytorch_lightning.core import LightningModule from pytorch_lightning.accelerators.plugins.base_plugin import Plugin @@ -42,4 +47,39 @@ def backward( # once backward has been applied, release graph closure_loss = closure_loss.detach() - return closure_loss \ No newline at end of file + return closure_loss + + def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm_type: float = float(2.0)): + # TODO: separate TPU case from here + if clip_val is None: + return + + grad_clip_val = float(clip_val) + + if grad_clip_val <= 0: + return + + parameters = self.master_params(optimizer) + + max_norm = grad_clip_val + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + device = parameters[0].device + + if norm_type == math.inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + else: + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + eps = self.EPSILON + + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py index fb332f0572fd6..4d27cb2cebc04 100644 --- a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py @@ -29,9 +29,6 @@ def __init__(self): super().__init__() self.scaler = ShardedGradScaler() - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): - # todo: accelerator needs to rely on 
precision plugin to clip gradients. - max_norm = grad_clip_val - norm_type = float(2.0) + def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm_type: float = float(2.0)): optimizer = cast(OSS, optimizer) - optimizer.clip_grad_norm(max_norm, norm_type=norm_type) + optimizer.clip_grad_norm(clip_val, norm_type=norm_type) From 173b22c49c9efff79b090bbe21fcae3773137e44 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 12 Jan 2021 15:40:55 +0000 Subject: [PATCH 105/157] Added sharded spawn, select accelerators based on distributed_backend + enable custom fp16 plugin automatically --- .../accelerators/accelerator_connector.py | 19 +++++++--- .../plugins/training_type/__init__.py | 3 +- .../plugins/training_type/sharded.py | 2 +- .../plugins/training_type/sharded_spawn.py | 36 +++++++++++++++++++ 4 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index e3467e4be3617..65529ddc89825 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -19,8 +19,9 @@ from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin, HorovodPlugin -from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin + DataParallelPlugin, DDP2Plugin, HorovodPlugin, ShardedDDPPlugin, ShardedSpawnDDPPlugin +from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ + PrecisionPlugin, ShardedNativeMixedPrecisionPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only @@ -187,13 +188,15 @@ def select_precision_plugin(self): self.amp_type = 'apex' else: log.info('Using native 16bit precision.') + if self.distributed_backend == 'ddp_sharded' or self.distributed_backend == 'ddp_sharded_spawn': + return ShardedNativeMixedPrecisionPlugin() self.amp_type = AMPType.NATIVE return NativeMixedPrecisionPlugin() - if self.amp_type =='apex': + if self.amp_type == 'apex': if not _APEX_AVAILABLE: rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' 
- ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') + ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') else: log.info('Using APEX 16bit precision.') self.amp_type = AMPType.APEX @@ -215,13 +218,19 @@ def select_training_type_plugin(self): use_ddp_cpu_spawn = self.use_ddp and self.distributed_backend == "ddp_cpu" use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks + use_ddp_sharded = self.distributed_backend == "ddp_sharded" + use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" # ddp script mode uses the same flags as TE # TODO: decouple from TE if os.environ.get('PL_IN_DDP_SUBPROCESS', False): use_torchelastic_ddp = False - if use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: + if use_ddp_sharded: + ddp_plugin_cls = ShardedDDPPlugin + elif use_ddp_sharded_spawn: + ddp_plugin_cls = ShardedSpawnDDPPlugin + elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: ddp_plugin_cls = DDPPlugin elif use_ddp_spawn or use_ddp_cpu_spawn: ddp_plugin_cls = DDPSpawnPlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py index d9955969480f7..1da1a00e0c1a1 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -2,7 +2,8 @@ from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded import ShardedPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded import ShardedDDPPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import ShardedSpawnDDPPlugin from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/accelerators/plugins/training_type/sharded.py index 83aa2f317b07b..5aebd58937165 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded.py @@ -8,7 +8,7 @@ from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel -class ShardedPlugin(DDPPlugin): +class ShardedDDPPlugin(DDPPlugin): def configure_ddp(self): self._model = LightningShardedDataParallel( self.model, diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py new file mode 100644 index 0000000000000..3f6862cb9ff7f --- /dev/null +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py @@ -0,0 +1,36 @@ +from pytorch_lightning.accelerators.plugins import DDPSpawnPlugin +from pytorch_lightning.core.optimizer import is_lightning_optimizer +from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE + +if _FAIRSCALE_AVAILABLE: + from fairscale.optim import OSS + + from 
pytorch_lightning.overrides.fairscale import LightningShardedDataParallel + + +class ShardedSpawnDDPPlugin(DDPSpawnPlugin): + def configure_ddp(self): + self._model = LightningShardedDataParallel( + self.model, + sharded_optimizer=self.lightning_module.trainer.optimizers + ) + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + super().init_ddp_connection(global_rank, world_size) + self._reinit_optimizers_with_oss() + + def _reinit_optimizers_with_oss(self): + optimizers = self.lightning_module.trainer.optimizers + for x, optimizer in enumerate(optimizers): + if is_lightning_optimizer(optimizer): + optimizer = optimizer._optimizer + if not isinstance(optimizer, OSS): + optim_class = type(optimizer) + zero_optimizer = OSS( + params=optimizer.param_groups, + optim=optim_class, + **optimizer.defaults + ) + optimizers[x] = zero_optimizer + del optimizer + self.lightning_module.trainer.convert_to_lightning_optimizers() From 79803f69c61cfaeea71741e1c337792917bdd8a6 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 12 Jan 2021 16:57:09 +0000 Subject: [PATCH 106/157] Fix import issue, attempting to fix tests --- benchmarks/test_sharded_parity.py | 51 +++++-------------- .../accelerators/accelerator_connector.py | 10 ++-- .../plugins/precision/__init__.py | 2 +- .../plugins/precision/sharded_native_amp.py | 2 +- .../plugins/training_type/__init__.py | 4 +- .../plugins/training_type/sharded.py | 4 +- .../plugins/training_type/sharded_spawn.py | 4 +- tests/plugins/test_sharded_plugin.py | 21 ++++---- 8 files changed, 39 insertions(+), 59 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 05fde8e11523a..67b2c2e7c70a1 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -15,14 +15,12 @@ import os import platform import time -from typing import Type, Union +from typing import Type import pytest import torch from pytorch_lightning import seed_everything, Trainer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE from tests.backends import DDPLauncher from tests.base.boring_model import BoringModel, RandomDataset @@ -32,10 +30,8 @@ @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_gpu(): - plugin_parity_test( + sharded_parity_test( gpus=1, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -45,11 +41,9 @@ def test_ddp_sharded_plugin_correctness_one_gpu(): @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_one_gpu(): - plugin_parity_test( + sharded_parity_test( gpus=1, precision=16, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -59,10 +53,8 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu(): - plugin_parity_test( + 
sharded_parity_test( gpus=2, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -73,11 +65,9 @@ def test_ddp_sharded_plugin_correctness_multi_gpu(): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): - plugin_parity_test( + sharded_parity_test( gpus=2, precision=16, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -88,11 +78,9 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): - plugin_parity_test( + sharded_parity_test( gpus=2, precision=16, - accelerator='ddp_spawn', - plugin='ddp_sharded', model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -104,11 +92,9 @@ def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): reason="test should be run outside of pytest") @DDPLauncher.run("--accelerator ddp --gpus 2 --precision 32") def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None): - plugin_parity_test( + sharded_parity_test( gpus=args.gpus, precision=args.precision, - accelerator=args.accelerator, - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -119,11 +105,9 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None): reason="test should be run outside of pytest") @DDPLauncher.run("--accelerator ddp --gpus 2 --precision 16") def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None): - plugin_parity_test( + sharded_parity_test( gpus=args.gpus, precision=args.precision, - accelerator=args.accelerator, - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -136,10 +120,8 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): """ Ensures same results using multiple optimizers across multiple GPUs """ - plugin_parity_test( - plugin=DDPShardedPlugin(), + sharded_parity_test( gpus=2, - accelerator='ddp_spawn', model_cls=SeedTrainLoaderMultipleOptimizersModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -153,10 +135,8 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): """ Ensures using multiple optimizers across multiple GPUs with manual optimization """ - plugin_parity_test( - plugin=DDPShardedPlugin(), + sharded_parity_test( gpus=2, - accelerator='ddp_spawn', model_cls=SeedTrainLoaderManualModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -253,11 +233,9 @@ def record_ddp_fit_model_stats(trainer, model, use_cuda): return max_memory, total_time -def plugin_parity_test( +def sharded_parity_test( model_cls: Type[SeedTrainLoaderModel], - plugin: Union[str, DDPPlugin], seed: int = 42, - accelerator: str = 'ddp_spawn', gpus: int = 0, precision: int = 32, max_percent_speed_diff: float = 0.1, @@ -268,9 +246,7 @@ def plugin_parity_test( Args: model_cls: Model class to use for test. 
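After this commit the benchmark helper no longer takes `plugin` or `accelerator` arguments: the baseline always runs with accelerator='ddp_spawn' and the candidate with accelerator='ddp_sharded_spawn', so callers pass only the model class and the hardware settings. A sketch of a call under those assumptions, with the GPU count and tolerance copied from the tests above rather than chosen here:

    # Train the same seeded model once with plain DDP-spawn and once with
    # sharded DDP-spawn, then compare wall time and peak memory.
    sharded_parity_test(
        gpus=2,
        precision=16,
        model_cls=SeedTrainLoaderModel,
        max_percent_speed_diff=0.25,  # looser bound when only 2 GPUs shard the optimizers
    )
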
- plugin: Plugin to parity test. seed: Seed for generators. Note that this does not handle the seed for data-loading on multi-process. - accelerator: Accelerator type for test. gpus: Number of GPUS to enable. precision: Whether to use AMP or normal FP32 training. max_percent_speed_diff: The maximum speed difference compared to normal DDP training. @@ -288,7 +264,7 @@ def plugin_parity_test( max_epochs=1, gpus=gpus, precision=precision, - accelerator=accelerator, + accelerator='ddp_spawn', ) max_memory_ddp, ddp_time = record_ddp_fit_model_stats( @@ -306,8 +282,7 @@ def plugin_parity_test( max_epochs=1, gpus=gpus, precision=precision, - accelerator=accelerator, - plugins=[plugin], + accelerator='ddp_sharded_spawn', ) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 65529ddc89825..eca02dbc2f902 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -19,7 +19,7 @@ from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin, HorovodPlugin, ShardedDDPPlugin, ShardedSpawnDDPPlugin + DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ PrecisionPlugin, ShardedNativeMixedPrecisionPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus @@ -227,9 +227,9 @@ def select_training_type_plugin(self): use_torchelastic_ddp = False if use_ddp_sharded: - ddp_plugin_cls = ShardedDDPPlugin + ddp_plugin_cls = DDPShardedPlugin elif use_ddp_sharded_spawn: - ddp_plugin_cls = ShardedSpawnDDPPlugin + ddp_plugin_cls = DDPSpawnShardedPlugin elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: ddp_plugin_cls = DDPPlugin elif use_ddp_spawn or use_ddp_cpu_spawn: @@ -347,6 +347,10 @@ def set_distributed_mode(self): self.parallel_device_ids = None self.use_ddp = True + # Sharded DDP + elif self.distributed_backend in ("ddp_sharded", "ddp_sharded_spawn"): + self.use_ddp = True + # HOROVOD elif self.distributed_backend == "horovod": self._set_horovod_backend() diff --git a/pytorch_lightning/accelerators/plugins/precision/__init__.py b/pytorch_lightning/accelerators/plugins/precision/__init__.py index e4c6f2076e14b..0c7265f4be29d 100644 --- a/pytorch_lightning/accelerators/plugins/precision/__init__.py +++ b/pytorch_lightning/accelerators/plugins/precision/__init__.py @@ -1,5 +1,5 @@ from pytorch_lightning.accelerators.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.accelerators.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin \ No newline at end of file diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py 
b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py index 4d27cb2cebc04..9df1e330bef47 100644 --- a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py @@ -16,7 +16,7 @@ from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins import NativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py index 1da1a00e0c1a1..8ff2d65c4f6d7 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -2,8 +2,8 @@ from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded import ShardedDDPPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import ShardedSpawnDDPPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded import DDPShardedPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/accelerators/plugins/training_type/sharded.py index 5aebd58937165..ea5842c4b34d5 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.plugins import DDPPlugin +from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE @@ -8,7 +8,7 @@ from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel -class ShardedDDPPlugin(DDPPlugin): +class DDPShardedPlugin(DDPPlugin): def configure_ddp(self): self._model = LightningShardedDataParallel( self.model, diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py index 3f6862cb9ff7f..a38d283cdc003 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.plugins import DDPSpawnPlugin +from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE @@ -8,7 +8,7 @@ from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel -class 
ShardedSpawnDDPPlugin(DDPSpawnPlugin): +class DDPSpawnShardedPlugin(DDPSpawnPlugin): def configure_ddp(self): self._model = LightningShardedDataParallel( self.model, diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 80226bc8ef941..fc4f35b33b241 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -6,10 +6,9 @@ import torch from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.sharded_native_amp_plugin import ShardedNativeAMPPlugin -from pytorch_lightning.plugins.sharded_plugin import _FAIRSCALE_AVAILABLE, DDPShardedPlugin -from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE +from pytorch_lightning.utilities import _APEX_AVAILABLE, _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel @@ -26,28 +25,30 @@ }, ) @mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("torch.cuda.is_available", return_value=True) @pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + ["accelerator", "gpus"], + [("ddp_sharded", 1), ("ddp_sharded_spawn", 1)] ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_choice_sharded(tmpdir, ddp_backend, gpus, num_processes): +def test_ddp_choice_sharded(tmpdir, accelerator, gpus): """ Test to ensure that plugin is correctly chosen """ class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPShardedPlugin) + if accelerator == 'ddp_sharded': + assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPShardedPlugin) + if accelerator == 'ddp_sharded_spawn': + assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPSpawnShardedPlugin) raise SystemExit() model = BoringModel() trainer = Trainer( fast_dev_run=True, gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[DDPShardedPlugin()], + accelerator=accelerator, callbacks=[CB()], ) From a7c0d8fb2a195df2ab2d6eb6bf8a6a5106b154f8 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 12 Jan 2021 20:35:07 +0000 Subject: [PATCH 107/157] Fix initial test --- tests/plugins/test_sharded_plugin.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index fc4f35b33b241..c0b4877e82ad7 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -13,25 +13,12 @@ from tests.base.boring_model import BoringModel -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) @pytest.mark.parametrize( - ["accelerator", "gpus"], - [("ddp_sharded", 1), ("ddp_sharded_spawn", 1)] + ["accelerator"], + [("ddp_sharded",), ("ddp_sharded_spawn",)] ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_choice_sharded(tmpdir, accelerator, gpus): +def 
test_sharded_ddp_choice(tmpdir, accelerator): """ Test to ensure that plugin is correctly chosen """ @@ -40,14 +27,13 @@ class CB(Callback): def on_fit_start(self, trainer, pl_module): if accelerator == 'ddp_sharded': assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPShardedPlugin) - if accelerator == 'ddp_sharded_spawn': + elif accelerator == 'ddp_sharded_spawn': assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPSpawnShardedPlugin) raise SystemExit() model = BoringModel() trainer = Trainer( fast_dev_run=True, - gpus=gpus, accelerator=accelerator, callbacks=[CB()], ) @@ -67,8 +53,7 @@ def test_invalid_apex_sharded(tmpdir): with pytest.raises(MisconfigurationException, match='Sharded Plugin is not supported with Apex AMP'): trainer = Trainer( fast_dev_run=True, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', precision=16, amp_backend='apex', ) From 02df0adf128d2a0162810bbf3b1b1e7748fb4687 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 14 Jan 2021 12:26:23 +0000 Subject: [PATCH 108/157] Reflect hook logic from master, should wrap model after move to device --- .../accelerators/accelerator_connector.py | 5 ++ .../plugins/training_type/sharded.py | 32 ++++++++-- .../plugins/training_type/sharded_spawn.py | 32 ++++++++-- tests/plugins/test_sharded_plugin.py | 64 ++++++------------- 4 files changed, 76 insertions(+), 57 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index eca02dbc2f902..56fd5e16642e4 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -198,6 +198,11 @@ def select_precision_plugin(self): rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') else: + if self.distributed_backend == 'ddp_sharded' or self.distributed_backend == 'ddp_sharded_spawn': + raise MisconfigurationException( + 'Sharded Plugin is not supported with Apex AMP, ' + 'please use native AMP for 16-bit precision.'
+ ) log.info('Using APEX 16bit precision.') self.amp_type = AMPType.APEX return ApexMixedPrecisionPlugin(self.amp_level) diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/accelerators/plugins/training_type/sharded.py index ea5842c4b34d5..1ba54bf8419bb 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded.py @@ -1,6 +1,8 @@ +from typing import Optional + from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE +from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -10,15 +12,12 @@ class DDPShardedPlugin(DDPPlugin): def configure_ddp(self): + self._wrap_optimizers() self._model = LightningShardedDataParallel( self.model, sharded_optimizer=self.lightning_module.trainer.optimizers ) - def init_ddp_connection(self, global_rank: int, world_size: int) -> None: - super().init_ddp_connection(global_rank, world_size) - self._reinit_optimizers_with_oss() - def _reinit_optimizers_with_oss(self): optimizers = self.lightning_module.trainer.optimizers for x, optimizer in enumerate(optimizers): @@ -33,4 +32,25 @@ def _reinit_optimizers_with_oss(self): ) optimizers[x] = zero_optimizer del optimizer - self.lightning_module.trainer.convert_to_lightning_optimizers() + trainer = self.lightning_module.trainer + trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) + + def _wrap_optimizers(self): + trainer = self.model.trainer + if trainer.testing is True: + return + self._reinit_optimizers_with_oss() + + def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: + if is_lightning_optimizer(optimizer): + optimizer = optimizer._optimizer + optimizer.consolidate_state_dict() + return self._optim_state_dict(optimizer) + + @rank_zero_only + def _optim_state_dict(self, optimizer): + """ + Retrieves state dict only on rank 0, which contains the entire optimizer state after calling + :meth:`consolidate_state_dict`. 
+ """ + return optimizer.state_dict() diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py index a38d283cdc003..d2346831579b8 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py @@ -1,6 +1,8 @@ +from typing import Optional + from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE +from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -10,15 +12,12 @@ class DDPSpawnShardedPlugin(DDPSpawnPlugin): def configure_ddp(self): + self._wrap_optimizers() self._model = LightningShardedDataParallel( self.model, sharded_optimizer=self.lightning_module.trainer.optimizers ) - def init_ddp_connection(self, global_rank: int, world_size: int) -> None: - super().init_ddp_connection(global_rank, world_size) - self._reinit_optimizers_with_oss() - def _reinit_optimizers_with_oss(self): optimizers = self.lightning_module.trainer.optimizers for x, optimizer in enumerate(optimizers): @@ -33,4 +32,25 @@ def _reinit_optimizers_with_oss(self): ) optimizers[x] = zero_optimizer del optimizer - self.lightning_module.trainer.convert_to_lightning_optimizers() + trainer = self.lightning_module.trainer + trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) + + def _wrap_optimizers(self): + trainer = self.model.trainer + if trainer.testing is True: + return + self._reinit_optimizers_with_oss() + + def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: + if is_lightning_optimizer(optimizer): + optimizer = optimizer._optimizer + optimizer.consolidate_state_dict() + return self._optim_state_dict(optimizer) + + @rank_zero_only + def _optim_state_dict(self, optimizer): + """ + Retrieves state dict only on rank 0, which contains the entire optimizer state after calling + :meth:`consolidate_state_dict`. 
+ """ + return optimizer.state_dict() diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index c0b4877e82ad7..471f919d3245f 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -1,12 +1,12 @@ import os import platform -from unittest import mock import pytest import torch from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin +from pytorch_lightning.accelerators.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin, \ + ShardedNativeMixedPrecisionPlugin from pytorch_lightning.callbacks import Callback from pytorch_lightning.utilities import _APEX_AVAILABLE, _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -61,43 +61,28 @@ def test_invalid_apex_sharded(tmpdir): trainer.fit(model) -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + ["accelerator"], + [("ddp_sharded",), ("ddp_sharded_spawn",)] ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -def test_ddp_choice_sharded_amp(tmpdir, ddp_backend, gpus, num_processes): +def test_ddp_choice_sharded_amp(tmpdir, accelerator): """ Test to ensure that plugin native amp plugin is correctly chosen when using sharded """ class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPShardedPlugin) - assert isinstance(trainer.precision_connector.backend, ShardedNativeAMPPlugin) + assert isinstance(trainer.precision_connector.backend, ShardedNativeMixedPrecisionPlugin) raise SystemExit() model = BoringModel() trainer = Trainer( fast_dev_run=True, - gpus=gpus, + gpus=1, precision=16, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[DDPShardedPlugin()], + accelerator=accelerator, callbacks=[CB()], ) @@ -114,9 +99,8 @@ def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -142,8 +126,7 @@ def test_ddp_sharded_plugin_checkpoint_multi_gpu(tmpdir): model = BoringModel() trainer = Trainer( gpus=2, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, ) @@ -169,8 +152,7 @@ def test_ddp_sharded_plugin_finetune(tmpdir): model = BoringModel() trainer = Trainer( gpus=2, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, ) trainer.fit(model) @@ -194,9 +176,8 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -208,9 +189,8 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - 
plugins=[DDPShardedPlugin()], fast_dev_run=True, resume_from_checkpoint=checkpoint_path ) @@ -230,8 +210,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, gpus=2, ) @@ -244,8 +223,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir): model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, gpus=1, resume_from_checkpoint=checkpoint_path @@ -264,8 +242,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', gpus=1, fast_dev_run=True ) @@ -278,8 +255,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): model = BoringModel() trainer = Trainer( - plugins=[DDPShardedPlugin()], - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, fast_dev_run=True, resume_from_checkpoint=checkpoint_path @@ -297,9 +273,8 @@ def test_ddp_sharded_plugin_test(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -316,9 +291,8 @@ def test_ddp_sharded_plugin_test_multigpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', + accelerator='ddp_sharded_spawn', gpus=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) From d0ebcba37e733b26a3bc0e60e35884796102aa14 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 22 Jan 2021 18:03:33 +0100 Subject: [PATCH 109/157] Optional state consolidation, since master has optimizers not wrapped --- .../accelerators/plugins/training_type/sharded_spawn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py index d2346831579b8..04e171bb9d5a0 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py @@ -35,6 +35,7 @@ def _reinit_optimizers_with_oss(self): trainer = self.lightning_module.trainer trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) + def _wrap_optimizers(self): trainer = self.model.trainer if trainer.testing is True: @@ -44,7 +45,9 @@ def _wrap_optimizers(self): def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: if is_lightning_optimizer(optimizer): optimizer = optimizer._optimizer - optimizer.consolidate_state_dict() + + if isinstance(optimizer, OSS): + optimizer.consolidate_state_dict() return self._optim_state_dict(optimizer) @rank_zero_only From 319c3e8d8509bf37f598be40c347d114849337f2 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 22 Jan 2021 18:08:20 +0100 Subject: [PATCH 110/157] change attribute for instance test --- tests/plugins/test_sharded_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 471f919d3245f..ac20cd68e36d5 100644 --- a/tests/plugins/test_sharded_plugin.py +++ 
b/tests/plugins/test_sharded_plugin.py @@ -74,7 +74,7 @@ def test_ddp_choice_sharded_amp(tmpdir, accelerator): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.precision_connector.backend, ShardedNativeMixedPrecisionPlugin) + assert isinstance(trainer.accelerator_backend.precision_plugin, ShardedNativeMixedPrecisionPlugin) raise SystemExit() model = BoringModel() From a34cd15d16a42a0939748e8e97460a52c830b4d3 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 22 Jan 2021 18:10:25 +0100 Subject: [PATCH 111/157] reset optimizers optimizers are not used in main process, so state would be wrong. --- .../accelerators/plugins/training_type/ddp_spawn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py index e2c61bfe6e3fd..e9e4fc364fa03 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py @@ -77,6 +77,8 @@ def set_world_ranks(self, process_idx): def start_training(self, trainer): mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) + # reset optimizers, since main process is never used for training and thus does not have a valid optim state + trainer.optimizers = [] def start_testing(self, trainer): mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer,)) @@ -210,4 +212,4 @@ def model_to_device(self): def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) - return output \ No newline at end of file + return output From c95b06af23ae764ca445d52a63a44037f9b49bd0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 23 Jan 2021 00:45:27 +0100 Subject: [PATCH 112/157] legacy --- pytorch_lightning/accelerators/{old => legacy}/__init__.py | 0 .../accelerators/{old => legacy}/accelerator_connector.py | 0 pytorch_lightning/accelerators/{old => legacy}/cpu_accelerator.py | 0 pytorch_lightning/accelerators/{old => legacy}/ddp_accelerator.py | 0 .../accelerators/{old => legacy}/ddp_hpc_accelerator.py | 0 .../accelerators/{old => legacy}/ddp_spawn_accelerator.py | 0 .../accelerators/{old => legacy}/horovod_accelerator.py | 0 pytorch_lightning/accelerators/{old => legacy}/tpu_accelerator.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename pytorch_lightning/accelerators/{old => legacy}/__init__.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/accelerator_connector.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/cpu_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/ddp_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/ddp_hpc_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/ddp_spawn_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/horovod_accelerator.py (100%) rename pytorch_lightning/accelerators/{old => legacy}/tpu_accelerator.py (100%) diff --git a/pytorch_lightning/accelerators/old/__init__.py b/pytorch_lightning/accelerators/legacy/__init__.py similarity index 100% rename from pytorch_lightning/accelerators/old/__init__.py rename to pytorch_lightning/accelerators/legacy/__init__.py diff --git 
a/pytorch_lightning/accelerators/old/accelerator_connector.py b/pytorch_lightning/accelerators/legacy/accelerator_connector.py similarity index 100% rename from pytorch_lightning/accelerators/old/accelerator_connector.py rename to pytorch_lightning/accelerators/legacy/accelerator_connector.py diff --git a/pytorch_lightning/accelerators/old/cpu_accelerator.py b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/cpu_accelerator.py rename to pytorch_lightning/accelerators/legacy/cpu_accelerator.py diff --git a/pytorch_lightning/accelerators/old/ddp_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/ddp_accelerator.py rename to pytorch_lightning/accelerators/legacy/ddp_accelerator.py diff --git a/pytorch_lightning/accelerators/old/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/ddp_hpc_accelerator.py rename to pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py diff --git a/pytorch_lightning/accelerators/old/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/ddp_spawn_accelerator.py rename to pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py diff --git a/pytorch_lightning/accelerators/old/horovod_accelerator.py b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/horovod_accelerator.py rename to pytorch_lightning/accelerators/legacy/horovod_accelerator.py diff --git a/pytorch_lightning/accelerators/old/tpu_accelerator.py b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py similarity index 100% rename from pytorch_lightning/accelerators/old/tpu_accelerator.py rename to pytorch_lightning/accelerators/legacy/tpu_accelerator.py From 9ff0c64f16194463dcc87f7773f8773fe81f56c6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 23 Jan 2021 00:46:20 +0100 Subject: [PATCH 113/157] imports in accel --- pytorch_lightning/accelerators/accelerator.py | 8 +++----- .../accelerators/accelerator_connector.py | 11 ++++++----- pytorch_lightning/accelerators/cpu.py | 2 +- pytorch_lightning/accelerators/gpu.py | 3 ++- pytorch_lightning/accelerators/legacy/__init__.py | 3 ++- .../accelerators/legacy/ddp_accelerator.py | 4 ++-- .../accelerators/legacy/ddp_hpc_accelerator.py | 6 +++--- .../accelerators/legacy/ddp_spawn_accelerator.py | 4 ++-- .../accelerators/plugins/base_plugin.py | 2 ++ .../accelerators/plugins/precision/apex_amp.py | 5 +++-- .../accelerators/plugins/precision/mixed.py | 3 ++- .../accelerators/plugins/precision/native_amp.py | 4 +++- .../plugins/precision/precision_plugin.py | 2 +- .../plugins/precision/sharded_native_amp.py | 2 +- .../plugins/training_type/__init__.py | 4 ++-- .../accelerators/plugins/training_type/ddp.py | 10 +++++----- .../plugins/training_type/ddp_spawn.py | 15 +++++++-------- .../accelerators/plugins/training_type/dp.py | 2 ++ .../accelerators/plugins/training_type/horovod.py | 7 ++++--- .../plugins/training_type/parallel.py | 2 ++ .../plugins/training_type/single_device.py | 1 + .../plugins/training_type/training_type_plugin.py | 4 ++-- 22 files changed, 58 insertions(+), 46 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py 
index 3a6c0e8f6bfbe..4834fdf39f0ae 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -1,19 +1,17 @@ -from pytorch_lightning.accelerators.plugins import TrainingTypePlugin, HorovodPlugin -from pytorch_lightning.utilities import AMPType from typing import Any -import math import torch from torch.optim import Optimizer -from pytorch_lightning.core import LightningModule +from pytorch_lightning.accelerators.plugins import TrainingTypePlugin, HorovodPlugin from pytorch_lightning.accelerators.plugins.precision import ( ApexMixedPrecisionPlugin, MixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin, ) - +from pytorch_lightning.core import LightningModule +from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.apply_func import move_data_to_device diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 56fd5e16642e4..808472f4a4c73 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -13,23 +13,24 @@ # limitations under the License. import os + import torch +from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ PrecisionPlugin, ShardedNativeMixedPrecisionPlugin +from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ + DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin +from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment +from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning import _logger as log -from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment -from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment try: import torch_xla diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 820fab6d7d0f8..a39aace801993 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -1,5 +1,5 @@ -from pytorch_lightning.accelerators.plugins import MixedPrecisionPlugin from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.plugins import MixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 7b2cbe3627e0b..8084217019c0f 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ 
b/pytorch_lightning/accelerators/gpu.py @@ -1,6 +1,7 @@ import torch -from pytorch_lightning.utilities.exceptions import MisconfigurationException + from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.utilities.exceptions import MisconfigurationException class GPUAccelerator(Accelerator): diff --git a/pytorch_lightning/accelerators/legacy/__init__.py b/pytorch_lightning/accelerators/legacy/__init__.py index d8bf7061de11f..d566b7301b788 100644 --- a/pytorch_lightning/accelerators/legacy/__init__.py +++ b/pytorch_lightning/accelerators/legacy/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.ddp2_accelerator import DDP2Accelerator # noqa: F401 from pytorch_lightning.accelerators.ddp_accelerator import DDPAccelerator # noqa: F401 @@ -23,3 +22,5 @@ from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator # noqa: F401 from pytorch_lightning.accelerators.tpu_accelerator import TPUAccelerator # noqa: F401 + +from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 diff --git a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py index 56f6eaa2223a3..987eda50476f1 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py @@ -21,6 +21,8 @@ import numpy as np import torch import torch.distributed as torch_distrib +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -28,8 +30,6 @@ from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE, AMPType from pytorch_lightning.utilities.distributed import ( all_gather_ddp_if_available, diff --git a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py index cf6aad9999223..8df353b025378 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py @@ -14,8 +14,10 @@ from typing import Any, List, Optional, Union import torch -import torch.distributed as torch_distrib import torch.distributed as dist +import torch.distributed as torch_distrib +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -23,8 +25,6 @@ from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed -from 
pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available diff --git a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py index e23943e9262f8..33af749a229ee 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py @@ -18,6 +18,8 @@ import torch import torch.distributed as torch_distrib import torch.multiprocessing as mp +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -25,8 +27,6 @@ from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load diff --git a/pytorch_lightning/accelerators/plugins/base_plugin.py b/pytorch_lightning/accelerators/plugins/base_plugin.py index 3ecfb48726f76..7c818db322916 100644 --- a/pytorch_lightning/accelerators/plugins/base_plugin.py +++ b/pytorch_lightning/accelerators/plugins/base_plugin.py @@ -1,6 +1,8 @@ import contextlib + import torch + class Plugin(object): def connect(self, model: torch.nn.Module, *args, **kwargs): diff --git a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py index 08b4fe7906732..967324b1a3490 100644 --- a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/apex_amp.py @@ -1,10 +1,11 @@ -from contextlib import contextmanager from typing import List, Tuple + import torch from torch.optim import Optimizer + +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import AMPType, _APEX_AVAILABLE, rank_zero_warn -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin if _APEX_AVAILABLE: from apex import amp diff --git a/pytorch_lightning/accelerators/plugins/precision/mixed.py b/pytorch_lightning/accelerators/plugins/precision/mixed.py index 1eb1ea18ebc23..f96a47f35c04c 100644 --- a/pytorch_lightning/accelerators/plugins/precision/mixed.py +++ b/pytorch_lightning/accelerators/plugins/precision/mixed.py @@ -1,5 +1,6 @@ -from pytorch_lightning.utilities import AMPType from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities import AMPType + class MixedPrecisionPlugin(PrecisionPlugin): EPSILON = 1e-5 diff --git a/pytorch_lightning/accelerators/plugins/precision/native_amp.py b/pytorch_lightning/accelerators/plugins/precision/native_amp.py index f233a43dfdd53..fad0d1f469c34 100644 --- a/pytorch_lightning/accelerators/plugins/precision/native_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/native_amp.py @@ 
-1,9 +1,11 @@ from contextlib import contextmanager + import torch + +from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin class NativeMixedPrecisionPlugin(MixedPrecisionPlugin): diff --git a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py index 6098edfde60b4..120fbcafbecf9 100644 --- a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py @@ -4,8 +4,8 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.core import LightningModule from pytorch_lightning.accelerators.plugins.base_plugin import Plugin +from pytorch_lightning.core import LightningModule class PrecisionPlugin(Plugin): diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py index 9df1e330bef47..969780dd1df7e 100644 --- a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py @@ -13,10 +13,10 @@ # limitations under the License. from typing import Union, cast -from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE from torch.optim import Optimizer from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py index 8ff2d65c4f6d7..152fdc68d552e 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ b/pytorch_lightning/accelerators/plugins/training_type/__init__.py @@ -2,9 +2,9 @@ from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded import DDPShardedPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded import DDPShardedPlugin +from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/accelerators/plugins/training_type/ddp.py index 4e865a959ae73..b314a230076b0 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp.py +++ 
b/pytorch_lightning/accelerators/plugins/training_type/ddp.py @@ -1,19 +1,19 @@ import os -import sys import subprocess +import sys from time import sleep -import numpy as np from typing import Any, Dict, Optional, Union +import numpy as np import torch import torch.distributed as torch_distrib from pytorch_lightning import _logger as log -from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.utilities import _HYDRA_AVAILABLE -from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only, sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py index e9e4fc364fa03..f572f9af36f06 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py @@ -1,22 +1,21 @@ -import re import os -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +import re from typing import Any, Dict, Optional, Union -import torch -import torch.multiprocessing as mp +import torch import torch.distributed as torch_distrib +import torch.multiprocessing as mp +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load -from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn from pytorch_lightning.utilities.seed import seed_everything -from pytorch_lightning import _logger as log - if torch.distributed.is_available(): from torch.distributed import ReduceOp else: diff --git a/pytorch_lightning/accelerators/plugins/training_type/dp.py b/pytorch_lightning/accelerators/plugins/training_type/dp.py index 0c50d077633af..d77aa52fc700c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/dp.py +++ b/pytorch_lightning/accelerators/plugins/training_type/dp.py @@ -1,10 +1,12 @@ from typing import List import torch + from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.core.step_result import Result from pytorch_lightning.overrides.data_parallel import LightningDataParallel + class DataParallelPlugin(ParallelPlugin): def __init__(self, parallel_devices: List[torch.device]): 
diff --git a/pytorch_lightning/accelerators/plugins/training_type/horovod.py b/pytorch_lightning/accelerators/plugins/training_type/horovod.py index fee77f762fde1..eb2edd2f3e414 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/horovod.py +++ b/pytorch_lightning/accelerators/plugins/training_type/horovod.py @@ -1,12 +1,13 @@ from contextlib import ExitStack -from pytorch_lightning.utilities.distributed import rank_zero_only from typing import Any, List, Optional, Union import torch +from torch.optim.lr_scheduler import _LRScheduler + from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin -from pytorch_lightning.utilities import _HOROVOD_AVAILABLE from pytorch_lightning.core.optimizer import LightningOptimizer -from torch.optim.lr_scheduler import _LRScheduler +from pytorch_lightning.utilities import _HOROVOD_AVAILABLE +from pytorch_lightning.utilities.distributed import rank_zero_only if _HOROVOD_AVAILABLE: import horovod.torch as hvd diff --git a/pytorch_lightning/accelerators/plugins/training_type/parallel.py b/pytorch_lightning/accelerators/plugins/training_type/parallel.py index fd366f677b55f..865e7e6b4bd1c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/parallel.py +++ b/pytorch_lightning/accelerators/plugins/training_type/parallel.py @@ -1,7 +1,9 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from typing import List, Optional + import torch + from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule diff --git a/pytorch_lightning/accelerators/plugins/training_type/single_device.py b/pytorch_lightning/accelerators/plugins/training_type/single_device.py index 2e674ef87fbb4..200072ee82651 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/single_device.py +++ b/pytorch_lightning/accelerators/plugins/training_type/single_device.py @@ -1,4 +1,5 @@ import torch + from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py b/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py index 94d4dbf9d3409..c5e400494e82c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py @@ -1,12 +1,12 @@ import os - from abc import ABC, abstractmethod from typing import Optional + import torch +from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.plugins.base_plugin import Plugin -from pytorch_lightning import _logger as log class TrainingTypePlugin(Plugin, ABC): def __init__(self): From 67d4e47281942e2a79d279a6f6774843c6ab1f16 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 23 Jan 2021 00:46:45 +0100 Subject: [PATCH 114/157] legacy2 --- .../cluster_environments/cluster_environment.py | 2 +- pytorch_lightning/plugins/{old => legacy}/__init__.py | 0 pytorch_lightning/plugins/{old => legacy}/apex.py | 2 +- .../plugins/{old => legacy}/ddp_plugin.py | 2 +- .../plugins/{old => legacy}/ddp_sequential_plugin.py | 2 +- .../plugins/{old => legacy}/native_amp.py | 2 +- pytorch_lightning/plugins/{old => legacy}/plugin.py | 0 .../plugins/{old => legacy}/plugin_connector.py | 10 +++++----- .../plugins/{old => legacy}/precision_plugin.py | 2 
+- .../plugins/{old => legacy}/rpc_plugin.py | 2 +- .../{old => legacy}/sharded_native_amp_plugin.py | 2 +- .../plugins/{old => legacy}/sharded_plugin.py | 4 ++-- .../trainer/connectors/precision_connector.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 2 +- tests/plugins/test_plugin_properties.py | 2 +- 15 files changed, 19 insertions(+), 19 deletions(-) rename pytorch_lightning/plugins/{old => legacy}/__init__.py (100%) rename pytorch_lightning/plugins/{old => legacy}/apex.py (98%) rename pytorch_lightning/plugins/{old => legacy}/ddp_plugin.py (99%) rename pytorch_lightning/plugins/{old => legacy}/ddp_sequential_plugin.py (99%) rename pytorch_lightning/plugins/{old => legacy}/native_amp.py (97%) rename pytorch_lightning/plugins/{old => legacy}/plugin.py (100%) rename pytorch_lightning/plugins/{old => legacy}/plugin_connector.py (95%) rename pytorch_lightning/plugins/{old => legacy}/precision_plugin.py (95%) rename pytorch_lightning/plugins/{old => legacy}/rpc_plugin.py (98%) rename pytorch_lightning/plugins/{old => legacy}/sharded_native_amp_plugin.py (94%) rename pytorch_lightning/plugins/{old => legacy}/sharded_plugin.py (95%) diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index 8652d701dbf83..41af4fe84c7f0 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.plugins.old.plugin import LightningPlugin +from pytorch_lightning.plugins.legacy.plugin import LightningPlugin class ClusterEnvironment(LightningPlugin): diff --git a/pytorch_lightning/plugins/old/__init__.py b/pytorch_lightning/plugins/legacy/__init__.py similarity index 100% rename from pytorch_lightning/plugins/old/__init__.py rename to pytorch_lightning/plugins/legacy/__init__.py diff --git a/pytorch_lightning/plugins/old/apex.py b/pytorch_lightning/plugins/legacy/apex.py similarity index 98% rename from pytorch_lightning/plugins/old/apex.py rename to pytorch_lightning/plugins/legacy/apex.py index d917924eb0960..d8562c6a70d71 100644 --- a/pytorch_lightning/plugins/old/apex.py +++ b/pytorch_lightning/plugins/legacy/apex.py @@ -17,7 +17,7 @@ from torch.optim.optimizer import Optimizer from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.old.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.legacy.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType from pytorch_lightning.utilities.distributed import rank_zero_warn diff --git a/pytorch_lightning/plugins/old/ddp_plugin.py b/pytorch_lightning/plugins/legacy/ddp_plugin.py similarity index 99% rename from pytorch_lightning/plugins/old/ddp_plugin.py rename to pytorch_lightning/plugins/legacy/ddp_plugin.py index 360479de5a665..24455bc873919 100644 --- a/pytorch_lightning/plugins/old/ddp_plugin.py +++ b/pytorch_lightning/plugins/legacy/ddp_plugin.py @@ -22,7 +22,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.data_parallel import LightningDistributedModule, prepare_for_backward -from pytorch_lightning.plugins.old.plugin import LightningPlugin +from pytorch_lightning.plugins.legacy.plugin import LightningPlugin from pytorch_lightning.utilities import 
DeviceType diff --git a/pytorch_lightning/plugins/old/ddp_sequential_plugin.py b/pytorch_lightning/plugins/legacy/ddp_sequential_plugin.py similarity index 99% rename from pytorch_lightning/plugins/old/ddp_sequential_plugin.py rename to pytorch_lightning/plugins/legacy/ddp_sequential_plugin.py index dc39d648d2f13..a80f3ef7c795f 100644 --- a/pytorch_lightning/plugins/old/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/legacy/ddp_sequential_plugin.py @@ -21,7 +21,7 @@ from pytorch_lightning import LightningModule from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.old.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/plugins/old/native_amp.py b/pytorch_lightning/plugins/legacy/native_amp.py similarity index 97% rename from pytorch_lightning/plugins/old/native_amp.py rename to pytorch_lightning/plugins/legacy/native_amp.py index 832d6acc672b4..d691134f0b4da 100644 --- a/pytorch_lightning/plugins/old/native_amp.py +++ b/pytorch_lightning/plugins/legacy/native_amp.py @@ -16,7 +16,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.plugins.old.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.legacy.precision_plugin import PrecisionPlugin class NativeAMPPlugin(PrecisionPlugin): diff --git a/pytorch_lightning/plugins/old/plugin.py b/pytorch_lightning/plugins/legacy/plugin.py similarity index 100% rename from pytorch_lightning/plugins/old/plugin.py rename to pytorch_lightning/plugins/legacy/plugin.py diff --git a/pytorch_lightning/plugins/old/plugin_connector.py b/pytorch_lightning/plugins/legacy/plugin_connector.py similarity index 95% rename from pytorch_lightning/plugins/old/plugin_connector.py rename to pytorch_lightning/plugins/legacy/plugin_connector.py index 77dae1229743e..c6af30613c39a 100644 --- a/pytorch_lightning/plugins/old/plugin_connector.py +++ b/pytorch_lightning/plugins/legacy/plugin_connector.py @@ -15,11 +15,11 @@ from typing import List, Optional, Union from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.plugins.old.apex import ApexPlugin -from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin -from pytorch_lightning.plugins.old.plugin import LightningPlugin -from pytorch_lightning.plugins.old.sharded_plugin import DDPShardedPlugin +from pytorch_lightning.plugins.legacy.apex import ApexPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.plugin import LightningPlugin +from pytorch_lightning.plugins.legacy.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import AMPType, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/plugins/old/precision_plugin.py b/pytorch_lightning/plugins/legacy/precision_plugin.py similarity index 95% rename from pytorch_lightning/plugins/old/precision_plugin.py rename to pytorch_lightning/plugins/legacy/precision_plugin.py index 69d8e3670678d..1041e9d6b0faf 100644 --- a/pytorch_lightning/plugins/old/precision_plugin.py +++ b/pytorch_lightning/plugins/legacy/precision_plugin.py @@ -15,7 +15,7 @@ from 
torch.optim import Optimizer -from pytorch_lightning.plugins.old.plugin import LightningPlugin +from pytorch_lightning.plugins.legacy.plugin import LightningPlugin class PrecisionPlugin(LightningPlugin): diff --git a/pytorch_lightning/plugins/old/rpc_plugin.py b/pytorch_lightning/plugins/legacy/rpc_plugin.py similarity index 98% rename from pytorch_lightning/plugins/old/rpc_plugin.py rename to pytorch_lightning/plugins/legacy/rpc_plugin.py index 4445b1d35970e..89f60f1d783c8 100644 --- a/pytorch_lightning/plugins/old/rpc_plugin.py +++ b/pytorch_lightning/plugins/legacy/rpc_plugin.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE DEFAULT_RPC_TIMEOUT_SEC = 60. diff --git a/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/legacy/sharded_native_amp_plugin.py similarity index 94% rename from pytorch_lightning/plugins/old/sharded_native_amp_plugin.py rename to pytorch_lightning/plugins/legacy/sharded_native_amp_plugin.py index c29821dcd8a8d..f507c8c3bd6c0 100644 --- a/pytorch_lightning/plugins/old/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/legacy/sharded_native_amp_plugin.py @@ -15,7 +15,7 @@ from torch.optim import Optimizer -from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/old/sharded_plugin.py b/pytorch_lightning/plugins/legacy/sharded_plugin.py similarity index 95% rename from pytorch_lightning/plugins/old/sharded_plugin.py rename to pytorch_lightning/plugins/legacy/sharded_plugin.py index 19e0859587585..bf008e34fc3ca 100644 --- a/pytorch_lightning/plugins/old/sharded_plugin.py +++ b/pytorch_lightning/plugins/legacy/sharded_plugin.py @@ -15,8 +15,8 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer -from pytorch_lightning.plugins.old.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.old.sharded_native_amp_plugin import ShardedNativeAMPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.sharded_native_amp_plugin import ShardedNativeAMPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, AMPType, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/trainer/connectors/precision_connector.py b/pytorch_lightning/trainer/connectors/precision_connector.py index af8db214eff9d..f3c9de66a811d 100644 --- a/pytorch_lightning/trainer/connectors/precision_connector.py +++ b/pytorch_lightning/trainer/connectors/precision_connector.py @@ -13,8 +13,8 @@ # limitations under the License. 
from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.old.apex import ApexPlugin -from pytorch_lightning.plugins.old.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.apex import ApexPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, AMPType, rank_zero_warn diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5bf2fdcea7991..11e440bf0f52d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -24,7 +24,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.old.plugin_connector import PluginConnector +from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.callbacks import Callback from pytorch_lightning.accelerators.accelerator_connector import BackendConnector diff --git a/tests/plugins/test_plugin_properties.py b/tests/plugins/test_plugin_properties.py index ef87a79d4bb5c..1a6556c0f76ff 100644 --- a/tests/plugins/test_plugin_properties.py +++ b/tests/plugins/test_plugin_properties.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from pytorch_lightning import Trainer -from pytorch_lightning.plugins.old.plugin_connector import LightningCustomPlugins, PluginConnector +from pytorch_lightning.plugins.legacy.plugin_connector import LightningCustomPlugins, PluginConnector def test_available_plugins_trainer(): From 577b00df62cc2b3cbee99a254e44a03578a9d489 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 23 Jan 2021 00:47:27 +0100 Subject: [PATCH 115/157] trainer imports --- pytorch_lightning/trainer/trainer.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 11e440bf0f52d..a6b35a468e48d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,7 +15,6 @@ """Trainer to automate the training.""" import os -from pytorch_lightning.core.memory import ModelSummary import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union @@ -24,15 +23,14 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector -from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes -from pytorch_lightning.callbacks import Callback from pytorch_lightning.accelerators.accelerator_connector import BackendConnector -from pytorch_lightning.callbacks import Callback, ModelCheckpoint +from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.loggers import LightningLoggerBase +from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector from pytorch_lightning.profiler import BaseProfiler from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import 
ConfigValidator @@ -44,7 +42,6 @@ from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.trainer.connectors.model_connector import ModelConnector from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector -from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector @@ -59,15 +56,6 @@ from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin -from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector -from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector -from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector -from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector -from pytorch_lightning.trainer.connectors.model_connector import ModelConnector -from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector -from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector -from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector -from pytorch_lightning import _logger as log from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -75,8 +63,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.model_utils import is_overridden -from pytorch_lightning.trainer.properties import TrainerProperties -from pytorch_lightning.accelerators.accelerator import Accelerator # warnings to ignore in trainer warnings.filterwarnings( From aa4858b070bca27f0c21f1128c0fc1dc734e1958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 03:19:54 +0100 Subject: [PATCH 116/157] fix import errors after rebase --- pytorch_lightning/trainer/trainer.py | 1 + tests/deprecated_api/test_remove_1-4.py | 2 +- tests/models/test_sync_batchnorm.py | 2 +- tests/plugins/test_amp_plugin.py | 2 +- tests/plugins/test_apex_plugin.py | 2 +- tests/plugins/test_ddp_plugin.py | 4 ++-- tests/plugins/test_ddp_sequential_plugin.py | 2 +- tests/plugins/test_plugin.py | 4 ++-- tests/plugins/test_rpc_plugin.py | 2 +- tests/plugins/test_sharded_plugin.py | 2 +- 10 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a6b35a468e48d..584dae3437ff2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -23,6 +23,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule diff --git a/tests/deprecated_api/test_remove_1-4.py 
b/tests/deprecated_api/test_remove_1-4.py index 00f02076fccef..fc3b201d88a74 100644 --- a/tests/deprecated_api/test_remove_1-4.py +++ b/tests/deprecated_api/test_remove_1-4.py @@ -19,7 +19,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin from tests.base import BoringModel from tests.deprecated_api import _soft_unimport_module diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index fe00acff62624..444067d82bd9e 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -17,7 +17,7 @@ import torch.nn.functional as F from pytorch_lightning import LightningModule, seed_everything, Trainer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import FLOAT16_EPSILON from tests.base.datamodules import MNISTDataModule diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py index 1e98740f99d62..48833e292564a 100644 --- a/tests/plugins/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -6,7 +6,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_apex_plugin.py b/tests/plugins/test_apex_plugin.py index df6d76547bcf6..1f452933ec6a0 100644 --- a/tests/plugins/test_apex_plugin.py +++ b/tests/plugins/test_apex_plugin.py @@ -5,7 +5,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.apex import ApexPlugin +from pytorch_lightning.plugins.legacy.apex import ApexPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index fe8fc555ba06c..4bdaad74b67ab 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -6,8 +6,8 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_ddp_sequential_plugin.py b/tests/plugins/test_ddp_sequential_plugin.py index 460d195f6723b..ddb1bd6768e29 100644 --- a/tests/plugins/test_ddp_sequential_plugin.py +++ b/tests/plugins/test_ddp_sequential_plugin.py @@ -20,7 +20,7 @@ from torch import nn from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin +from pytorch_lightning.plugins.legacy.ddp_sequential_plugin import DDPSequentialPlugin from pytorch_lightning.utilities 
import _FAIRSCALE_PIPE_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import RandomDataset diff --git a/tests/plugins/test_plugin.py b/tests/plugins/test_plugin.py index 05789596879b4..4b01b4402611d 100644 --- a/tests/plugins/test_plugin.py +++ b/tests/plugins/test_plugin.py @@ -17,8 +17,8 @@ import pytest from pytorch_lightning import Callback, Trainer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py index a28cd4b50e4f4..77937c16058dc 100644 --- a/tests/plugins/test_rpc_plugin.py +++ b/tests/plugins/test_rpc_plugin.py @@ -7,7 +7,7 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index ac20cd68e36d5..0bd13db5a9052 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -74,7 +74,7 @@ def test_ddp_choice_sharded_amp(tmpdir, accelerator): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.precision_plugin, ShardedNativeMixedPrecisionPlugin)) + assert isinstance(trainer.accelerator_backend.precision_plugin, ShardedNativeMixedPrecisionPlugin) raise SystemExit() model = BoringModel() From f81a44f22a40d5433e7fc41b5f24331703a5059c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 03:33:00 +0100 Subject: [PATCH 117/157] move hook to new setup location --- pytorch_lightning/trainer/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 584dae3437ff2..96f4eaf430101 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -500,7 +500,6 @@ def fit( # SET UP TRAINING # ---------------------------- # self.accelerator_backend = self.accelerator_connector.select_accelerator() - self.call_hook("on_before_accelerator_backend_setup", model) self.accelerator_backend.setup(self, model) self.train_loop.setup_training(model) @@ -511,6 +510,8 @@ def fit( self.call_hook("on_fit_start") # plugin will setup training (e.g. ddp will launch child processes) + # TODO: the old setup is now called "pre_training", where should this hook be called now? 
+ self.call_hook("on_before_accelerator_backend_setup", model) self.training_type_plugin.pre_training() self.call_setup_hook(self.lightning_module) From a2856650291de3b1d0befbd6acc8547029c32b81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 03:44:05 +0100 Subject: [PATCH 118/157] provide unwrapping logic --- .../accelerators/plugins/training_type/ddp.py | 4 ++-- .../accelerators/plugins/training_type/ddp_spawn.py | 4 ++-- pytorch_lightning/overrides/data_parallel.py | 9 +++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/accelerators/plugins/training_type/ddp.py index b314a230076b0..08f27f3d9e15c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp.py @@ -12,7 +12,7 @@ from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only, sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -60,7 +60,7 @@ def root_device(self): @property def lightning_module(self): # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py index f572f9af36f06..622ac2a726998 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py @@ -10,7 +10,7 @@ from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.utilities.distributed import find_free_network_port, rank_zero_only from pytorch_lightning.utilities.distributed import sync_ddp_if_available, rank_zero_warn @@ -52,7 +52,7 @@ def root_device(self): @property def lightning_module(self): # the model may not be wrapped with DistributedDataParallel if calling this too early - return getattr(self._model, "module", self._model) + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 69676cf77e079..84475a755065a 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -62,6 +62,15 @@ def 
get_a_var(obj): # pragma: no-cover warning_cache = WarningCache() +def unwrap_lightning_module(wrapped_model): + model = wrapped_model + if isinstance(model, (LightningDistributedDataParallel, LightningDataParallel)): + model = model.module + if isinstance(model, LightningDistributedModule): + model = model.module + return model + + class LightningDataParallel(DataParallel): """ Override the forward call in lightning so it goes to training and validation step respectively From bf78d7048315ff735c70c9cfe8cfbdd0770a0b05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 03:50:57 +0100 Subject: [PATCH 119/157] fix trainer callback system --- tests/callbacks/test_callbacks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index f3e1dabfb6e59..e9bb7452a1abb 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -55,8 +55,8 @@ def test_trainer_callback_system(torch_save): assert callback_mock.method_calls == [ call.on_init_start(trainer), call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), call.on_fit_start(trainer, model), + call.on_before_accelerator_backend_setup(trainer, model), call.setup(trainer, model, 'fit'), call.on_pretrain_routine_start(trainer, model), call.on_pretrain_routine_end(trainer, model), @@ -111,6 +111,7 @@ def test_trainer_callback_system(torch_save): call.on_init_start(trainer), call.on_init_end(trainer), call.on_fit_start(trainer, model), + call.on_before_accelerator_backend_setup(trainer, model), call.setup(trainer, model, 'test'), # call.on_pretrain_routine_start(trainer, model), # call.on_pretrain_routine_end(trainer, model), From 34947cf0840909bdff0e955dbdac315c89868370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jan 2021 06:04:09 +0100 Subject: [PATCH 120/157] added ddp2 implementation --- .../plugins/training_type/ddp2.py | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py b/pytorch_lightning/accelerators/plugins/training_type/ddp2.py index 078dfe6cd6ec1..ff55ef72e0f83 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py +++ b/pytorch_lightning/accelerators/plugins/training_type/ddp2.py @@ -1,5 +1,41 @@ +import torch + from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.core.step_result import Result + -# TODO: DDP2 class DDP2Plugin(DDPPlugin): - pass \ No newline at end of file + + def setup(self, model): + self._model = model + # set the task idx + self.task_idx = self.cluster_environment.local_rank() + # the difference to DDP is that we don't call children processes here + + def reduce(self, output, *args, **kwargs): + if isinstance(output, Result): + output.dp_reduce() + + elif isinstance(output, torch.Tensor): + output = output.mean() + + return output + + @property + def root_device(self): + return self.parallel_devices[0] + + def model_to_device(self): + # no need to do anything when model is wrapped in torch.nn.DataParallel + pass + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict(num_replicas=self.num_nodes, rank=self.global_rank) + return distributed_sampler_kwargs + + def set_world_ranks(self): + self.local_rank = self.task_idx + self.node_rank = self.cluster_environment.node_rank() + self.global_rank = self.node_rank + 
self.world_size = self.num_nodes From 49bec5391ab019bef1301bb05bb8546e7df463bf Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 Jan 2021 10:15:05 +0100 Subject: [PATCH 121/157] fix imports .legacy --- .../basic_examples/conv_sequential_example.py | 2 +- .../accelerators/legacy/__init__.py | 24 +++++++++---------- .../accelerators/legacy/cpu_accelerator.py | 13 ++++++---- .../accelerators/legacy/ddp_accelerator.py | 8 +++---- .../legacy/ddp_hpc_accelerator.py | 9 +++---- .../legacy/ddp_spawn_accelerator.py | 8 +++---- .../legacy/horovod_accelerator.py | 4 ++-- .../accelerators/legacy/tpu_accelerator.py | 3 ++- 8 files changed, 38 insertions(+), 33 deletions(-) diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py index 84efb4bea7670..38e077071d59e 100644 --- a/pl_examples/basic_examples/conv_sequential_example.py +++ b/pl_examples/basic_examples/conv_sequential_example.py @@ -32,7 +32,7 @@ from pl_examples import cli_lightning_logo from pytorch_lightning import Trainer from pytorch_lightning.metrics.functional import accuracy -from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin +from pytorch_lightning.plugins.legacy.ddp_sequential_plugin import DDPSequentialPlugin from pytorch_lightning.utilities import _BOLTS_AVAILABLE, _FAIRSCALE_PIPE_AVAILABLE if _BOLTS_AVAILABLE: diff --git a/pytorch_lightning/accelerators/legacy/__init__.py b/pytorch_lightning/accelerators/legacy/__init__.py index d566b7301b788..a388f522d63bf 100644 --- a/pytorch_lightning/accelerators/legacy/__init__.py +++ b/pytorch_lightning/accelerators/legacy/__init__.py @@ -11,16 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
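Aside on the DDP2 plugin introduced in the previous commit: it keeps one process per node and lets DP-style logic collapse the per-GPU outputs inside that node, so its reduce() only averages plain tensors (Result objects go through Result.dp_reduce() instead). A tiny illustrative sketch of that tensor fallback, independent of any cluster setup:

    import torch

    per_gpu_losses = torch.tensor([0.5, 1.5])  # what DP hands back from one node's GPUs
    reduced = per_gpu_losses.mean()            # DDP2Plugin.reduce() falls back to a mean for tensors
    assert reduced.item() == 1.0
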
-from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp2_accelerator import DDP2Accelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_accelerator import DDPAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_cpu_hpc_accelerator import DDPCPUHPCAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_cpu_spawn_accelerator import DDPCPUSpawnAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator # noqa: F401 -from pytorch_lightning.accelerators.ddp_spawn_accelerator import DDPSpawnAccelerator # noqa: F401 -from pytorch_lightning.accelerators.dp_accelerator import DataParallelAccelerator # noqa: F401 -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator # noqa: F401 -from pytorch_lightning.accelerators.tpu_accelerator import TPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.cpu_accelerator import CPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp2_accelerator import DDP2Accelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_accelerator import DDPAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_cpu_hpc_accelerator import DDPCPUHPCAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_cpu_spawn_accelerator import DDPCPUSpawnAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_hpc_accelerator import DDPHPCAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.ddp_spawn_accelerator import DDPSpawnAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.dp_accelerator import DataParallelAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.horovod_accelerator import HorovodAccelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.tpu_accelerator import TPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator # noqa: F401 diff --git a/pytorch_lightning/accelerators/legacy/cpu_accelerator.py b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py index 7c80a4a30d223..efe14ff6b9b4b 100644 --- a/pytorch_lightning/accelerators/legacy/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py @@ -15,9 +15,10 @@ import torch -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -79,10 +80,12 @@ def validation_step(self, args): def test_step(self, args): return self._step(self.trainer.model.test_step, args) - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + def sync_tensor( + self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None, + ) -> torch.Tensor: return tensor @property diff --git 
a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py index 987eda50476f1..729ae2ec2ba94 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py @@ -21,12 +21,12 @@ import numpy as np import torch import torch.distributed as torch_distrib -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed @@ -35,7 +35,7 @@ all_gather_ddp_if_available, find_free_network_port, rank_zero_only, - sync_ddp_if_available, + sync_ddp_if_available, ReduceOp, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything diff --git a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py index 8df353b025378..58fd60ac18a69 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py @@ -16,17 +16,18 @@ import torch import torch.distributed as dist import torch.distributed as torch_distrib -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available +from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available, \ + ReduceOp class DDPHPCAccelerator(Accelerator): diff --git a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py index 33af749a229ee..39871a6c6d344 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py @@ -18,12 +18,12 @@ import torch import torch.distributed as torch_distrib import torch.multiprocessing as mp -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin from torch.nn.parallel import 
DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed import LightningDistributed @@ -35,7 +35,7 @@ find_free_network_port, rank_zero_only, rank_zero_warn, - sync_ddp_if_available, + sync_ddp_if_available, ReduceOp, ) from pytorch_lightning.utilities.seed import seed_everything diff --git a/pytorch_lightning/accelerators/legacy/horovod_accelerator.py b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py index 150be86210866..7d41dd990e7ad 100644 --- a/pytorch_lightning/accelerators/legacy/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py @@ -17,10 +17,10 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, AMPType, DeviceType -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp if _HOROVOD_AVAILABLE: import horovod.torch as hvd diff --git a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py index 66fc236a2a775..158978cbcbba9 100644 --- a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py @@ -21,7 +21,7 @@ from torch.optim import Optimizer from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp +from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import ( @@ -32,6 +32,7 @@ rank_zero_warn, ) from pytorch_lightning.utilities.cloud_io import atomic_save +from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException if _TPU_AVAILABLE: From ba1c986a32d744b406b0bd09f5b3c245a003ce6e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 Jan 2021 10:16:46 +0100 Subject: [PATCH 122/157] move plugins --- pytorch_lightning/accelerators/accelerator.py | 4 ++-- .../accelerators/accelerator_connector.py | 4 ++-- pytorch_lightning/accelerators/cpu.py | 2 +- pytorch_lightning/accelerators/plugins/__init__.py | 3 --- .../accelerators/plugins/precision/__init__.py | 5 ----- .../accelerators/plugins/training_type/__init__.py | 10 ---------- pytorch_lightning/plugins/__init__.py | 4 +++- .../{accelerators => }/plugins/base_plugin.py | 0 pytorch_lightning/plugins/precision/__init__.py | 5 +++++ .../{accelerators => }/plugins/precision/apex_amp.py | 2 +- .../{accelerators => }/plugins/precision/mixed.py | 2 +- .../{accelerators => }/plugins/precision/native_amp.py | 2 +- .../plugins/precision/precision_plugin.py | 2 +- .../plugins/precision/sharded_native_amp.py | 2 +- pytorch_lightning/plugins/training_type/__init__.py | 10 ++++++++++ .../{accelerators => }/plugins/training_type/ddp.py | 2 +- .../{accelerators => 
}/plugins/training_type/ddp2.py | 2 +- .../plugins/training_type/ddp_spawn.py | 2 +- .../{accelerators => }/plugins/training_type/dp.py | 2 +- .../plugins/training_type/horovod.py | 2 +- .../plugins/training_type/parallel.py | 2 +- .../plugins/training_type/sharded.py | 2 +- .../plugins/training_type/sharded_spawn.py | 2 +- .../plugins/training_type/single_device.py | 2 +- .../plugins/training_type/training_type_plugin.py | 2 +- pytorch_lightning/trainer/training_loop.py | 2 +- tests/backends/test_accelerator_connector.py | 4 ++-- tests/plugins/test_sharded_plugin.py | 2 +- 28 files changed, 42 insertions(+), 43 deletions(-) delete mode 100644 pytorch_lightning/accelerators/plugins/__init__.py delete mode 100644 pytorch_lightning/accelerators/plugins/precision/__init__.py delete mode 100644 pytorch_lightning/accelerators/plugins/training_type/__init__.py rename pytorch_lightning/{accelerators => }/plugins/base_plugin.py (100%) create mode 100644 pytorch_lightning/plugins/precision/__init__.py rename pytorch_lightning/{accelerators => }/plugins/precision/apex_amp.py (97%) rename pytorch_lightning/{accelerators => }/plugins/precision/mixed.py (62%) rename pytorch_lightning/{accelerators => }/plugins/precision/native_amp.py (94%) rename pytorch_lightning/{accelerators => }/plugins/precision/precision_plugin.py (97%) rename pytorch_lightning/{accelerators => }/plugins/precision/sharded_native_amp.py (92%) create mode 100644 pytorch_lightning/plugins/training_type/__init__.py rename pytorch_lightning/{accelerators => }/plugins/training_type/ddp.py (99%) rename pytorch_lightning/{accelerators => }/plugins/training_type/ddp2.py (93%) rename pytorch_lightning/{accelerators => }/plugins/training_type/ddp_spawn.py (98%) rename pytorch_lightning/{accelerators => }/plugins/training_type/dp.py (93%) rename pytorch_lightning/{accelerators => }/plugins/training_type/horovod.py (98%) rename pytorch_lightning/{accelerators => }/plugins/training_type/parallel.py (96%) rename pytorch_lightning/{accelerators => }/plugins/training_type/sharded.py (96%) rename pytorch_lightning/{accelerators => }/plugins/training_type/sharded_spawn.py (95%) rename pytorch_lightning/{accelerators => }/plugins/training_type/single_device.py (89%) rename pytorch_lightning/{accelerators => }/plugins/training_type/training_type_plugin.py (97%) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 4834fdf39f0ae..711ad367915ad 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -3,8 +3,8 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins import TrainingTypePlugin, HorovodPlugin -from pytorch_lightning.accelerators.plugins.precision import ( +from pytorch_lightning.plugins import TrainingTypePlugin, HorovodPlugin +from pytorch_lightning.plugins .precision import ( ApexMixedPrecisionPlugin, MixedPrecisionPlugin, NativeMixedPrecisionPlugin, diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 808472f4a4c73..baf14c4146aed 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -20,9 +20,9 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from 
pytorch_lightning.accelerators.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ +from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ PrecisionPlugin, ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ +from pytorch_lightning.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index a39aace801993..57dc5bf6a8bbf 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -1,5 +1,5 @@ from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.accelerators.plugins import MixedPrecisionPlugin +from pytorch_lightning.plugins import MixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/accelerators/plugins/__init__.py b/pytorch_lightning/accelerators/plugins/__init__.py deleted file mode 100644 index 119284ef33c76..0000000000000 --- a/pytorch_lightning/accelerators/plugins/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from pytorch_lightning.accelerators.plugins.base_plugin import Plugin -from pytorch_lightning.accelerators.plugins.precision import * -from pytorch_lightning.accelerators.plugins.training_type import * diff --git a/pytorch_lightning/accelerators/plugins/precision/__init__.py b/pytorch_lightning/accelerators/plugins/precision/__init__.py deleted file mode 100644 index 0c7265f4be29d..0000000000000 --- a/pytorch_lightning/accelerators/plugins/precision/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from pytorch_lightning.accelerators.plugins.precision.apex_amp import ApexMixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.accelerators.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/__init__.py b/pytorch_lightning/accelerators/plugins/training_type/__init__.py deleted file mode 100644 index 152fdc68d552e..0000000000000 --- a/pytorch_lightning/accelerators/plugins/training_type/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin -from pytorch_lightning.accelerators.plugins.training_type.ddp2 import DDP2Plugin -from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin -from pytorch_lightning.accelerators.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.horovod import HorovodPlugin -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded import DDPShardedPlugin -from pytorch_lightning.accelerators.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin -from 
pytorch_lightning.accelerators.plugins.training_type.single_device import SingleDevicePlugin -from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index b416a9f56aebe..e023060d5b16a 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -1 +1,3 @@ -from pytorch_lightning.accelerators.plugins import * \ No newline at end of file +from pytorch_lightning.plugins.base_plugin import Plugin +from pytorch_lightning.plugins.precision import * +from pytorch_lightning.plugins.training_type import * diff --git a/pytorch_lightning/accelerators/plugins/base_plugin.py b/pytorch_lightning/plugins/base_plugin.py similarity index 100% rename from pytorch_lightning/accelerators/plugins/base_plugin.py rename to pytorch_lightning/plugins/base_plugin.py diff --git a/pytorch_lightning/plugins/precision/__init__.py b/pytorch_lightning/plugins/precision/__init__.py new file mode 100644 index 0000000000000..8220a1a890867 --- /dev/null +++ b/pytorch_lightning/plugins/precision/__init__.py @@ -0,0 +1,5 @@ +from pytorch_lightning.plugins .precision.apex_amp import ApexMixedPrecisionPlugin +from pytorch_lightning.plugins .precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins .precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.plugins .precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins .precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin diff --git a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py b/pytorch_lightning/plugins/precision/apex_amp.py similarity index 97% rename from pytorch_lightning/accelerators/plugins/precision/apex_amp.py rename to pytorch_lightning/plugins/precision/apex_amp.py index 967324b1a3490..7ba75ca3d9aaa 100644 --- a/pytorch_lightning/accelerators/plugins/precision/apex_amp.py +++ b/pytorch_lightning/plugins/precision/apex_amp.py @@ -3,7 +3,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins .precision.mixed import MixedPrecisionPlugin from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import AMPType, _APEX_AVAILABLE, rank_zero_warn diff --git a/pytorch_lightning/accelerators/plugins/precision/mixed.py b/pytorch_lightning/plugins/precision/mixed.py similarity index 62% rename from pytorch_lightning/accelerators/plugins/precision/mixed.py rename to pytorch_lightning/plugins/precision/mixed.py index f96a47f35c04c..dce279e660144 100644 --- a/pytorch_lightning/accelerators/plugins/precision/mixed.py +++ b/pytorch_lightning/plugins/precision/mixed.py @@ -1,4 +1,4 @@ -from pytorch_lightning.accelerators.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins .precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import AMPType diff --git a/pytorch_lightning/accelerators/plugins/precision/native_amp.py b/pytorch_lightning/plugins/precision/native_amp.py similarity index 94% rename from pytorch_lightning/accelerators/plugins/precision/native_amp.py rename to pytorch_lightning/plugins/precision/native_amp.py index fad0d1f469c34..885d37901d6ee 100644 --- a/pytorch_lightning/accelerators/plugins/precision/native_amp.py +++ b/pytorch_lightning/plugins/precision/native_amp.py @@ -2,7 +2,7 @@ import 
torch -from pytorch_lightning.accelerators.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins .precision.mixed import MixedPrecisionPlugin from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py b/pytorch_lightning/plugins/precision/precision_plugin.py similarity index 97% rename from pytorch_lightning/accelerators/plugins/precision/precision_plugin.py rename to pytorch_lightning/plugins/precision/precision_plugin.py index 120fbcafbecf9..31e94c612804c 100644 --- a/pytorch_lightning/accelerators/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/plugins/precision/precision_plugin.py @@ -4,7 +4,7 @@ import torch from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins.base_plugin import Plugin +from pytorch_lightning.plugins .base_plugin import Plugin from pytorch_lightning.core import LightningModule diff --git a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py b/pytorch_lightning/plugins/precision/sharded_native_amp.py similarity index 92% rename from pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py rename to pytorch_lightning/plugins/precision/sharded_native_amp.py index 969780dd1df7e..d7e8ca0020091 100644 --- a/pytorch_lightning/accelerators/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -15,7 +15,7 @@ from torch.optim import Optimizer -from pytorch_lightning.accelerators.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.plugins .precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _FAIRSCALE_AVAILABLE if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py new file mode 100644 index 0000000000000..7109594600a04 --- /dev/null +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -0,0 +1,10 @@ +from pytorch_lightning.plugins .training_type.ddp import DDPPlugin +from pytorch_lightning.plugins .training_type.ddp2 import DDP2Plugin +from pytorch_lightning.plugins .training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.plugins .training_type.dp import DataParallelPlugin +from pytorch_lightning.plugins .training_type.horovod import HorovodPlugin +from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.sharded import DDPShardedPlugin +from pytorch_lightning.plugins .training_type.sharded_spawn import DDPSpawnShardedPlugin +from pytorch_lightning.plugins .training_type.single_device import SingleDevicePlugin +from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py similarity index 99% rename from pytorch_lightning/accelerators/plugins/training_type/ddp.py rename to pytorch_lightning/plugins/training_type/ddp.py index 08f27f3d9e15c..06c0a5ce5f03b 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -9,7 +9,7 @@ import torch.distributed as torch_distrib from pytorch_lightning import _logger as log 
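With the new plugins/__init__.py, plugins/precision/__init__.py and plugins/training_type/__init__.py shown above re-exporting every plugin class, callers can import from the package root, which is what trainer.py, training_loop.py and the tests switch to later in this commit. A short sketch of the new import surface (names taken from those __init__ modules; illustrative only):

    from pytorch_lightning.plugins import (
        DDPPlugin,
        DDPSpawnPlugin,
        NativeMixedPrecisionPlugin,
        PrecisionPlugin,
        TrainingTypePlugin,
    )
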
-from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py b/pytorch_lightning/plugins/training_type/ddp2.py similarity index 93% rename from pytorch_lightning/accelerators/plugins/training_type/ddp2.py rename to pytorch_lightning/plugins/training_type/ddp2.py index ff55ef72e0f83..c693a004a39e0 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp2.py +++ b/pytorch_lightning/plugins/training_type/ddp2.py @@ -1,6 +1,6 @@ import torch -from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.plugins .training_type.ddp import DDPPlugin from pytorch_lightning.core.step_result import Result diff --git a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py similarity index 98% rename from pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py rename to pytorch_lightning/plugins/training_type/ddp_spawn.py index 622ac2a726998..80886d2555c21 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -7,7 +7,7 @@ import torch.multiprocessing as mp from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module diff --git a/pytorch_lightning/accelerators/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py similarity index 93% rename from pytorch_lightning/accelerators/plugins/training_type/dp.py rename to pytorch_lightning/plugins/training_type/dp.py index d77aa52fc700c..c168aa0a42d00 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -2,7 +2,7 @@ import torch -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin from pytorch_lightning.core.step_result import Result from pytorch_lightning.overrides.data_parallel import LightningDataParallel diff --git a/pytorch_lightning/accelerators/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py similarity index 98% rename from pytorch_lightning/accelerators/plugins/training_type/horovod.py rename to pytorch_lightning/plugins/training_type/horovod.py index eb2edd2f3e414..ca00b01b6f911 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -4,7 +4,7 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from pytorch_lightning.accelerators.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins .training_type.parallel import 
ParallelPlugin from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities import _HOROVOD_AVAILABLE from pytorch_lightning.utilities.distributed import rank_zero_only diff --git a/pytorch_lightning/accelerators/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py similarity index 96% rename from pytorch_lightning/accelerators/plugins/training_type/parallel.py rename to pytorch_lightning/plugins/training_type/parallel.py index 865e7e6b4bd1c..8bc692b97b3ee 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -4,7 +4,7 @@ import torch -from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py similarity index 96% rename from pytorch_lightning/accelerators/plugins/training_type/sharded.py rename to pytorch_lightning/plugins/training_type/sharded.py index 1ba54bf8419bb..fb24f8c73315d 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -1,6 +1,6 @@ from typing import Optional -from pytorch_lightning.accelerators.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.plugins .training_type.ddp import DDPPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only diff --git a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py similarity index 95% rename from pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py rename to pytorch_lightning/plugins/training_type/sharded_spawn.py index 04e171bb9d5a0..c1020457e3bec 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -1,6 +1,6 @@ from typing import Optional -from pytorch_lightning.accelerators.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.plugins .training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only diff --git a/pytorch_lightning/accelerators/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py similarity index 89% rename from pytorch_lightning/accelerators/plugins/training_type/single_device.py rename to pytorch_lightning/plugins/training_type/single_device.py index 200072ee82651..c83d9685c428c 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -1,6 +1,6 @@ import torch -from pytorch_lightning.accelerators.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin class SingleDevicePlugin(TrainingTypePlugin): diff --git 
a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py similarity index 97% rename from pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py rename to pytorch_lightning/plugins/training_type/training_type_plugin.py index c5e400494e82c..363dde8e593f3 100644 --- a/pytorch_lightning/accelerators/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -5,7 +5,7 @@ import torch from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.plugins.base_plugin import Plugin +from pytorch_lightning.plugins .base_plugin import Plugin class TrainingTypePlugin(Plugin, ABC): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index b3510f0f400fe..bedd4c57f749d 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -18,7 +18,7 @@ import numpy as np import torch -from pytorch_lightning.accelerators.plugins import ParallelPlugin +from pytorch_lightning.plugins import ParallelPlugin from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 92950274e49cd..79b0505fcdcba 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -22,8 +22,8 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.accelerators.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin -from pytorch_lightning.accelerators.plugins import PrecisionPlugin +from pytorch_lightning.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, DDP2Plugin +from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.callbacks import Callback from pytorch_lightning.cluster_environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from tests.base.boring_model import BoringModel diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 0bd13db5a9052..bc4a21db554af 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -5,7 +5,7 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin, \ +from pytorch_lightning.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin, \ ShardedNativeMixedPrecisionPlugin from pytorch_lightning.callbacks import Callback from pytorch_lightning.utilities import _APEX_AVAILABLE, _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE From 45dfbb7b11b123b497fd70de0901d9d1248aaaab Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 Jan 2021 10:42:33 +0100 Subject: [PATCH 123/157] restore legacy --- .../accelerators/legacy/accelerator.py | 255 +++++++++++++++ .../legacy/accelerator_connector.py | 42 ++- .../accelerators/legacy/cpu_accelerator.py | 2 +- .../accelerators/legacy/ddp2_accelerator.py | 269 ++++++++++++++++ .../accelerators/legacy/ddp_accelerator.py | 2 +- .../legacy/ddp_cpu_hpc_accelerator.py | 48 +++ .../legacy/ddp_cpu_spawn_accelerator.py | 297 ++++++++++++++++++ 
.../legacy/ddp_hpc_accelerator.py | 2 +- .../legacy/ddp_spawn_accelerator.py | 2 +- .../accelerators/legacy/dp_accelerator.py | 189 +++++++++++ .../accelerators/legacy/gpu_accelerator.py | 109 +++++++ .../legacy/horovod_accelerator.py | 2 +- .../accelerators/legacy/tpu_accelerator.py | 2 +- 13 files changed, 1200 insertions(+), 21 deletions(-) create mode 100644 pytorch_lightning/accelerators/legacy/accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/ddp2_accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/ddp_cpu_hpc_accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/ddp_cpu_spawn_accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/dp_accelerator.py create mode 100644 pytorch_lightning/accelerators/legacy/gpu_accelerator.py diff --git a/pytorch_lightning/accelerators/legacy/accelerator.py b/pytorch_lightning/accelerators/legacy/accelerator.py new file mode 100644 index 0000000000000..ea6b21e714b2f --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/accelerator.py @@ -0,0 +1,255 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from contextlib import contextmanager +from typing import Any, Optional, Union + +import torch +from torch.optim import Optimizer + +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin +from pytorch_lightning.utilities.apply_func import move_data_to_device +from pytorch_lightning.utilities.parsing import AttributeDict + +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + class ReduceOp: + SUM = None + + +class Accelerator(object): + + def __init__(self, + trainer: Optional = None, + cluster_environment: Optional[ClusterEnvironment] = None, + ddp_plugin: Optional[DDPPlugin] = None): + self.trainer = trainer + self.nickname = None + self.cluster_environment = cluster_environment + self.dist = AttributeDict(rank=0, device=None) + self.ddp_plugin = ddp_plugin + + if trainer is not None: + self.train_loop = self.trainer.train + self.validation_loop = self.trainer.run_evaluation + self.test_loop = self.trainer.run_evaluation + + def setup(self, model): + pass + + def teardown(self): + # Ensure if necessary all processes are finished + self.barrier() + + def barrier(self, name: Optional[str] = None): + pass + + def broadcast(self, obj, src=0): + return obj + + def train_or_test(self): + if self.trainer.testing: + results = self.trainer.run_test() + else: + results = self.trainer.train() + return results + + def batch_to_device(self, batch: Any, device: torch.device): + model = self.trainer.get_model() + if model is not None: + return model.transfer_batch_to_device(batch, device) + return move_data_to_device(batch, device) + + def training_step_end(self, output): + return output + + def 
test_step_end(self, output): + return output + + def validation_step_end(self, output): + return output + + def process_dataloader(self, dataloader): + return dataloader + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + automatic_optimization = self.trainer.train_loop.automatic_optimization + + if not automatic_optimization and self.ddp_plugin is not None: + # Manually prepare for reduce as user calling backwards manually + self.ddp_plugin.on_before_manual_backward(self.trainer.model, closure_loss) + + if self.trainer.precision == 16: + closure_loss = self.trainer.precision_connector.backend.backward( + closure_loss, optimizer, opt_idx, *args, **kwargs + ) + else: + # do backward pass + model = self.trainer.get_model() + model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs) + + # once backward has been applied, release graph + closure_loss = closure_loss.detach() + return closure_loss + + def clip_gradients(self, optimizer, clip_val=None): + # use the trainer's clip val if none passed + grad_clip_val = self.trainer.gradient_clip_val + if clip_val is not None: + grad_clip_val = clip_val + grad_clip_val = float(grad_clip_val) + + if grad_clip_val <= 0: + return + self._clip_gradients(optimizer, grad_clip_val) + + def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + if self.trainer.amp_backend: + self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer, norm_type) + else: + model = self.trainer.get_model() + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + + def on_train_epoch_end(self, outputs): + pass + + def on_train_end(self): + pass + + def early_stopping_should_stop(self, pl_module): + return self.trainer.should_stop + + def setup_optimizers(self, model): + if self.trainer.testing: + return + + optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) + self.trainer.optimizers = optimizers + self.trainer.lr_schedulers = lr_schedulers + self.trainer.optimizer_frequencies = optimizer_frequencies + + def init_ddp_connection( + self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True + ) -> None: + self.ddp_plugin.init_ddp_connection( + self.trainer, + self.cluster_environment, + global_rank, + world_size, + is_slurm_managing_tasks, + ) + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + """ + Function to reduce a tensor from several distributed processes to one aggregated tensor. + + Args: + tensor: the tensor to sync and reduce + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + raise NotImplementedError() + + def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): + """ + Function to gather a tensor from several distributed processes + + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + + Return: + A tensor of shape (world_size, batch, ...) 
+ """ + raise NotImplementedError() + + def optimizer_state(self, optimizer: Optimizer) -> dict: + """ + Returns state of an optimizer. Allows for syncing/collating optimizer state from processes in custom + plugins. + Return: + Optimizer state dict + """ + if self.ddp_plugin: + return self.ddp_plugin.optimizer_state(optimizer) + return optimizer.state_dict() + + def get_reference_model(self, model) -> LightningModule: + """ + Override to modify returning base :class:`LightningModule` + when accessing variable and functions if the accelerator has wrapped the model. + + Example:: + ref_model = accelerator.get_reference_model(model) + ref_model.training_step(...) + + Args: + model: Accelerator model. + + Returns: Reference :class:`LightningModule`. + + """ + return model + + def __getstate__(self): + return { + 'trainer': self.trainer, + 'nickname': self.nickname, + 'cluster_environment': self.cluster_environment, + 'dist': self.dist, + 'ddp_plugin': self.ddp_plugin + } + + def __setstate__(self, d): + self.trainer = d['trainer'] + self.nickname = d['nickname'] + self.cluster_environment = d['cluster_environment'] + self.dist = d['dist'] + self.ddp_plugin = d['ddp_plugin'] + + def on_save(self, checkpoint): + return checkpoint + + @property + def rpc_enabled(self): + return self.ddp_plugin is not None and isinstance(self.ddp_plugin, RPCPlugin) + + @property + def distributed_sampler_kwargs(self): + raise NotImplementedError + + @property + def require_distributed_sampler(self): + raise NotImplementedError + + @contextmanager + def block_ddp_plugin_sync_behaviour(self): + """ + Blocks ddp sync gradients behaviour on backwards pass. + This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + cm = self.ddp_plugin.block_backward_sync(self.trainer.model) if self.ddp_plugin else None + yield cm diff --git a/pytorch_lightning/accelerators/legacy/accelerator_connector.py b/pytorch_lightning/accelerators/legacy/accelerator_connector.py index d9dcc5cbd0a88..8b5e5314b2c54 100644 --- a/pytorch_lightning/accelerators/legacy/accelerator_connector.py +++ b/pytorch_lightning/accelerators/legacy/accelerator_connector.py @@ -16,8 +16,20 @@ import torch from pytorch_lightning import _logger as log -from pytorch_lightning import accelerators -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy import ( + DDP2Accelerator, + DDPCPUHPCAccelerator, + DDPHPCAccelerator, + DDPSpawnAccelerator, + DDPCPUSpawnAccelerator, + DDPAccelerator, + DataParallelAccelerator, + HorovodAccelerator, + TPUAccelerator, + GPUAccelerator, + CPUAccelerator, +) from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.utilities import ( @@ -220,42 +232,42 @@ def select_accelerator(self): # TODO: clean-up this branching as most just select class and uses the very same arguments # choose the appropriate accelerator backend if self.trainer._distrib_type == DistributedType.DDP2: - accelerator_backend = accelerators.DDP2Accelerator( + accelerator_backend = DDP2Accelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_ddp_cpu_slurm: - accelerator_backend = accelerators.DDPCPUHPCAccelerator( + accelerator_backend = DDPCPUHPCAccelerator( 
self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_slurm_ddp: - accelerator_backend = accelerators.DDPHPCAccelerator( + accelerator_backend = DDPHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_ddp_cpu_torch_elastic: - accelerator_backend = accelerators.DDPCPUHPCAccelerator( + accelerator_backend = DDPCPUHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif use_torchelastic_ddp: - accelerator_backend = accelerators.DDPHPCAccelerator( + accelerator_backend = DDPHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin ) elif self.trainer._distrib_type == DistributedType.DDP_SPAWN: - accelerator_backend = accelerators.DDPSpawnAccelerator( + accelerator_backend = DDPSpawnAccelerator( self.trainer, nprocs=self.trainer.num_processes, cluster_environment=cluster_env, @@ -263,7 +275,7 @@ def select_accelerator(self): ) elif use_ddp_cpu_spawn: - accelerator_backend = accelerators.DDPCPUSpawnAccelerator( + accelerator_backend = DDPCPUSpawnAccelerator( self.trainer, nprocs=self.trainer.num_processes, cluster_environment=cluster_env, @@ -271,26 +283,26 @@ def select_accelerator(self): ) elif self.trainer.distributed_backend == "ddp": - accelerator_backend = accelerators.DDPAccelerator( + accelerator_backend = DDPAccelerator( self.trainer, cluster_env, ddp_plugin=self.trainer.plugin_connector.ddp_plugin ) elif self.trainer._distrib_type == DistributedType.DP: - accelerator_backend = accelerators.DataParallelAccelerator(self.trainer, cluster_env) + accelerator_backend = DataParallelAccelerator(self.trainer, cluster_env) elif self.trainer._distrib_type == DistributedType.HOROVOD: - accelerator_backend = accelerators.HorovodAccelerator(self.trainer, cluster_env) + accelerator_backend = HorovodAccelerator(self.trainer, cluster_env) elif self.trainer._device_type == DeviceType.GPU and self.trainer.num_gpus == 1: - accelerator_backend = accelerators.GPUAccelerator(self.trainer, cluster_env) + accelerator_backend = GPUAccelerator(self.trainer, cluster_env) elif self.trainer._device_type == DeviceType.TPU: - accelerator_backend = accelerators.TPUAccelerator(self.trainer, cluster_env) + accelerator_backend = TPUAccelerator(self.trainer, cluster_env) elif self.trainer.distributed_backend is None: - accelerator_backend = accelerators.CPUAccelerator(self.trainer, cluster_env) + accelerator_backend = CPUAccelerator(self.trainer, cluster_env) else: raise MisconfigurationException( f'`Trainer(accelerator={self.trainer.distributed_backend}, num_nodes={self.trainer.num_nodes},' diff --git a/pytorch_lightning/accelerators/legacy/cpu_accelerator.py b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py index efe14ff6b9b4b..e7d42e2647e93 100644 --- a/pytorch_lightning/accelerators/legacy/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/cpu_accelerator.py @@ -15,7 +15,7 @@ import torch -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import ReduceOp diff --git a/pytorch_lightning/accelerators/legacy/ddp2_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp2_accelerator.py new file mode 100644 index 0000000000000..95ea4ab2686da --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/ddp2_accelerator.py @@ -0,0 
+1,269 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +from typing import Any, List, Optional, Union + +import torch +import torch.distributed as torch_distrib +from torch.nn.parallel import DistributedDataParallel + +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.core.step_result import Result +from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available, \ + ReduceOp + + +class DDP2Accelerator(Accelerator): + + def __init__(self, + trainer, + cluster_environment: Optional[ClusterEnvironment] = None, + ddp_plugin: Optional[DDPPlugin] = None): + """ + Runs training using DDP2 strategy on a cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDP2Accelerator()) + + """ + super().__init__(trainer, cluster_environment, ddp_plugin) + self.task_idx = None + self.dist = LightningDistributed() + self.nickname = 'ddp2' + + def setup(self, model): + self.trainer.model = model + self.task_idx = self.cluster_environment.local_rank() + + def train(self): + model = self.trainer.model + return self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) + + def training_step(self, args): + return self._step(args) + + def validation_step(self, args): + return self._step(args) + + def test_step(self, args): + return self._step(args) + + def _step(self, args): + args = self.ddp_plugin.on_before_forward(self.trainer.get_model(), *args) + if self.trainer.amp_backend == AMPType.NATIVE: + with torch.cuda.amp.autocast(): + output = self.trainer.model(*args) + else: + output = self.trainer.model(*args) + return output + + def barrier(self, name: Optional[str] = None): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def training_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + return output + + def validation_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + return output + + def test_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + return output + + def set_world_ranks(self, process_idx): + # Todo: required argument `process_idx` is not used + self.trainer.local_rank = self.trainer.node_rank + self.trainer.global_rank = self.trainer.node_rank + self.trainer.world_size = self.trainer.num_nodes + + def broadcast(self, obj, src=0): + return self.dist.broadcast(obj) + + def init_device(self, process_idx): + self.trainer.root_gpu = process_idx + torch.cuda.set_device(self.trainer.root_gpu) + + def 
model_to_device(self, model): + model.cuda(self.trainer.root_gpu) + + def get_device_ids(self): + device_ids = self.trainer.data_parallel_device_ids + return device_ids + + def ddp_train(self, process_idx, mp_queue, model): + """ + Entry point for ddp + + Args: + process_idx: current process rank + mp_queue: multiprocessing queue + model: pointer to current :class:`LightningModule` + + Returns: + Dict with evaluation results + + """ + # Todo: required argument `mp_queue` is not used + # show progressbar only on progress_rank 0 + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + # determine which process we are and world size + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.trainer.global_rank + + # Initialize cuda device + self.init_device(process_idx) + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + model.trainer = self.trainer + self.init_ddp_connection( + self.trainer.global_rank, + self.trainer.world_size, + self.trainer.is_slurm_managing_tasks + ) + + if isinstance(self.ddp_plugin, RPCPlugin): + if not self.ddp_plugin.is_main_rpc_process: + self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer) + self.ddp_plugin.exit_rpc_process() + if self.ddp_plugin.return_after_exit_rpc_process: + return + else: + self.ddp_plugin.on_main_rpc_connection(self.trainer) + + # call setup after the ddp process has connected + self.trainer.call_setup_hook(model) + + # on world_size=0 let everyone know training is starting + if self.trainer.is_global_zero and not torch.distributed.is_initialized(): + log.info('-' * 100) + log.info(f'distributed_backend={self.trainer.distributed_backend}') + log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') + log.info('-' * 100) + + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_batchnorm: + model = self.configure_sync_batchnorm(model) + + # move the model to the correct device + self.model_to_device(model) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + self.ddp_plugin.on_after_setup_optimizers(self.trainer) + + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(model) + + # 16-bit + model = self.trainer.precision_connector.connect(model) + + self.trainer.convert_to_lightning_optimizers() + + # device ids change depending on the DDP setup + device_ids = self.get_device_ids() + + # allow user to configure ddp + model = self.configure_ddp(model, device_ids) + + # set up training routine + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + + # clean up memory + torch.cuda.empty_cache() + return results + + def configure_ddp( + self, model: LightningModule, device_ids: List[int] + ) -> DistributedDataParallel: + model = self.ddp_plugin.configure_ddp(model, device_ids) + return model + + def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. 
+ + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) + + def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): + """ + Function to gather a tensor from several distributed processes + + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + + Return: + A tensor of shape (world_size, batch, ...) + """ + return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) + + def get_reference_model(self, model) -> LightningModule: + return self.ddp_plugin.get_model_from_plugin(model) + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=self.trainer.num_nodes, + rank=self.trainer.global_rank + ) + if self.ddp_plugin is not None: + distributed_sampler_kwargs = self.ddp_plugin.distributed_sampler_kwargs(distributed_sampler_kwargs) + return distributed_sampler_kwargs + + @property + def require_distributed_sampler(self): + return True diff --git a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py index 729ae2ec2ba94..ff0466662226a 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_accelerator.py @@ -26,7 +26,7 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed diff --git a/pytorch_lightning/accelerators/legacy/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_cpu_hpc_accelerator.py new file mode 100644 index 0000000000000..8ec4d18509cab --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/ddp_cpu_hpc_accelerator.py @@ -0,0 +1,48 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License +from typing import Optional + +from pytorch_lightning.accelerators.legacy.ddp_hpc_accelerator import DDPHPCAccelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin + + +class DDPCPUHPCAccelerator(DDPHPCAccelerator): + + def __init__(self, + trainer, + cluster_environment: Optional[ClusterEnvironment] = None, + ddp_plugin: Optional[DDPPlugin] = None): + """ + Runs training using DDP (with CPUs) strategy on a cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDPCPUHPCAccelerator()) + + """ + super().__init__(trainer, cluster_environment, ddp_plugin) + self.nickname = 'ddp_cpu' + + def model_to_device(self, model, process_idx): + # Todo: required argument `process_idx` is not used + model.cpu() + + def get_device_ids(self): + device_ids = None + return device_ids + + def init_device(self, process_idx): + pass diff --git a/pytorch_lightning/accelerators/legacy/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_cpu_spawn_accelerator.py new file mode 100644 index 0000000000000..1559ad671e4d8 --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/ddp_cpu_spawn_accelerator.py @@ -0,0 +1,297 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License +import os +from typing import Any, List, Optional, Union + +import torch +import torch.distributed as torch_distrib +import torch.multiprocessing as mp +from torch.nn.parallel import DistributedDataParallel + +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.distributed import ( + all_gather_ddp_if_available, + find_free_network_port, + rank_zero_only, + rank_zero_warn, + sync_ddp_if_available, ReduceOp, +) + + +class DDPCPUSpawnAccelerator(Accelerator): + + def __init__(self, + trainer, + nprocs: int, + cluster_environment: Optional[ClusterEnvironment] = None, + ddp_plugin: Optional[DDPPlugin] = None): + """ + Runs training using DDP (on a single machine or manually on multiple machines), using mp.spawn + + Example:: + + # default + trainer = Trainer(accelerator=DDPCPUSpawnAccelerator()) + + """ + super().__init__(trainer, cluster_environment, ddp_plugin) + self.mp_queue = None + self.nprocs = nprocs + self.dist = LightningDistributed() + self.nickname = 'ddp_cpu' + + def setup(self, model): + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) + + # pass in a state q + smp = mp.get_context('spawn') + self.mp_queue = smp.SimpleQueue() + + self.trainer.model = model + + def train(self): + model = self.trainer.model + + # train in children process + mp.spawn(self.ddp_train, nprocs=self.nprocs, args=(self.mp_queue, model,)) + + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + + # recover the weights of the processes trained in the children + self.__recover_child_process_weights(model, best_path) + return results + + def ddp_train(self, process_idx, mp_queue, model): + """ + Entry point for ddp + + Args: + process_idx: + mp_queue: multiprocessing queue + model: + """ + # show progressbar only on progress_rank 0 + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: + self.trainer.progress_bar_callback.disable() + + # determine which process we are and world size + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.trainer.global_rank + + # set up server using proc 0's ip address + # try to init for 20 times at max in case ports are taken + # where to store ip_table + model.trainer = self.trainer + self.init_ddp_connection( + self.trainer.global_rank, + self.trainer.world_size, + self.trainer.is_slurm_managing_tasks + ) + + if isinstance(self.ddp_plugin, RPCPlugin): + if not self.ddp_plugin.is_main_rpc_process: + self.ddp_plugin.on_accelerator_exit_rpc_process(self.trainer) + self.ddp_plugin.exit_rpc_process() + if self.ddp_plugin.return_after_exit_rpc_process: + return + else: + self.ddp_plugin.on_main_rpc_connection(self.trainer) + + # call setup after the ddp process has connected + self.trainer.call_setup_hook(model) + + # on world_size=0 let everyone know training is starting + if self.trainer.is_global_zero and not 
torch.distributed.is_initialized(): + log.info('-' * 100) + log.info(f'distributed_backend={self.trainer.distributed_backend}') + log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') + log.info('-' * 100) + + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_batchnorm: + model = self.configure_sync_batchnorm(model) + + # move the model to the correct device + self.model_to_device(model, process_idx) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + self.ddp_plugin.on_after_setup_optimizers(self.trainer) + + # set model properties before going into wrapper + self.trainer.model_connector.copy_trainer_model_properties(model) + + # 16-bit + model = self.trainer.precision_connector.connect(model) + + self.trainer.convert_to_lightning_optimizers() + + # DDP spawn already spawned off each process... no need to do anything + device_ids = self.get_device_ids() + + # allow user to configure ddp + model = self.configure_ddp(model, device_ids) + + # set up training routine + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + + # get original model + model = self.trainer.get_model() + + # persist info in ddp_spawn + self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) + + # clean up memory + torch.cuda.empty_cache() + + def training_step(self, args): + return self._step(args) + + def validation_step(self, args): + return self._step(args) + + def test_step(self, args): + return self._step(args) + + def _step(self, args): + args = self.ddp_plugin.on_before_forward(self.trainer.get_model(), *args) + if self.trainer.amp_backend == AMPType.NATIVE: + with torch.cuda.amp.autocast(): + output = self.trainer.model(*args) + else: + output = self.trainer.model(*args) + return output + + def barrier(self, name: Optional[str] = None): + if torch_distrib.is_initialized(): + torch_distrib.barrier() + + def broadcast(self, obj, src=0): + return self.dist.broadcast(obj) + + def early_stopping_should_stop(self, pl_module): + stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) + torch_distrib.all_reduce(stop, op=torch_distrib.reduce_op.SUM) + torch_distrib.barrier() + should_stop = stop == self.trainer.world_size + return should_stop + + def set_world_ranks(self, process_idx): + self.trainer.local_rank = process_idx + self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx + self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes + + def model_to_device(self, model, process_idx): + # Todo: required argument `process_idx` is not used + model.cpu() + + def get_device_ids(self): + device_ids = None + return device_ids + + def __recover_child_process_weights(self, model, best_path): + # transfer back the best path to the trainer + if self.trainer.checkpoint_callback: + self.trainer.checkpoint_callback.best_model_path = best_path + + self.trainer.model = model + + def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): + # Todo: required argument `model` is not used + # track the best model path + best_model_path = None + if self.trainer.checkpoint_callback is not None: + best_model_path = self.trainer.checkpoint_callback.best_model_path + + if self.trainer.global_rank == 0 and mp_queue is not None: + rank_zero_warn('cleaning up ddp environment...') + # todo, pass complete checkpoint as state dictionary + mp_queue.put(best_model_path) 
+ mp_queue.put(results) + + def configure_ddp( + self, model: LightningModule, device_ids: List[int] + ) -> DistributedDataParallel: + model = self.ddp_plugin.configure_ddp(model, device_ids) + return model + + def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) + + def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): + """ + Function to gather a tensor from several distributed processes + + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + + Return: + A tensor of shape (world_size, batch, ...) + """ + return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) + + def get_reference_model(self, model) -> LightningModule: + return self.ddp_plugin.get_model_from_plugin(model) + + @property + def distributed_sampler_kwargs(self): + distributed_sampler_kwargs = dict( + num_replicas=self.trainer.num_nodes * self.trainer.num_processes, + rank=self.trainer.global_rank + ) + if self.ddp_plugin is not None: + distributed_sampler_kwargs = self.ddp_plugin.distributed_sampler_kwargs(distributed_sampler_kwargs) + return distributed_sampler_kwargs + + @property + def require_distributed_sampler(self): + return True diff --git a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py index 58fd60ac18a69..0d45300e0106e 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_hpc_accelerator.py @@ -21,7 +21,7 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed diff --git a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py index 39871a6c6d344..e2e9e3062a909 100644 --- a/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/ddp_spawn_accelerator.py @@ -23,7 +23,7 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core.lightning import 
LightningModule from pytorch_lightning.distributed import LightningDistributed diff --git a/pytorch_lightning/accelerators/legacy/dp_accelerator.py b/pytorch_lightning/accelerators/legacy/dp_accelerator.py new file mode 100644 index 0000000000000..13bed9082c24a --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/dp_accelerator.py @@ -0,0 +1,189 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +import torch +from torch import optim + +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.core.step_result import Result +from pytorch_lightning.distributed import LightningDistributed +from pytorch_lightning.overrides.data_parallel import LightningDataParallel +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class DataParallelAccelerator(Accelerator): + + def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None): + """ + Runs training using DP via manual start (not HPC cluster) + + Example:: + + # default + trainer = Trainer(accelerator=DataParallelAccelerator()) + + """ + super().__init__(trainer, cluster_environment) + self.model_autocast_original_forward = None + self.dist = LightningDistributed() + self.nickname = 'dp' + + def setup(self, model): + # call setup after the ddp process has connected + self.trainer.call_setup_hook(model) + + # put model on correct device + model.cuda(self.trainer.root_gpu) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + # init torch data parallel + model = self.__init_torch_data_parallel(model) + + # hack forward to do autocast for the user + self.model_autocast_original_forward = model.forward + + # init half precision + if self.trainer.amp_backend: + model = self.__init_half_precision(model) + + self.trainer.convert_to_lightning_optimizers() + + self.trainer.model = model + + def __init_torch_data_parallel(self, model): + # create list of device ids + device_ids = self.trainer.data_parallel_device_ids + if isinstance(device_ids, int): + device_ids = list(range(device_ids)) + + # set dp device + torch.cuda.set_device(self.trainer.root_gpu) + model = LightningDataParallel(model, device_ids=device_ids) + return model + + def __init_half_precision(self, model): + if self.trainer.amp_backend == AMPType.NATIVE: + self.__init_native_amp(model) + else: + model = self.__init_nvidia_apex(model) + return model + + def __init_native_amp(self, model): + model.forward = torch.cuda.amp.autocast()(model.forward) + + def __init_nvidia_apex(self, model): + # check for this bug (amp + dp + !01 doesn't work) + # https://github.com/NVIDIA/apex/issues/227 + if self.trainer.amp_level == 'O2': + raise MisconfigurationException( + f'Amp level {self.trainer.amp_level} with 
DataParallel is not supported.' + f' See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.' + f' We recommend you switch to ddp if you want to use amp') + else: + model = self.trainer.precision_connector.connect(model) + + return model + + def train(self): + model = self.trainer.model + # set up training routine + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + + return results + + def teardown(self): + # replace the original fwd function + self.trainer.model.forward = self.model_autocast_original_forward + self.barrier() + + def _step(self, args): + if self.trainer.amp_backend == AMPType.NATIVE: + with torch.cuda.amp.autocast(): + output = self.trainer.model(*args) + else: + output = self.trainer.model(*args) + return output + + def training_step(self, args): + return self._step(args) + + def validation_step(self, args): + return self._step(args) + + def test_step(self, args): + return self._step(args) + + def training_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + elif isinstance(output, torch.Tensor): + output = output.mean() + return output + + def validation_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + elif isinstance(output, torch.Tensor): + output = output.mean() + return output + + def test_step_end(self, output): + if isinstance(output, Result): + output.dp_reduce() + elif isinstance(output, torch.Tensor): + output = output.mean() + return output + + def reinit_scheduler_properties(self, optimizers: list, schedulers: list): + """ + Reinitialize optimizer.step properties added by schedulers + """ + for scheduler in schedulers: + scheduler = scheduler['scheduler'] + + for optimizer in optimizers: + # check that we dont mix users optimizers and schedulers + if scheduler.optimizer == optimizer: + # Find the mro belonging to the base lr scheduler class + for i, mro in enumerate(scheduler.__class__.__mro__): + is_regular_scheduler = optim.lr_scheduler._LRScheduler + is_lr_reduce_on_plateau = optim.lr_scheduler.ReduceLROnPlateau + if is_regular_scheduler or is_lr_reduce_on_plateau: + idx = i + state = scheduler.state_dict() + else: + state = None + + scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) + if state is not None: + scheduler.load_state_dict(state) + + def get_reference_model(self, model) -> LightningModule: + if isinstance(model, LightningDataParallel): + return model.module + return model + + @property + def require_distributed_sampler(self): + return False diff --git a/pytorch_lightning/accelerators/legacy/gpu_accelerator.py b/pytorch_lightning/accelerators/legacy/gpu_accelerator.py new file mode 100644 index 0000000000000..2314a8c8c7987 --- /dev/null +++ b/pytorch_lightning/accelerators/legacy/gpu_accelerator.py @@ -0,0 +1,109 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Any, Callable, Optional, Union + +import torch + +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator +from pytorch_lightning.cluster_environments import ClusterEnvironment +from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.utilities import AMPType +from pytorch_lightning.utilities.distributed import ReduceOp + + +class GPUAccelerator(Accelerator): + amp_backend: AMPType + + def __init__(self, trainer, cluster_environment: Optional[ClusterEnvironment] = None): + """ + Runs training using a single GPU + + Example:: + + # default + trainer = Trainer(accelerator=GPUAccelerator()) + + """ + super().__init__(trainer, cluster_environment) + self.dist = LightningDistributed() + self.nickname = None + + def setup(self, model): + + # call setup + self.trainer.call_setup_hook(model) + + torch.cuda.set_device(self.trainer.root_gpu) + model.cuda(self.trainer.root_gpu) + + # CHOOSE OPTIMIZER + # allow for lr schedulers as well + self.setup_optimizers(model) + + # 16-bit + model = self.trainer.precision_connector.connect(model) + + self.trainer.convert_to_lightning_optimizers() + + self.trainer.model = model + + def train(self): + model = self.trainer.model + + # set up training routine + self.trainer.train_loop.setup_training(model) + + # train or test + results = self.train_or_test() + return results + + def _step(self, model_step: Callable, args): + args[0] = self.to_device(args[0]) + + if self.trainer.amp_backend == AMPType.NATIVE: + with torch.cuda.amp.autocast(): + output = model_step(*args) + else: + output = model_step(*args) + + return output + + def training_step(self, args): + return self._step(self.trainer.model.training_step, args) + + def validation_step(self, args): + return self._step(self.trainer.model.validation_step, args) + + def test_step(self, args): + return self._step(self.trainer.model.test_step, args) + + def to_device(self, batch): + gpu_id = 0 + if isinstance(self.trainer.data_parallel_device_ids, list): + gpu_id = self.trainer.data_parallel_device_ids[0] + + # Don't copy the batch since there is a single gpu that the batch could + # be referenced from and if there are multiple optimizers the batch will + # wind up copying it to the same device repeatedly. 
+ return self.batch_to_device(batch, gpu_id) + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return tensor + + @property + def require_distributed_sampler(self): + return False diff --git a/pytorch_lightning/accelerators/legacy/horovod_accelerator.py b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py index 7d41dd990e7ad..dd9cd911d97d5 100644 --- a/pytorch_lightning/accelerators/legacy/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/horovod_accelerator.py @@ -17,7 +17,7 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, AMPType, DeviceType from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp diff --git a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py index 158978cbcbba9..4cdf3354556d5 100644 --- a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py @@ -21,7 +21,7 @@ from torch.optim import Optimizer from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.legacy.accelerator import Accelerator from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import ( From 9b7326a25c68b89d41105df80e8e24fb9c7decb8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 Jan 2021 11:09:38 +0100 Subject: [PATCH 124/157] drop test.py from root --- test.py | 97 --------------------------------------------------------- 1 file changed, 97 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 959436c179c21..0000000000000 --- a/test.py +++ /dev/null @@ -1,97 +0,0 @@ -import torch -import pytorch_lightning as pl - -class RandomDataset(torch.utils.data.Dataset): - def __init__(self, size, length): - self.len = length - self.data = torch.randn(length, size) - - def __getitem__(self, index): - return self.data[index] - - def __len__(self): - return self.len - - -class BoringModel(pl.LightningModule): - - def __init__(self): - """ - Testing PL Module - - Use as follows: - - subclass - - modify the behavior for what you want - - class TestModel(BaseTestModel): - def training_step(...): - # do your own thing - - or: - - model = BaseTestModel() - model.training_epoch_end = None - - """ - super().__init__() - self.layer = torch.nn.Linear(32, 2) - - def forward(self, x): - return self.layer(x) - - def loss(self, batch, prediction): - # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls - return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - - def step(self, x): - x = self(x) - out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) - return out - - def training_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"loss": loss} - - def training_step_end(self, training_step_outputs): - return training_step_outputs - - def training_epoch_end(self, outputs) -> None: - torch.stack([x["loss"] for 
x in outputs]).mean() - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"x": loss} - - # def validation_epoch_end(self, outputs) -> None: - # torch.stack([x['x'] for x in outputs]).mean() - - def test_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"y": loss} - - def test_epoch_end(self, outputs) -> None: - torch.stack([x["y"] for x in outputs]).mean() - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) - return [optimizer], [lr_scheduler] - - def train_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - def val_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - def test_dataloader(self): - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - @property - def automatic_optimization(self): - return True - -if __name__ == '__main__': - pl.Trainer(gpus=[1,], max_epochs=20, amp_backend='native').fit(BoringModel(), torch.utils.data.DataLoader(RandomDataset(32, 500))) \ No newline at end of file From 96bc05d9d86d6c45a8a0b69525eb3494beaaa794 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Tue, 26 Jan 2021 18:14:01 +0100 Subject: [PATCH 125/157] add tpu accelerator and plugins --- pytorch_lightning/accelerators/tpu.py | 18 +- .../plugins/precision/tpu_bfloat.py | 8 + .../plugins/training_type/__init__.py | 22 ++- .../plugins/training_type/ddp_spawn.py | 2 +- .../plugins/training_type/parallel.py | 4 +- .../plugins/training_type/single_device.py | 6 +- .../plugins/training_type/single_tpu.py | 34 ++++ .../plugins/training_type/tpu_spawn.py | 184 ++++++++++++++++++ pytorch_lightning/trainer/trainer.py | 2 + 9 files changed, 260 insertions(+), 20 deletions(-) create mode 100644 pytorch_lightning/plugins/precision/tpu_bfloat.py create mode 100644 pytorch_lightning/plugins/training_type/single_tpu.py create mode 100644 pytorch_lightning/plugins/training_type/tpu_spawn.py diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index bf922b1c2df8e..1fd6a4f565258 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -1,13 +1,17 @@ -# TODO: Complete the TPUAccelerator +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.plugins.training_type import SingleTPUPlugin, TPUSpawnPlugin +from pytorch_lightning.plugins.precision import MixedPrecisionPlugin class TPUAccelerator(Accelerator): def setup(self, trainer, model): - raise NotImplementedError + if isinstance(self.precision_plugin, MixedPrecisionPlugin): + raise MisconfigurationException( + "amp + tpu is not supported. " + "Only bfloats are supported on TPU. 
Consider using TPUHalfPrecisionPlugin" + ) - def on_train_start(self): - raise NotImplementedError - - def on_train_end(self): - raise NotImplementedError + if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)): + raise MisconfigurationException("TPUs only support a single tpu core or tpu spawn training.") + return super().setup(trainer, model) \ No newline at end of file diff --git a/pytorch_lightning/plugins/precision/tpu_bfloat.py b/pytorch_lightning/plugins/precision/tpu_bfloat.py new file mode 100644 index 0000000000000..852d2eee6dfc3 --- /dev/null +++ b/pytorch_lightning/plugins/precision/tpu_bfloat.py @@ -0,0 +1,8 @@ +import os +import torch +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin + +class TPUHalfPrecisionPlugin(PrecisionPlugin): + def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): + os.environ['XLA_USE_BF16'] = str(1) + return super().connect(model=model, optimizers=optimizers, lr_schedulers=lr_schedulers) \ No newline at end of file diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py index 7109594600a04..7c31c253eb0eb 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -1,10 +1,12 @@ -from pytorch_lightning.plugins .training_type.ddp import DDPPlugin -from pytorch_lightning.plugins .training_type.ddp2 import DDP2Plugin -from pytorch_lightning.plugins .training_type.ddp_spawn import DDPSpawnPlugin -from pytorch_lightning.plugins .training_type.dp import DataParallelPlugin -from pytorch_lightning.plugins .training_type.horovod import HorovodPlugin -from pytorch_lightning.plugins .training_type.parallel import ParallelPlugin -from pytorch_lightning.plugins .training_type.sharded import DDPShardedPlugin -from pytorch_lightning.plugins .training_type.sharded_spawn import DDPSpawnShardedPlugin -from pytorch_lightning.plugins .training_type.single_device import SingleDevicePlugin -from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin +from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin +from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin +from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin +from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin +from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 80886d2555c21..95371b48356b6 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -33,7 +33,7 @@ def __init__( parallel_devices, num_nodes=1, cluster_environment: ClusterEnvironment = None, - sync_batchnorm=False, + sync_batchnorm: bool = False, **kwargs: Dict[str, Any], 
): super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment) diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 8bc692b97b3ee..3235f6cef041c 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -90,4 +90,6 @@ def block_backward_sync(self): if isinstance(self.model, LightningDistributedDataParallel): yield self.model.no_sync() else: - yield None \ No newline at end of file + yield None + + \ No newline at end of file diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index c83d9685c428c..de4193ae3d2fd 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -8,6 +8,10 @@ def __init__(self, device): super().__init__() self.device: torch.device = device + @property + def on_tpu(self): + return self.device.type == 'xla' + @property def on_gpu(self): return self.device.type == "cuda" and torch.cuda.is_available() @@ -38,4 +42,4 @@ def barrier(self, *args, **kwargs): pass def broadcast(self, obj: object, src: int = 0) -> object: - return obj \ No newline at end of file + return obj diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py new file mode 100644 index 0000000000000..ace3405463af3 --- /dev/null +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -0,0 +1,34 @@ +import io +from typing import Optional +import torch +from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin +from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn + +if _TPU_AVAILABLE: + import torch_xla + import torch_xla.core.xla_model as xm + + +class SingleTPUPlugin(SingleDevicePlugin): + def __init__(self, device): + super().__init__(device) + + self.tpu_local_core_rank = 0 + self.tpu_global_core_rank = 0 + + def barrier(self, name: Optional[str] = None): + torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") + + def pre_training(self): + if isinstance(self.device, int): + self.device = xm.xla_device(self.device) + + self.tpu_local_core_rank = xm.get_local_ordinal() + self.tpu_global_core_rank = xm.get_ordinal() + + def post_training(self): + model = self.lightning_module + + if self.on_colab_kaggle: + rank_zero_warn("cleaning up... 
please do not interrupt") + self.save_spawn_weights(model) diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py new file mode 100644 index 0000000000000..6476c07587e66 --- /dev/null +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -0,0 +1,184 @@ +import io +import os +from pytorch_lightning.core.lightning import LightningModule +import torch +from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.seed import seed_everything +from typing import Any, Dict, Optional +from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn, + +from pytorch_lightning.utilities.apply_func import move_data_to_device + +if _TPU_AVAILABLE: + import torch_xla + import torch_xla.core.xla_model as xm + import torch_xla.distributed.parallel_loader as xla_pl + import torch_xla.distributed.xla_multiprocessing as xmp + +class TPUSpawnPlugin(DDPSpawnPlugin): + def __init__(self, parallel_devices, num_nodes=1, **kwargs: Dict[str, Any]): + + parallel_devices = [xm.xla_device(device) if isinstance(device, int) else device for device in parallel_devices] + super().__init__(parallel_devices, num_nodes=num_nodes, cluster_environment=None, sync_batchnorm=False, **kwargs) + self.tpu_local_core_rank = 0 + self.start_method = None + + @property + def distributed_sampler_kwargs(self): + return dict(num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) + + def process_dataloader(self, dataloader): + device = xm.xla_device(self.trainer.tpu_id) + dataloader = xla_pl.ParallelLoader(dataloader, [device]) + dataloader = dataloader.per_device_loader(device) + return dataloader + + def configure_ddp(self): + pass + + def init_ddp_connection(self, global_rank: int, world_size: int) -> None: + pass + + def set_world_ranks(self, process_idx): + self.tpu_local_core_rank = xm.get_local_ordinal() + self.tpu_global_core_rank = xm.get_ordinal() + self.global_rank = self.tpu_local_core_rank + self.world_size = self.num_nodes * self.num_processes + + def new_process(self, process_idx, trainer): + seed = os.environ.get("PL_GLOBAL_SEED") + if seed is not None: + seed_everything(int(seed)) + + self.set_world_ranks(process_idx) + + # set warning rank + rank_zero_only.rank = self.global_rank + + if self.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None: + trainer.progress_bar_callback.disable() + + self.model_to_device() + self.barrier() + + if trainer.testing: + results = trainer.run_test() + else: + results = trainer.train() + + self.__save_end_of_training_weights(self.lightning_module) + self.transfer_distrib_spawn_state_on_fit_end(results) + + def __save_end_of_training_weights(self, model: LightningModule, trainer): + # when training ends on these platforms dump weights to get out of the main process + if self.on_colab_kaggle: + rank_zero_warn("cleaning up... 
please do not interrupt") + self.save_spawn_weights(model) + + def model_to_device(self): + pass + + def barrier(self, name: Optional[str] = None): + torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") + + def on_save(self, checkpoint): + """ + Move XLA tensors to CPU before saving + Recommended on XLA Guide: + https://github.com/pytorch/xla/blob/master/API_GUIDE.md#saving-and-loading-xla-tensors + """ + return move_data_to_device(checkpoint, torch.device("cpu")) + + @property + def on_colab_kaggle(self) -> bool: + return bool(os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE')) + + def broadcast(self, obj, src=0): + buffer = io.BytesIO() + torch.save(obj, buffer) + data = bytearray(buffer.getbuffer()) + data_tensor = torch.tensor(data).to(xm.xla_device(), dtype=torch.float) + data = xm.all_gather(data_tensor) + buffer = io.BytesIO(data.cpu().byte().numpy()) + obj = torch.load(buffer) + return obj + + def load_spawn_weights(self, original_model): + """ + Load the temp weights saved in the process + To recover the trained model from the ddp process we load the saved weights + """ + + loaded_model = original_model + + if self.is_global_zero: + # load weights saved in ddp + path = os.path.join(original_model.trainer.default_root_dir, "__temp_weight_distributed_end.ckpt") + loaded_model = original_model.__class__.load_from_checkpoint(path) + + # copy loaded weights to old model + original_model.load_state_dict(loaded_model.state_dict()) + + # remove ddp weights + os.remove(path) + + return loaded_model + + def save_spawn_weights(self, model): + """ + Dump a temporary checkpoint after ddp ends to get weights out of the process + """ + if model.trainer.is_global_zero: + path = os.path.join(model.trainer.default_root_dir, "__temp_weight_distributed_end.ckpt") + model.trainer.save_checkpoint(path) + return path + + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) + stop = xm.mesh_reduce('stop_signal', should_stop, sum) + torch_xla.core.xla_model.rendezvous("pl.EarlyStoppingCallback.stop_distributed_training_check") + should_stop = int(stop.item()) == self.world_size + return should_stop + + def post_training(self): + # TODO: Check if trainer references can be resolved otherwise + model = self.lightning_module + + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() + + # transfer back the best path to the trainer + if self.lightning_module.trainer.checkpoint_callback is not None: + self.lightning_module.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also bets score + + # load last weights + if last_path and not self.lightning_module.trainer.testing: + ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + self.lightning_module = model + + # when training completes, load the weights back in main process + self.__load_weights_on_main_process() + + def __load_weights_on_main_process(self): + model = self.lightning_module + + # load weights if not interrupted + # TODO: check for trainer reference + if self.on_colab_kaggle and not model.trainer.testing: + self.load_spawn_weights(model) + + self.lightning_module = model + + def start_training(self, trainer): + xmp.spawn(self.new_process, args=(self.lightning_module, trainer, self.mp_queue), + nproc=len(self.parallel_devices), start_method=self.start_method) + + def start_testing(self, 
trainer): + xmp.spawn(self.new_process, args=(self.lightning_module, trainer, self.mp_queue), + nproc=len(self.parallel_devices), start_method=self.start_method) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 96f4eaf430101..fe075c5c95783 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -513,6 +513,7 @@ def fit( # TODO: the old setup is now called "pre_training", where should this hook be called now? self.call_hook("on_before_accelerator_backend_setup", model) self.training_type_plugin.pre_training() + self.precision_plugin.pre_training() self.call_setup_hook(self.lightning_module) @@ -522,6 +523,7 @@ def fit( else: self.training_type_plugin.start_training(self) + self.precision_plugin.post_training() self.training_type_plugin.post_training() self.accelerator_backend.teardown() results = self.training_type_plugin.results From 9e46624370e30f1842d1aa4d381fdc931adaaf5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 30 Jan 2021 15:39:46 +0100 Subject: [PATCH 126/157] fixes --- pytorch_lightning/plugins/training_type/tpu_spawn.py | 2 +- pytorch_lightning/trainer/trainer.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 6476c07587e66..5de336b16870b 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -6,7 +6,7 @@ from pytorch_lightning.utilities.seed import seed_everything from typing import Any, Dict, Optional from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin -from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn, +from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import move_data_to_device diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 56ae98d3665b7..5344a98fdb73f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -29,7 +29,7 @@ from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary -from pytorch_lightning.core.step_result import EvalResult, Result +from pytorch_lightning.core.step_result import Result from pytorch_lightning.loggers import LightningLoggerBase from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector from pytorch_lightning.profiler import BaseProfiler @@ -308,7 +308,7 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.plugin_connector = PluginConnector(self, plugins) + self.plugin_connector = PluginConnector(self) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -419,7 +419,7 @@ def __init__( # last thing are the plugins which override whatever the trainer used by default # TODO: probably not needed anymore after refactor - self.plugin_connector.on_trainer_init() + self.plugin_connector.on_trainer_init(plugins) # Callback system self.on_init_end() From e174b8dd8b8081cfac242508371d22f719ec9fe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:08:56 +0100 Subject: [PATCH 127/157] fix lightning optimizer merge --- 
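Note on this patch (kept outside the commit message): the hunks below drop the per-plugin and per-accelerator calls to convert_to_lightning_optimizers(), so wrapping optimizers into LightningOptimizer happens in one place on the trainer. A minimal, hypothetical sketch of that wrapping, written as a free function for illustration only (after this patch the real method lives on the trainer and takes no arguments besides self):

    from pytorch_lightning.core.optimizer import LightningOptimizer

    def convert_to_lightning_optimizers(trainer):
        # wrap each plain torch.optim.Optimizer exactly once; optimizers that are
        # already LightningOptimizer instances are passed through untouched
        trainer.optimizers = [
            opt if isinstance(opt, LightningOptimizer) else LightningOptimizer(opt)
            for opt in trainer.optimizers
        ]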
pytorch_lightning/accelerators/accelerator.py | 1 - pytorch_lightning/plugins/__init__.py | 5 +++-- pytorch_lightning/plugins/training_type/horovod.py | 3 --- pytorch_lightning/plugins/training_type/sharded.py | 2 -- pytorch_lightning/plugins/training_type/sharded_spawn.py | 3 --- pytorch_lightning/trainer/optimizers.py | 2 +- pytorch_lightning/trainer/training_loop.py | 6 +----- tests/models/test_hooks.py | 1 + tests/models/test_horovod.py | 2 +- 9 files changed, 7 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index c5c77d4711e6a..8dabd4ed7cf75 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -66,7 +66,6 @@ def setup(self, trainer: "Trainer", model: LightningModule) -> None: self.connect_training_type_plugin(self.training_type_plugin, model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) - self.optimizers = trainer.convert_to_lightning_optimizers(self.optimizers) @property def model(self) -> torch.nn.Module: diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index a17d5127edfc6..ffb3b76157e98 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -1,3 +1,4 @@ from pytorch_lightning.plugins.base_plugin import Plugin # noqa: F401 -from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401 +from pytorch_lightning.plugins.precision import * +from pytorch_lightning.plugins.training_type import * + diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index a8bd0091eef6d..434eb2f09c1db 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -86,9 +86,6 @@ def _filter_named_parameters(model, optimizer): ) for optimizer in optimizers ] - optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) - self.lightning_module.trainer.optimizers = optimizers - def start_training(self, trainer): with ExitStack() as stack: for optimizer in trainer.optimizers: diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index fb24f8c73315d..16570492a0dc8 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -32,8 +32,6 @@ def _reinit_optimizers_with_oss(self): ) optimizers[x] = zero_optimizer del optimizer - trainer = self.lightning_module.trainer - trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) def _wrap_optimizers(self): trainer = self.model.trainer diff --git a/pytorch_lightning/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py index c1020457e3bec..503e78e13618c 100644 --- a/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -32,9 +32,6 @@ def _reinit_optimizers_with_oss(self): ) optimizers[x] = zero_optimizer del optimizer - trainer = self.lightning_module.trainer - trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) - def _wrap_optimizers(self): trainer = self.model.trainer diff --git a/pytorch_lightning/trainer/optimizers.py 
b/pytorch_lightning/trainer/optimizers.py index 996cfc607f825..20438f427d315 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -81,7 +81,7 @@ def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]: return optimizers, lr_schedulers, optimizer_frequencies - def convert_to_lightning_optimizers(self, optimizers): + def convert_to_lightning_optimizers(self): def _convert_to_lightning_optimizer(trainer, optimizer): if not isinstance(optimizer, LightningOptimizer): optimizer = LightningOptimizer(optimizer) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 05977c1fc3b86..695741ed3cd22 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -449,11 +449,7 @@ def _process_result(self, training_step_output, split_batch): return training_step_output_for_epoch_end def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure): - with self.trainer.profiler.profile("optimizer_step"): - # optimizer step lightningModule hook - self.trainer.accelerator_backend.optimizer_step( - optimizer, self.trainer.current_epoch, batch_idx, opt_idx, train_step_and_backward_closure - ) + model_ref = self.trainer.get_model() is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) using_native_amp = self.trainer.amp_backend == AMPType.NATIVE diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index ab47dd0d1517f..227716d5e72c4 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect +import os from unittest import mock from unittest.mock import MagicMock diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 7337ee1200420..429ad108f1fc6 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,7 +26,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator +from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE From 98660def76119fee5c7826530cf73a066977b7f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:10:30 +0100 Subject: [PATCH 128/157] reset bugreportmodel --- pl_examples/bug_report_model.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index df82ea0c835da..4d9a23f48ca5d 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -56,23 +56,24 @@ class BoringModel(LightningModule): def __init__(self): """ Testing PL Module + Use as follows: - subclass - modify the behavior for what you want + class TestModel(BaseTestModel): def training_step(...): # do your own thing + or: + model = BaseTestModel() model.training_epoch_end = None + """ super().__init__() self.layer = torch.nn.Linear(32, 2) - @property - def automatic_optimization(self): - return True - def forward(self, x): return self.layer(x) @@ -81,7 +82,7 @@ def loss(self, batch, prediction): return 
torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) def step(self, x): - x = self(x) + x = self.layer(x) out = torch.nn.functional.mse_loss(x, torch.ones_like(x)) return out From 4d95b6ce5309c2374c33931e38c0d60a2ae372b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:20:38 +0100 Subject: [PATCH 129/157] unwrapping --- pytorch_lightning/overrides/data_parallel.py | 4 ++-- pytorch_lightning/plugins/training_type/ddp.py | 15 ++++++--------- .../plugins/training_type/ddp_spawn.py | 15 ++++++--------- pytorch_lightning/plugins/training_type/dp.py | 5 +++-- 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index d45040562152a..2c38d8e03b3ee 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -27,9 +27,9 @@ def unwrap_lightning_module(wrapped_model): model = wrapped_model - if isinstance(model, (LightningDistributedDataParallel, LightningDataParallel)): + if isinstance(model, (DistributedDataParallel, DataParallel)): model = model.module - if isinstance(model, LightningDistributedModule): + if isinstance(model, _LightningModuleWrapperBase): model = model.module return model diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 1128756780518..ed3cabd1b4fcc 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -20,11 +20,13 @@ import numpy as np import torch import torch.distributed as torch_distrib +from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning import _logger as log from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.data_parallel import unwrap_lightning_module from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import ( @@ -77,10 +79,7 @@ def root_device(self): @property def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - # fixme: uncomment when this class will actually be used - # return unwrap_lightning_module(self._model) - pass + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): @@ -184,10 +183,8 @@ def set_world_ranks(self): self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self._model = LightningDistributedDataParallel( - self.model, + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, ) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 9745fd5dee9f5..5b585fd1b1c43 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -18,11 +18,13 @@ import torch import 
torch.distributed as torch_distrib import torch.multiprocessing as mp +from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning import _logger as log from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.data_parallel import unwrap_lightning_module from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -63,10 +65,7 @@ def root_device(self): @property def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - # fixme: uncomment when this class will actually be used - # return unwrap_lightning_module(self._model) - pass + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): @@ -155,10 +154,8 @@ def post_training(self): self.__recover_child_process_weights(best_path, last_path) def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( - self.model, + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, ) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index ce33da87048cc..363a54e53750a 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -14,9 +14,10 @@ from typing import List import torch +from torch.nn import DataParallel from pytorch_lightning.core.step_result import Result -from pytorch_lightning.overrides.data_parallel import LightningDataParallel +from pytorch_lightning.overrides import LightningParallelModule from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -26,7 +27,7 @@ def __init__(self, parallel_devices: List[torch.device]): super().__init__(parallel_devices=parallel_devices, cluster_environment=None) def setup(self, model): - self._model = LightningDataParallel(model, self.parallel_devices) + self._model = DataParallel(LightningParallelModule(model), self.parallel_devices) def reduce(self, output, *args, **kwargs): if isinstance(output, Result): From b69d0133f815ef90ac16ac073bd2feabc2cd6a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:36:39 +0100 Subject: [PATCH 130/157] step routing forward --- pytorch_lightning/accelerators/accelerator.py | 6 +++--- pytorch_lightning/plugins/training_type/ddp.py | 9 +++++++++ pytorch_lightning/plugins/training_type/ddp_spawn.py | 9 +++++++++ pytorch_lightning/plugins/training_type/dp.py | 10 ++++++++++ pytorch_lightning/plugins/training_type/sharded.py | 9 +++++++++ .../plugins/training_type/sharded_spawn.py | 9 +++++++++ .../plugins/training_type/training_type_plugin.py | 9 +++++++++ 7 files changed, 58 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8dabd4ed7cf75..47b8f03c600d4 100644 --- a/pytorch_lightning/accelerators/accelerator.py 
+++ b/pytorch_lightning/accelerators/accelerator.py @@ -133,7 +133,7 @@ def training_step(self, args): with self.precision_plugin.train_step_context(): with self.training_type_plugin.train_step_context(): - return self.lightning_module.training_step(*args) + return self.training_type_plugin.training_step(*args) def validation_step(self, args): """The actual validation step. @@ -152,7 +152,7 @@ def validation_step(self, args): with self.precision_plugin.val_step_context(): with self.training_type_plugin.val_step_context(): - return self.lightning_module.validation_step(*args) + return self.training_type_plugin.validation_step(*args) def test_step(self, args): """The actual test step. @@ -171,7 +171,7 @@ def test_step(self, args): with self.precision_plugin.test_step_context(): with self.training_type_plugin.test_step_context(): - return self.lightning_module.test_step(*args) + return self.training_type_plugin.test_step(*args) def training_step_end(self, output): """A hook to do something at the end of the training step diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ed3cabd1b4fcc..1ee9f8d58089e 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -267,3 +267,12 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) return output + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 5b585fd1b1c43..cb5e4e0cabba5 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -223,3 +223,12 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) return output + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 363a54e53750a..f1fcdbe02831d 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -58,3 +58,13 @@ def broadcast(self, obj: object, src: int = 0) -> object: def reduce_early_stopping_decision(self, should_stop: bool) -> bool: return should_stop + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index 16570492a0dc8..115b1fb0676dc 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -52,3 +52,12 @@ def _optim_state_dict(self, optimizer): :meth:`consolidate_state_dict`. 
""" return optimizer.state_dict() + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py index 503e78e13618c..8be72f2e52d24 100644 --- a/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -54,3 +54,12 @@ def _optim_state_dict(self, optimizer): :meth:`consolidate_state_dict`. """ return optimizer.state_dict() + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index d1e7907d5d97f..78c14d153e576 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -110,3 +110,12 @@ def start_training(self, trainer: "Trainer") -> None: def start_testing(self, trainer: "Trainer") -> None: # double dispatch to initiate the test loop self._results = trainer.run_test() + + def training_step(self, *args, **kwargs): + return self.lightning_module.training_step(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.lightning_module.validation_step(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.lightning_module.test_step(*args, **kwargs) From cb6676d4710101e7951f436eac91dcb0a3eb611b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:48:17 +0100 Subject: [PATCH 131/157] model access --- pytorch_lightning/plugins/training_type/dp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index f1fcdbe02831d..fc08080399441 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -44,7 +44,7 @@ def root_device(self): @property def lightning_module(self): - return self._model.module + return getattr(self._model, "module", None) def model_to_device(self): # no need to do anything when model is wrapped in torch.nn.DataParallel From a33d27fc6809a4c44b74914fd3d3b9992643493e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:49:31 +0100 Subject: [PATCH 132/157] unwrap --- pytorch_lightning/plugins/training_type/dp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index fc08080399441..cc4b3e2584efc 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -18,6 +18,7 @@ from pytorch_lightning.core.step_result import Result from pytorch_lightning.overrides import LightningParallelModule +from pytorch_lightning.overrides.data_parallel import unwrap_lightning_module from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -44,7 +45,7 @@ def root_device(self): @property def lightning_module(self): - return getattr(self._model, "module", None) + 
return unwrap_lightning_module(self.model) def model_to_device(self): # no need to do anything when model is wrapped in torch.nn.DataParallel From f7486e2384e8b139dd67ae827d33814bc099948b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 31 Jan 2021 16:53:37 +0100 Subject: [PATCH 133/157] opt --- pytorch_lightning/plugins/training_type/horovod.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 434eb2f09c1db..f45c3dcb93bb6 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -85,6 +85,7 @@ def _filter_named_parameters(model, optimizer): optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer) ) for optimizer in optimizers ] + self.lightning_module.trainer.accelerator.optimizers = optimizers def start_training(self, trainer): with ExitStack() as stack: From 3792b72bb714286726c57f094fbd79c9296624a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 00:15:18 +0100 Subject: [PATCH 134/157] integrate distrib_type --- .../accelerators/accelerator_connector.py | 156 +++++++++--------- pytorch_lightning/plugins/training_type/dp.py | 1 - pytorch_lightning/trainer/properties.py | 12 +- pytorch_lightning/trainer/trainer.py | 2 - 4 files changed, 89 insertions(+), 82 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index baf14c4146aed..d0ed8878c1917 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -27,7 +27,8 @@ from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus -from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser +from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser, DeviceType, \ + DistributedType, _TPU_AVAILABLE from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -65,13 +66,9 @@ def __init__( amp_level, cluster_environment, ): - # initialization - self.use_dp = False - self.use_ddp = False - self.use_ddp2 = False - self.use_horovod = False - self.use_single_gpu = False + self._device_type = DeviceType.CPU + self._distrib_type = None self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) @@ -137,6 +134,10 @@ def __init__( self.replace_sampler_ddp = replace_sampler_ddp + @property + def on_cpu(self): + return self._device_type == DeviceType.CPU + @property def on_tpu(self): return self.tpu_cores is not None @@ -153,6 +154,22 @@ def on_gpu(self): gpus = self.parallel_device_ids return gpus is not None and len(gpus) > 0 and torch.cuda.is_available() + @property + def use_dp(self): + return self._distrib_type == DistributedType.DP + + @property + def use_ddp(self): + return self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) + + @property + def use_ddp2(self): + return self._distrib_type == DistributedType.DDP2 + + @property + def 
use_horovod(self): + return self._distrib_type == DistributedType.HOROVOD + @property def num_gpus(self) -> int: gpus = self.parallel_device_ids @@ -220,8 +237,8 @@ def select_training_type_plugin(self): elif self.use_ddp: use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic - use_ddp_spawn = self.use_ddp and self.distributed_backend == "ddp_spawn" - use_ddp_cpu_spawn = self.use_ddp and self.distributed_backend == "ddp_cpu" + use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN + use_ddp_cpu_spawn = self.use_ddp and self.on_cpu use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks use_ddp_sharded = self.distributed_backend == "ddp_sharded" @@ -288,96 +305,85 @@ def select_cluster_environment(self): return env def set_distributed_mode(self): - # No distributed backend + if self.distributed_backend is None: - # horovod multi GPU if self.has_horovodrun(): self._set_horovod_backend() - - # DDP CPU - elif self.num_gpus == 0: - if self.num_nodes > 1 or self.num_processes > 1: - self.use_ddp = True - - # Single GPU - elif self.num_gpus == 1: - self.use_single_gpu = True - - # Default: DDP-Spawn + elif self.num_gpus == 0 and (self.num_nodes > 1 or self.num_processes > 1): + self._distrib_type = DistributedType.DDP elif self.num_gpus > 1: rank_zero_warn( - "You requested multiple GPUs but did not specify a backend, e.g." - ' (distributed_backend="dp"|"ddp"|"ddp2").' - ' Setting distributed_backend="ddp_spawn" for you.' + 'You requested multiple GPUs but did not specify a backend, e.g.' + ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.' ) self.distributed_backend = "ddp_spawn" - # DP - if self.distributed_backend == "dp": - # do nothing if num_gpus == 0 - if self.num_gpus == 1: - self.use_single_gpu = True - self.use_dp = True - elif self.num_gpus > 1: - self.use_dp = True - - # DDP, DDP-Spawn - elif self.distributed_backend in ("ddp", "ddp_spawn"): - if self.num_gpus == 0: - # DDP CPU - if self.num_nodes > 1 or self.num_processes > 1: - self.use_ddp = True - - # DDP Single GPU - elif self.num_gpus == 1: - self.use_single_gpu = True - self.use_ddp = True - - # DDP Multi GPU - elif self.num_gpus > 1: - self.use_ddp = True - self.num_processes = self.num_gpus - - # DDP2 - elif self.distributed_backend == "ddp2": - # do nothing if num_gpus == 0 - if self.num_gpus >= 1: - self.use_ddp2 = True - - # DDP CPU - elif self.distributed_backend == "ddp_cpu": + # special case with DDP on CPUs + if self.distributed_backend == "ddp_cpu": + self._distrib_type = DistributedType.DDP + self.data_parallel_device_ids = None if self.num_gpus > 0: rank_zero_warn( - "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." + 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - self.parallel_device_ids = None - self.use_ddp = True + if self.num_processes is None: + # define the max CPU available + self.num_processes = os.cpu_count() + # special case with TPUs + elif self.distributed_backend == 'tpu': + self._device_type = DeviceType.TPU + # set all other requested distrib. 
types and if it was not set in the + elif self.distributed_backend and self._distrib_type is None: + self._distrib_type = DistributedType(self.distributed_backend) + + # unless you request explicitly for CPU and some GPU are available use them + _on_cpu = self.distributed_backend and 'cpu' in self.distributed_backend + if (self.num_gpus > 0 and not _on_cpu): + self._device_type = DeviceType.GPU + + _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) + # DP and DDP2 cannot run without GPU + if (self.num_gpus == 0 and self._distrib_type in _distrib_types): + rank_zero_warn( + 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' + ) + # todo: in some cases it yields in comparison None and int + if ((self.num_nodes and self.num_nodes > 1) + or (self.num_processes and self.num_processes > 1)): + self._distrib_type = DistributedType.DDP + else: + rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') + self._distrib_type = None - # Sharded DDP - elif self.distributed_backend in ("ddp_sharded", "ddp_sharded_spawn"): - self.use_ddp = True + # for DDP overwrite nb processes by requested GPUs + if (self._device_type == DeviceType.GPU + and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)): + self.num_processes = self.num_gpus - # HOROVOD - elif self.distributed_backend == "horovod": + # Horovod si an extra case... + if self.distributed_backend == "horovod": self._set_horovod_backend() # throw error to force user ddp or ddp2 choice - if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp): + _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) + if (self.num_nodes > 1 and self._distrib_type not in _ddp): raise MisconfigurationException( - "DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. " - "To silence this warning set distributed_backend=ddp or distributed_backend=ddp2" + 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' + 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`' ) - rank_zero_info(f"GPU available: {torch.cuda.is_available()}, used: {self.on_gpu}") + rank_zero_info( + f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}' + ) num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f"TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores") + rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') if torch.cuda.is_available() and not self.on_gpu: - rank_zero_warn("GPU available but not used. Set the --gpus flag when calling the script.") + rank_zero_warn('GPU available but not used.
Set the --gpus flag when calling the script.') def _set_horovod_backend(self): self.check_horovod() - self.use_horovod = True + self._distrib_type = DistributedType.HOROVOD # Initialize Horovod to get rank / size info hvd.init() diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index cc4b3e2584efc..d16a25c52e6bc 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -68,4 +68,3 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.model(*args, **kwargs) - diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index db58e5a4815a0..81777530723fe 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -133,16 +133,20 @@ def use_ddp2(self): def use_horovod(self): return self.accelerator_connector.use_horovod - @property - def use_single_gpu(self): - return self.accelerator_connector.use_single_gpu - @property def use_tpu(self): # TODO update this, what is the difference between use_tpu and on_tpu? return False # return self.accelerator_connector.use_tpu + @property + def _distrib_type(self): + return self.accelerator_connector._distrib_type + + @property + def _device_type(self): + return self.accelerator_connector._device_type + @property def num_nodes(self): return self.accelerator_connector.num_nodes diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5344a98fdb73f..c404adadd8117 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -296,8 +296,6 @@ def __init__( reload when reaching the minimum length of datasets. """ super().__init__() - self._device_type = DeviceType.CPU - self._distrib_type = None self._running_stage = None self._predicting = False From ef85b812b3a3352390e0366ce6f9a9c11c969c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 00:54:38 +0100 Subject: [PATCH 135/157] sync changes --- .../accelerators/accelerator_connector.py | 149 +++++++++++------- 1 file changed, 91 insertions(+), 58 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d0ed8878c1917..94f98e1f65521 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -20,21 +20,43 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, \ - PrecisionPlugin, ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.plugins import SingleDevicePlugin, DDPPlugin, DDPSpawnPlugin, \ - DataParallelPlugin, DDP2Plugin, HorovodPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin +from pytorch_lightning.accelerators.tpu import TPUAccelerator from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment +from pytorch_lightning.plugins import ( + ApexMixedPrecisionPlugin, + DataParallelPlugin, + DDP2Plugin, + DDPPlugin, + DDPShardedPlugin, + DDPSpawnPlugin, + DDPSpawnShardedPlugin, + HorovodPlugin, + NativeMixedPrecisionPlugin, + PrecisionPlugin, + 
ShardedNativeMixedPrecisionPlugin, + SingleDevicePlugin, + SingleTPUPlugin, + TPUHalfPrecisionPlugin, + TPUSpawnPlugin, +) from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus -from pytorch_lightning.utilities import AMPType, _NATIVE_AMP_AVAILABLE, _APEX_AVAILABLE, device_parser, DeviceType, \ - DistributedType, _TPU_AVAILABLE -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info +from pytorch_lightning.utilities import ( + _APEX_AVAILABLE, + _NATIVE_AMP_AVAILABLE, + _TPU_AVAILABLE, + AMPType, + device_parser, + DeviceType, + DistributedType, + rank_zero_only, +) +from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException try: import torch_xla + import torch_xla.core.xla_model as xm except ImportError: XLA_AVAILABLE = False else: @@ -49,22 +71,23 @@ class BackendConnector(object): + def __init__( - self, - num_processes, - tpu_cores, - distributed_backend, - auto_select_gpus, - gpus, - num_nodes, - sync_batchnorm, - benchmark, - replace_sampler_ddp, - deterministic, - precision, - amp_type, - amp_level, - cluster_environment, + self, + num_processes, + tpu_cores, + distributed_backend, + auto_select_gpus, + gpus, + num_nodes, + sync_batchnorm, + benchmark, + replace_sampler_ddp, + deterministic, + precision, + amp_type, + amp_level, + cluster_environment, ): # initialization self._device_type = DeviceType.CPU @@ -182,14 +205,14 @@ def parallel_devices(self): if self.on_gpu: devices = [torch.device("cuda", i) for i in self.parallel_device_ids] elif self.on_tpu: - raise NotImplementedError + devices = [xm.xla_device(i) for i in self.parallel_device_ids] else: devices = [torch.device("cpu")] * self.num_processes return devices @property def is_using_torchelastic(self): - te_flags_passed = 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ) + te_flags_passed = "WORLD_SIZE" in os.environ and ("GROUP_RANK" in os.environ or "NODE_RANK" in os.environ) return te_flags_passed def select_precision_plugin(self): @@ -198,42 +221,46 @@ def select_precision_plugin(self): return PrecisionPlugin() elif self.precision == 16: - if self.amp_type == 'native': + if self.on_tpu: + return TPUHalfPrecisionPlugin() + + if self.amp_type == "native": if not _NATIVE_AMP_AVAILABLE: - rank_zero_warn('You have asked for native AMP but your PyTorch version does not support it.' - ' Consider upgrading with `pip install torch>=1.6`.' - ' We will attempt to use NVIDIA Apex for this session.') - self.amp_type = 'apex' + rank_zero_warn( + "You have asked for native AMP but your PyTorch version does not support it." + " Consider upgrading with `pip install torch>=1.6`." + " We will attempt to use NVIDIA Apex for this session." + ) + self.amp_type = "apex" else: - log.info('Using native 16bit precision.') - if self.distributed_backend == 'ddp_sharded' or self.distributed_backend == 'ddp_sharded_spawn': + log.info("Using native 16bit precision.") + if self.distributed_backend == "ddp_sharded" or self.distributed_backend == "ddp_sharded_spawn": return ShardedNativeMixedPrecisionPlugin() self.amp_type = AMPType.NATIVE return NativeMixedPrecisionPlugin() - if self.amp_type == 'apex': + if self.amp_type == "apex": if not _APEX_AVAILABLE: - rank_zero_warn('You have asked for Apex AMP but you have not installed it yet.' 
- ' Install apex first using this guide: https://github.com/NVIDIA/apex#linux') + rank_zero_warn( + "You have asked for Apex AMP but you have not installed it yet." + " Install apex first using this guide: https://github.com/NVIDIA/apex#linux" + ) else: - if self.distributed_backend == 'ddp_sharded' or self.distributed_backend == 'ddp_sharded_spawn': + if self.distributed_backend == "ddp_sharded" or self.distributed_backend == "ddp_sharded_spawn": raise MisconfigurationException( - 'Sharded Plugin is not supported with Apex AMP, ' - 'please using native AMP for 16-bit precision.' + "Sharded Plugin is not supported with Apex AMP, " + "please using native AMP for 16-bit precision." ) - log.info('Using APEX 16bit precision.') + log.info("Using APEX 16bit precision.") self.amp_type = AMPType.APEX return ApexMixedPrecisionPlugin(self.amp_level) else: - raise NotImplementedError('We only support precisions 32 and 16!') + raise NotImplementedError("We only support precisions 32 and 16!") def select_training_type_plugin(self): cluster_environment = self.select_cluster_environment() if self.use_ddp2: - plugin = DDP2Plugin( - parallel_devices=self.parallel_devices, - cluster_environment=cluster_environment - ) + plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=cluster_environment) elif self.use_ddp: use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic @@ -244,9 +271,12 @@ def select_training_type_plugin(self): use_ddp_sharded = self.distributed_backend == "ddp_sharded" use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" + if self.on_tpu: + ddp_plugin_cls = TPUSpawnPlugin + # ddp script mode uses the same flags as TE # TODO: decouple from TE - if os.environ.get('PL_IN_DDP_SUBPROCESS', False): + if os.environ.get("PL_IN_DDP_SUBPROCESS", False): use_torchelastic_ddp = False if use_ddp_sharded: @@ -270,6 +300,8 @@ def select_training_type_plugin(self): plugin = DataParallelPlugin(parallel_devices=self.parallel_devices) elif self.use_horovod: plugin = HorovodPlugin(parallel_devices=self.parallel_devices) + elif self.on_tpu: + plugin = SingleTPUPlugin(self.tpu_id) else: plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) return plugin @@ -281,6 +313,8 @@ def select_accelerator(self): if self.on_gpu: acc_cls = GPUAccelerator + elif self.on_tpu: + acc_cls = TPUAccelerator else: acc_cls = CPUAccelerator @@ -348,16 +382,17 @@ def set_distributed_mode(self): 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' ) # todo: in some cases it yield in comarison None and int - if ((self.num_nodes and self.num_nodes > 1) - or (self.num_processes and self.num_processes > 1)): + if ((self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1)): self._distrib_type = DistributedType.DDP else: rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') self._distrib_type = None # for DDP overwrite nb processes by requested GPUs - if (self._device_type == DeviceType.GPU - and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)): + if ( + self._device_type == DeviceType.GPU + and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) + ): self.num_processes = self.num_gpus # Horovod si an extra case... 
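Note between hunks (not part of the patch): these accelerator_connector.py changes keep replacing the old boolean flags with a single DistributedType / DeviceType pair derived once from the accelerator string. A small self-contained sketch of that pattern, using stand-in enums rather than the actual Lightning classes:

    from enum import Enum

    class DistributedType(str, Enum):
        DP = "dp"
        DDP = "ddp"
        DDP_SPAWN = "ddp_spawn"
        DDP2 = "ddp2"
        HOROVOD = "horovod"

    class DeviceType(str, Enum):
        CPU = "cpu"
        GPU = "gpu"
        TPU = "tpu"

    def resolve(accelerator, num_gpus):
        # parse the requested backend once; later checks become enum comparisons,
        # e.g. `distrib in (DistributedType.DDP, DistributedType.DDP_SPAWN)`
        distrib = DistributedType(accelerator) if accelerator else None
        device = DeviceType.GPU if num_gpus > 0 else DeviceType.CPU
        return distrib, device

    # resolve("ddp_spawn", 2) -> (<DistributedType.DDP_SPAWN: 'ddp_spawn'>, <DeviceType.GPU: 'gpu'>)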
@@ -372,14 +407,12 @@ def set_distributed_mode(self): 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`' ) - rank_zero_info( - f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}' - ) + rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}') num_cores = self.tpu_cores if self.tpu_cores is not None else 0 rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') if torch.cuda.is_available() and self._device_type != DeviceType.GPU: - rank_zero_warn('GPU available but not used. Set the --gpus flag when calling the script.') + rank_zero_warn("GPU available but not used. Set the --gpus flag when calling the script.") def _set_horovod_backend(self): self.check_horovod() @@ -421,7 +454,7 @@ def configure_slurm_ddp(self): num_requested_gpus = self.num_gpus * self.num_nodes num_slurm_tasks = 0 try: - num_slurm_tasks = int(os.environ['SLURM_NTASKS']) + num_slurm_tasks = int(os.environ["SLURM_NTASKS"]) self.is_slurm_managing_tasks = num_slurm_tasks == num_requested_gpus # enable slurm cpu @@ -429,8 +462,8 @@ def configure_slurm_ddp(self): self.is_slurm_managing_tasks = num_slurm_tasks == self.num_processes # in interactive mode we don't manage tasks - job_name = os.environ['SLURM_JOB_NAME'] - if job_name == 'bash': + job_name = os.environ["SLURM_JOB_NAME"] + if job_name == "bash": self.is_slurm_managing_tasks = False except Exception: @@ -439,7 +472,7 @@ def configure_slurm_ddp(self): # used for tests only, set this flag to simulate slurm managing a task try: - should_fake = int(os.environ['FAKE_SLURM_MANAGING_TASKS']) + should_fake = int(os.environ["FAKE_SLURM_MANAGING_TASKS"]) if should_fake: self.is_slurm_managing_tasks = True except Exception: @@ -447,4 +480,4 @@ def configure_slurm_ddp(self): # notify user the that slurm is managing tasks if self.is_slurm_managing_tasks: - rank_zero_info('Multi-processing is handled by Slurm.') + rank_zero_info("Multi-processing is handled by Slurm.") From 9d9a9409ae836b9f9413914d8a1072cebc7d9025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 01:35:53 +0100 Subject: [PATCH 136/157] sync --- pytorch_lightning/accelerators/accelerator_connector.py | 6 ++---- pytorch_lightning/trainer/connectors/env_vars_connector.py | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 94f98e1f65521..589843064bd3b 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -44,7 +44,6 @@ from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, - _TPU_AVAILABLE, AMPType, device_parser, DeviceType, @@ -55,7 +54,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException try: - import torch_xla import torch_xla.core.xla_model as xm except ImportError: XLA_AVAILABLE = False @@ -395,7 +393,7 @@ def set_distributed_mode(self): ): self.num_processes = self.num_gpus - # Horovod si an extra case... + # Horovod is an extra case... 
if self.distributed_backend == "horovod": self._set_horovod_backend() @@ -409,7 +407,7 @@ def set_distributed_mode(self): rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}') num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') + rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores') if torch.cuda.is_available() and self._device_type != DeviceType.GPU: rank_zero_warn("GPU available but not used. Set the --gpus flag when calling the script.") diff --git a/pytorch_lightning/trainer/connectors/env_vars_connector.py b/pytorch_lightning/trainer/connectors/env_vars_connector.py index 6b907d288c5ca..e4d5670b5fe78 100644 --- a/pytorch_lightning/trainer/connectors/env_vars_connector.py +++ b/pytorch_lightning/trainer/connectors/env_vars_connector.py @@ -28,7 +28,6 @@ def overwrite_by_env_vars(fn: Callable) -> Callable: def overwrite_by_env_vars(self, *args, **kwargs): # get the class cls = self.__class__ - if args: # inace any args passed move them to kwargs # parse only the argument names cls_arg_names = [arg[0] for arg in get_init_arguments_and_types(cls)] From a190a565619f3beb72735e0ce19db78beb138409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 14:48:44 +0100 Subject: [PATCH 137/157] fixes --- pytorch_lightning/plugins/__init__.py | 1 + tests/models/test_amp.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 9f748996a707d..1a8b5090a346b 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -4,6 +4,7 @@ from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.tpu_bfloat import TPUHalfPrecisionPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.ddp import DDPPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin # noqa: F401 from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin # noqa: F401 diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index f9c502bf3ce7e..94bfd6808ed79 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -20,8 +20,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.cluster_environments import SLURMEnvironment -from pytorch_lightning.loggers import WandbLogger +from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException From 73bb60787383907ee5aa87985debf74ff70051e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 14:51:52 +0100 Subject: [PATCH 138/157] add forgotten generators --- pytorch_lightning/plugins/base_plugin.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/plugins/base_plugin.py b/pytorch_lightning/plugins/base_plugin.py index c325518e4c8ff..b316a8663f9ff 100644 --- 
a/pytorch_lightning/plugins/base_plugin.py +++ b/pytorch_lightning/plugins/base_plugin.py @@ -43,11 +43,14 @@ def post_training(self) -> None: @contextlib.contextmanager def train_step_context(self) -> Generator: """A contextmanager for the trainstep""" + yield @contextlib.contextmanager def val_step_context(self) -> Generator: """A contextmanager for the validation step""" + yield @contextlib.contextmanager def test_step_context(self) -> Generator: """A contextmanager for the teststep""" + yield From ae71997dac15d560a45c07d1bf891f9409c9d777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 15:27:04 +0100 Subject: [PATCH 139/157] add missing logic --- pytorch_lightning/accelerators/accelerator.py | 7 +++---- pytorch_lightning/overrides/data_parallel.py | 9 ++++++++ pytorch_lightning/plugins/__init__.py | 1 + pytorch_lightning/plugins/base_plugin.py | 3 +++ .../plugins/training_type/__init__.py | 11 ++++++++++ .../plugins/training_type/ddp.py | 21 ++++++++++++------- .../plugins/training_type/ddp_spawn.py | 21 ++++++++++++------- .../plugins/training_type/horovod.py | 4 +--- .../training_type/training_type_plugin.py | 9 ++++++++ 9 files changed, 63 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 576c8279376ea..e26dc8b476ab2 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -73,7 +73,6 @@ def setup(self, trainer: "Trainer", model: LightningModule) -> None: self.connect_training_type_plugin(self.training_type_plugin, model) self.setup_optimizers(trainer, model) self.connect_precision_plugin(self.precision_plugin) - self.optimizers = trainer.convert_to_lightning_optimizers(self.optimizers) @property def model(self) -> torch.nn.Module: @@ -141,7 +140,7 @@ def training_step(self, args): with self.precision_plugin.train_step_context(): with self.training_type_plugin.train_step_context(): - return self.lightning_module.training_step(*args) + return self.training_type_plugin.training_step(*args) def validation_step(self, args): """The actual validation step. @@ -160,7 +159,7 @@ def validation_step(self, args): with self.precision_plugin.val_step_context(): with self.training_type_plugin.val_step_context(): - return self.lightning_module.validation_step(*args) + return self.training_type_plugin.validation_step(*args) def test_step(self, args): """The actual test step. 
@@ -179,7 +178,7 @@ def test_step(self, args): with self.precision_plugin.test_step_context(): with self.training_type_plugin.test_step_context(): - return self.lightning_module.test_step(*args) + return self.training_type_plugin.test_step(*args) def training_step_end(self, output): """A hook to do something at the end of the training step diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index b027502f99e8a..28840cd51faf6 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -25,6 +25,15 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection +def unwrap_lightning_module(wrapped_model) -> LightningModule: + model = wrapped_model + if isinstance(model, (DistributedDataParallel, DataParallel)): + model = model.module + if isinstance(model, _LightningModuleWrapperBase): + model = model.module + return model + + class LightningDataParallel(DataParallel): def __init__(self, module: LightningModule, *args, **kwargs): diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 9f748996a707d..0990b547907e7 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -9,6 +9,7 @@ from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 diff --git a/pytorch_lightning/plugins/base_plugin.py b/pytorch_lightning/plugins/base_plugin.py index c325518e4c8ff..b316a8663f9ff 100644 --- a/pytorch_lightning/plugins/base_plugin.py +++ b/pytorch_lightning/plugins/base_plugin.py @@ -43,11 +43,14 @@ def post_training(self) -> None: @contextlib.contextmanager def train_step_context(self) -> Generator: """A contextmanager for the trainstep""" + yield @contextlib.contextmanager def val_step_context(self) -> Generator: """A contextmanager for the validation step""" + yield @contextlib.contextmanager def test_step_context(self) -> Generator: """A contextmanager for the teststep""" + yield diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py index 329f6347b17c3..21dec5bc5ccda 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -1 +1,12 @@ +from pytorch_lightning.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin +from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin +from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin +from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin +from pytorch_lightning.plugins.training_type.single_device import 
SingleDevicePlugin +from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin +from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index f1027efb418ba..c133e0e68bc93 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -20,6 +20,7 @@ import numpy as np import torch import torch.distributed as torch_distrib +from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed @@ -77,10 +78,7 @@ def root_device(self): @property def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - # fixme: uncomment when this class will actually be used - # return unwrap_lightning_module(self._model) - pass + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): @@ -184,10 +182,8 @@ def set_world_ranks(self): self.world_size = self.num_nodes * self.num_processes def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self._model = LightningDistributedDataParallel( - self.model, + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, ) @@ -270,3 +266,12 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) return output + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 99fd2d5ea3c61..fd4fc9219196a 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -18,6 +18,7 @@ import torch import torch.distributed as torch_distrib import torch.multiprocessing as mp +from torch.nn.parallel.distributed import DistributedDataParallel from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed @@ -63,10 +64,7 @@ def root_device(self): @property def lightning_module(self): - # the model may not be wrapped with DistributedDataParallel if calling this too early - # fixme: uncomment when this class will actually be used - # return unwrap_lightning_module(self._model) - pass + return unwrap_lightning_module(self._model) @property def distributed_sampler_kwargs(self): @@ -155,10 +153,8 @@ def post_training(self): self.__recover_child_process_weights(best_path, last_path) def configure_ddp(self): - # if unset, default `find_unused_parameters` `True` - self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - self.model = LightningDistributedDataParallel( - self.model, + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), 
**self._ddp_kwargs, ) @@ -226,3 +222,12 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) return output + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index a8bd0091eef6d..f45c3dcb93bb6 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -85,9 +85,7 @@ def _filter_named_parameters(model, optimizer): optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer) ) for optimizer in optimizers ] - - optimizers = self.lightning_module.trainer.convert_to_lightning_optimizers(optimizers) - self.lightning_module.trainer.optimizers = optimizers + self.lightning_module.trainer.accelerator.optimizers = optimizers def start_training(self, trainer): with ExitStack() as stack: diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 5dbbf23881373..89f2329512e5e 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -114,3 +114,12 @@ def start_training(self, trainer: 'Trainer') -> None: def start_testing(self, trainer: 'Trainer') -> None: # double dispatch to initiate the test loop self._results = trainer.run_test() + + def training_step(self, *args, **kwargs): + return self.lightning_module.training_step(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.lightning_module.validation_step(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.lightning_module.test_step(*args, **kwargs) From 0e686c315048c282d7bdb1d579506515e1921da4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 15:43:55 +0100 Subject: [PATCH 140/157] update --- pytorch_lightning/plugins/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 1a8b5090a346b..6a70ee62c9722 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -13,6 +13,8 @@ from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401 __all__ = [ From d6a43eab8685cc4fd3583ddddb03d81d4b50494a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 15:55:55 +0100 Subject: [PATCH 141/157] import --- pytorch_lightning/plugins/training_type/ddp.py | 2 +- pytorch_lightning/plugins/training_type/ddp_spawn.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index c133e0e68bc93..84b70662c1f48 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -24,7 +24,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index fd4fc9219196a..45640524e1d99 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -22,7 +22,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save From ceb8f75dc05a6d98206cc0e4ac84d5afee2f5669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 15:59:57 +0100 Subject: [PATCH 142/157] missed imports --- pytorch_lightning/plugins/training_type/ddp.py | 3 ++- pytorch_lightning/plugins/training_type/ddp_spawn.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index c133e0e68bc93..ffbebe8178697 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -24,7 +24,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index fd4fc9219196a..425becfbb8d9d 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -22,7 +22,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module from 
pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save From fbb7c20e86df44574c1316169dfcf98c20933e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:02:05 +0100 Subject: [PATCH 143/157] import fixes --- pytorch_lightning/plugins/training_type/rpc.py | 2 +- pytorch_lightning/plugins/training_type/rpc_sequential.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index 3bd0ba913d0b1..5b48f0e9d02e9 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -17,7 +17,7 @@ import torch -from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/rpc_sequential.py b/pytorch_lightning/plugins/training_type/rpc_sequential.py index 9bf2f6dbc77c3..4ab6cc22e3760 100644 --- a/pytorch_lightning/plugins/training_type/rpc_sequential.py +++ b/pytorch_lightning/plugins/training_type/rpc_sequential.py @@ -21,7 +21,7 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import LightningModule -from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.training_type.rpc import DEFAULT_RPC_TIMEOUT_SEC, RPCPlugin from pytorch_lightning.trainer.states import RunningStage From b61099905309d30cc7b017b5a64fea0ea8fa7982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:05:00 +0100 Subject: [PATCH 144/157] isort --- pytorch_lightning/plugins/training_type/rpc.py | 2 +- pytorch_lightning/plugins/training_type/rpc_sequential.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index 5b48f0e9d02e9..4aff83189b6bc 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -17,8 +17,8 @@ import torch -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/rpc_sequential.py b/pytorch_lightning/plugins/training_type/rpc_sequential.py index 4ab6cc22e3760..baff4289c75a1 100644 --- a/pytorch_lightning/plugins/training_type/rpc_sequential.py +++ b/pytorch_lightning/plugins/training_type/rpc_sequential.py @@ -21,8 +21,8 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import LightningModule -from 
pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.rpc import DEFAULT_RPC_TIMEOUT_SEC, RPCPlugin from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only From 9b799247dd372488458952dfa03dc25f72ac8ce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:07:46 +0100 Subject: [PATCH 145/157] mv f --- pytorch_lightning/overrides/base.py | 11 +++++++++++ pytorch_lightning/overrides/data_parallel.py | 9 --------- pytorch_lightning/plugins/training_type/ddp.py | 3 ++- pytorch_lightning/plugins/training_type/ddp_spawn.py | 3 ++- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/overrides/base.py b/pytorch_lightning/overrides/base.py index b2ad5b7d710fe..3dd20f6d4303b 100644 --- a/pytorch_lightning/overrides/base.py +++ b/pytorch_lightning/overrides/base.py @@ -14,6 +14,8 @@ from typing import Any import torch +from torch.nn import DataParallel +from torch.nn.parallel import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.trainer.states import RunningStage @@ -61,3 +63,12 @@ def warn_if_output_is_none(output: Any, method_name: str) -> None: """ Warns user about which method returned None. """ if output is None: warning_cache.warn(f'Your {method_name} returned None. Did you forget to return an output?') + + +def unwrap_lightning_module(wrapped_model) -> LightningModule: + model = wrapped_model + if isinstance(model, (DistributedDataParallel, DataParallel)): + model = model.module + if isinstance(model, _LightningModuleWrapperBase): + model = model.module + return model diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 28840cd51faf6..b027502f99e8a 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -25,15 +25,6 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection -def unwrap_lightning_module(wrapped_model) -> LightningModule: - model = wrapped_model - if isinstance(model, (DistributedDataParallel, DataParallel)): - model = model.module - if isinstance(model, _LightningModuleWrapperBase): - model = model.module - return model - - class LightningDataParallel(DataParallel): def __init__(self, module: LightningModule, *args, **kwargs): diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ffbebe8178697..28872f882ab8c 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -25,7 +25,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module +from pytorch_lightning.overrides.base import unwrap_lightning_module +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin 
from pytorch_lightning.utilities import _HYDRA_AVAILABLE diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 425becfbb8d9d..5e6b251e0c373 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -23,7 +23,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, unwrap_lightning_module +from pytorch_lightning.overrides.base import unwrap_lightning_module +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save From 9afe54de9fe98bfa34dc725ed36685ddd18c4acc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:27:29 +0100 Subject: [PATCH 146/157] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc381b3983753..997ec482855ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -112,6 +112,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Added parallel plugins for DP, DDP, DDPSpawn, DDP2 and Horovod ([#5714](https://github.com/PyTorchLightning/pytorch-lightning/pull/5714)) * Added new Accelerators for CPU, GPU and TPU ([#5719](https://github.com/PyTorchLightning/pytorch-lightning/pull/5719)) * Added Plugins for TPU training ([#5719](https://github.com/PyTorchLightning/pytorch-lightning/pull/5719)) + * Added RPC and Sharded plugins ([#5732](https://github.com/PyTorchLightning/pytorch-lightning/pull/5732)) + * Added missing `LightningModule`-wrapper logic to new plugins and accelerator ([#5734](https://github.com/PyTorchLightning/pytorch-lightning/pull/5734)) ### Deprecated From ca8cb6822cff10be8376069d4972bf95bfae5916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 16:28:22 +0100 Subject: [PATCH 147/157] format --- tests/core/test_lightning_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 17d25b6c9b75a..4d36027709900 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest.mock import patch, Mock +from unittest.mock import Mock, patch import pytest from torch.optim import Adam, SGD From 06337451bc976565976e12ad8c8a8b0b86506bb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 17:06:19 +0100 Subject: [PATCH 148/157] move helper to parallel plugin --- pytorch_lightning/plugins/training_type/ddp.py | 6 ------ pytorch_lightning/plugins/training_type/ddp_spawn.py | 6 ------ pytorch_lightning/plugins/training_type/dp.py | 9 +++------ pytorch_lightning/plugins/training_type/parallel.py | 5 +++++ 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 28872f882ab8c..bb906a2268d62 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -25,8 +25,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.overrides.base import unwrap_lightning_module -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HYDRA_AVAILABLE @@ -78,10 +76,6 @@ def __init__( def root_device(self): return self.parallel_devices[self.local_rank] - @property - def lightning_module(self): - return unwrap_lightning_module(self._model) - @property def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 5e6b251e0c373..6f251eb36985a 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -23,8 +23,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.overrides.base import unwrap_lightning_module -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.cloud_io import atomic_save @@ -64,10 +62,6 @@ def __init__( def root_device(self): return self.parallel_devices[self.local_rank] - @property - def lightning_module(self): - return unwrap_lightning_module(self._model) - @property def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index ce33da87048cc..4f35b8b37ea08 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -14,9 +14,10 @@ from typing import List import torch +from torch.nn import DataParallel from pytorch_lightning.core.step_result import Result -from pytorch_lightning.overrides.data_parallel import LightningDataParallel +from pytorch_lightning.overrides.data_parallel import 
LightningParallelModule from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -26,7 +27,7 @@ def __init__(self, parallel_devices: List[torch.device]): super().__init__(parallel_devices=parallel_devices, cluster_environment=None) def setup(self, model): - self._model = LightningDataParallel(model, self.parallel_devices) + self._model = DataParallel(LightningParallelModule(model), self.parallel_devices) def reduce(self, output, *args, **kwargs): if isinstance(output, Result): @@ -41,10 +42,6 @@ def reduce(self, output, *args, **kwargs): def root_device(self): return self.parallel_devices[0] - @property - def lightning_module(self): - return self._model.module - def model_to_device(self): # no need to do anything when model is wrapped in torch.nn.DataParallel pass diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index af4c2e254be56..91d44fbdaa5d1 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -18,6 +18,7 @@ import torch from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin @@ -46,6 +47,10 @@ def root_device(self): def on_gpu(self): return self.root_device.type == "cuda" and torch.cuda.is_available() + @property + def lightning_module(self): + return unwrap_lightning_module(self._model) + @abstractmethod def setup(self, model): raise NotImplementedError From a622e0b6ce3fcfe6f64c282da0b850874b9bc93c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 17:07:41 +0100 Subject: [PATCH 149/157] d --- pytorch_lightning/plugins/training_type/dp.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 4f35b8b37ea08..2bf4bbc0b4a96 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -54,3 +54,12 @@ def broadcast(self, obj: object, src: int = 0) -> object: def reduce_early_stopping_decision(self, should_stop: bool) -> bool: return should_stop + + def training_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.model(*args, **kwargs) From f2758034d3ee9b07f7219025772b98a96dd56b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 19:01:31 +0100 Subject: [PATCH 150/157] add world size --- pytorch_lightning/plugins/training_type/horovod.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index f45c3dcb93bb6..335f65b3e3fbb 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -45,6 +45,7 @@ def setup(self, model): self.global_rank = hvd.rank() self.local_rank = hvd.local_rank() + self.world_size = hvd.size() rank_zero_only.rank = self.global_rank self.model_to_device() From 4ae008bf7e0b0e6e4ea93f6c3a8cf6ffffcb478e Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 19:20:40 +0100 Subject: [PATCH 151/157] clean up --- pytorch_lightning/trainer/trainer.py | 36 ---------------------------- 1 file changed, 36 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c404adadd8117..9565db4ddf2bc 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -422,42 +422,6 @@ def __init__( # Callback system self.on_init_end() - @property - def optimizers(self): - return self.accelerator_backend.optimizers - - @optimizers.setter - def optimizers(self, new_optims): - self.accelerator_backend.optimizers = new_optims - - @property - def lr_schedulers(self): - return self.accelerator_backend.lr_schedulers - - @lr_schedulers.setter - def lr_schedulers(self, new_schedulers): - self.accelerator_backend.lr_schedulers = new_schedulers - - @property - def optimizer_frequencies(self): - return self.accelerator_backend.optimizer_frequencies - - @optimizer_frequencies.setter - def optimizer_frequencies(self, new_freqs): - self.accelerator_backend.optimizer_frequencies = new_freqs - - @property - def amp_backend(self): - return self.accelerator_backend.amp_backend - - @property - def precision(self): - return self.accelerator_backend.precision - - @property - def scaler(self): - return self.accelerator_backend.scaler - def fit( self, model: LightningModule, From d4c63086472622a5b9a00cb47dacf3c61814a543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 1 Feb 2021 19:26:18 +0100 Subject: [PATCH 152/157] duplicate --- pytorch_lightning/overrides/data_parallel.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 1d29cbf8081f6..b027502f99e8a 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -25,15 +25,6 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection -def unwrap_lightning_module(wrapped_model): - model = wrapped_model - if isinstance(model, (DistributedDataParallel, DataParallel)): - model = model.module - if isinstance(model, _LightningModuleWrapperBase): - model = model.module - return model - - class LightningDataParallel(DataParallel): def __init__(self, module: LightningModule, *args, **kwargs): From 994916490e05c48ec595e309e2f336ddee9e834a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 05:40:18 +0100 Subject: [PATCH 153/157] activate ddp_sharded and tpu --- .../accelerators/accelerator_connector.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index b6c60bb1a7eee..6e3cc9d57b704 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -34,7 +34,7 @@ SingleDevicePlugin, SingleTPUPlugin, TPUHalfPrecisionPlugin, - TPUSpawnPlugin, + TPUSpawnPlugin, DDPShardedPlugin, DDPSpawnShardedPlugin, ) from pytorch_lightning.plugins.environments import SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus @@ -256,23 +256,21 @@ def select_training_type_plugin(self): use_ddp_cpu_spawn = self.use_ddp and self.on_cpu use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic 
use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks - # use_ddp_sharded = self.distributed_backend == "ddp_sharded" - # use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" + use_ddp_sharded = self.distributed_backend == "ddp_sharded" + use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" - if self.on_tpu: - ddp_plugin_cls = TPUSpawnPlugin - - # ddp script mode uses the same flags as TE # TODO: decouple from TE + # ddp script mode uses the same flags as TE if os.environ.get("PL_IN_DDP_SUBPROCESS", False): use_torchelastic_ddp = False - # fixme - # if use_ddp_sharded: - # ddp_plugin_cls = DDPShardedPlugin - # elif use_ddp_sharded_spawn: - # ddp_plugin_cls = DDPSpawnShardedPlugin - if use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: + if self.on_tpu: + ddp_plugin_cls = TPUSpawnPlugin + elif use_ddp_sharded: + ddp_plugin_cls = DDPShardedPlugin + elif use_ddp_sharded_spawn: + ddp_plugin_cls = DDPSpawnShardedPlugin + elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: ddp_plugin_cls = DDPPlugin elif use_ddp_spawn or use_ddp_cpu_spawn: ddp_plugin_cls = DDPSpawnPlugin From 6d47357b2f09a2e90184dc5a1ed1b7e0ad85ca9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 05:53:35 +0100 Subject: [PATCH 154/157] set nvidia flags --- .../accelerators/accelerator_connector.py | 3 --- pytorch_lightning/accelerators/gpu.py | 15 ++++++++++++++- .../plugins/training_type/training_type_plugin.py | 12 ------------ 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 6e3cc9d57b704..43cea74f36ffa 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -123,9 +123,6 @@ def __init__( self.interactive_ddp_procs = [] self.global_rank = 0 - # NVIDIA setup - # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) - # benchmarking # TODO: should this be moved to GPU accelerator? 
torch.backends.cudnn.benchmark = self.benchmark diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 833d5e1cb2a9a..f01cecac1615a 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -1,17 +1,22 @@ +import logging +import os + import torch from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException +log = logging.getLogger(__name__) + class GPUAccelerator(Accelerator): def setup(self, trainer, model): if "cuda" not in str(self.root_device): raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + self.set_nvidia_flags() torch.cuda.set_device(self.root_device) model.to(self.root_device) - return super().setup(trainer, model) def on_train_start(self): @@ -25,3 +30,11 @@ def on_train_end(self): # clean up memory with torch.cuda.device(self.root_device): torch.cuda.empty_cache() + + @staticmethod + def set_nvidia_flags(): + # set the correct cuda visible devices (using pci order) + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) + devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) + log.info(f"LOCAL_RANK: {os.getenv('LOCAL_RANK', 0)} - CUDA_VISIBLE_DEVICES: [{devices}]") diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 89f2329512e5e..bda5d161da33b 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -64,18 +64,6 @@ def barrier(self, name: Optional[str] = None) -> None: def broadcast(self, obj: object, src: int = 0) -> object: """Broadcasts an object to all processes""" - # TODO method this is currently unused. 
Check after complete refactors are pushed - def set_nvidia_flags(self, is_slurm_managing_tasks: bool, device_ids: Optional[Sequence]) -> None: - if device_ids is None: - return - - # set the correct cuda visible devices (using pci order) - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) - devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) - if self.lightning_module is not None: - log.info(f"LOCAL_RANK: {self.lightning_module.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: """Reduce the early stopping decision across all possibly spawned processes""" return should_stop From a6864ec795542965e9efa4415047090f5355d243 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 05:58:26 +0100 Subject: [PATCH 155/157] remove unused colab var --- pytorch_lightning/accelerators/accelerator_connector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 43cea74f36ffa..b86d78e7ee37f 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -135,9 +135,6 @@ def __init__( # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0) - # TODO: move this to TPU accelerator/plugin - self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE") - self.replace_sampler_ddp = replace_sampler_ddp @property From b4b9724c32bfd4f5d6e46653a3153912467b1f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 06:06:31 +0100 Subject: [PATCH 156/157] use_tpu <-> on_tpu attrs --- pytorch_lightning/accelerators/accelerator_connector.py | 1 - pytorch_lightning/trainer/properties.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index b86d78e7ee37f..01283f2aab14a 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -116,7 +116,6 @@ def __init__( # override dist backend when using tpus if self.on_tpu: self.distributed_backend = "tpu" - self.use_tpu = True # init flags for SLURM+DDP to work self.world_size = 1 diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 81777530723fe..39dcbc6c7c3e0 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -135,9 +135,7 @@ def use_horovod(self): @property def use_tpu(self): - # TODO update this, what is the difference between use_tpu and on_tpu? 
- return False - # return self.accelerator_connector.use_tpu + return self.accelerator_connector.on_tpu @property def _distrib_type(self): From 81001e3a3b1e43130a223193c1aa82d552eac02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Feb 2021 09:42:08 +0100 Subject: [PATCH 157/157] make some ddp_cpu and clusterplugin tests pass --- .../accelerators/accelerator_connector.py | 28 ++++++++++--------- .../plugins/legacy/plugin_connector.py | 27 +++++++++--------- pytorch_lightning/trainer/trainer.py | 4 +-- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 01283f2aab14a..1fa95ef4c13b5 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -319,6 +319,8 @@ def select_cluster_environment(self): return env def set_distributed_mode(self): + if isinstance(self.distributed_backend, Accelerator): + return if self.distributed_backend is None: if self.has_horovodrun(): @@ -346,27 +348,27 @@ def set_distributed_mode(self): # special case with TPUs elif self.distributed_backend == 'tpu': self._device_type = DeviceType.TPU - # set all other requested distrib. types adn if it was not set in the + # set all other requested distrib. types and if it was not set in the elif self.distributed_backend and self._distrib_type is None: self._distrib_type = DistributedType(self.distributed_backend) # unless you request explicitly for CPU and some GPU are available use them _on_cpu = self.distributed_backend and 'cpu' in self.distributed_backend - if (self.num_gpus > 0 and not _on_cpu): + if self.num_gpus > 0 and not _on_cpu: self._device_type = DeviceType.GPU - _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) + # _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) # DP and DDP2 cannot run without GPU - if (self.num_gpus == 0 and self._distrib_type in _distrib_types): - rank_zero_warn( - 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' - ) - # todo: in some cases it yield in comarison None and int - if ((self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1)): - self._distrib_type = DistributedType.DDP - else: - rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') - self._distrib_type = None + # if (self.num_gpus == 0 and self._distrib_type in _distrib_types): + # rank_zero_warn( + # 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' 
+ # ) + # # todo: in some cases it yield in comarison None and int + # if ((self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1)): + # self._distrib_type = DistributedType.DDP + # else: + # rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') + # self._distrib_type = None # for DDP overwrite nb processes by requested GPUs if ( diff --git a/pytorch_lightning/plugins/legacy/plugin_connector.py b/pytorch_lightning/plugins/legacy/plugin_connector.py index 22f97bf8b77f3..95ec73f7dd80e 100644 --- a/pytorch_lightning/plugins/legacy/plugin_connector.py +++ b/pytorch_lightning/plugins/legacy/plugin_connector.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. from enum import Enum -from typing import List, Optional, Union +from typing import List, Optional, Union, Sequence +from pytorch_lightning.plugins import Plugin from pytorch_lightning.plugins.environments import ClusterEnvironment from pytorch_lightning.plugins.legacy.apex import ApexPlugin from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin @@ -26,22 +27,22 @@ class PluginConnector: - def __init__(self, trainer): + def __init__(self, trainer, plugins: Optional[Union[str, list]] = None): self.trainer = trainer - self.plugins = [] - self.ddp_plugin = DDPPlugin() + self.plugins = plugins or [] self.cloud_environment = None - - def on_trainer_init(self, plugins: Optional[Union[str, list]]): - self.plugins = plugins - if self.plugins is None: - self.plugins = [] + # self.ddp_plugin = DDPPlugin() self.plugins = self._convert_str_custom_plugins(self.plugins) - self.plugins = self._append_required_plugins(self.plugins) - self.__attach_ddp() + + # TODO: plugin dependencies + # self.plugins = self._append_required_plugins(self.plugins) + self.__attach_cluster() - self.__attach_amp() - self.__attach_apex() + + # TODO: attach custom training type and precision plugins + # self.__attach_ddp() + # self.__attach_amp() + # self.__attach_apex() def __attach_amp(self): amp_plugin = self.__attach_plugin(NativeAMPPlugin) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 9565db4ddf2bc..5cdfa5021acb8 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -306,7 +306,7 @@ def __init__( self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.plugin_connector = PluginConnector(self) + self.plugin_connector = PluginConnector(self, plugins) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, @@ -417,7 +417,7 @@ def __init__( # last thing are the plugins which override whatever the trainer used by default # TODO: probably not needed anymore after refactor - self.plugin_connector.on_trainer_init(plugins) + # self.plugin_connector.on_trainer_init(plugins) # Callback system self.on_init_end()
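
Note on the "add forgotten generators" change above: contextlib.contextmanager only works when the decorated function is a generator, so the otherwise empty hook bodies need a `yield` to be usable in a `with` statement. Below is a minimal, self-contained Python sketch (independent of the Lightning classes touched by the patch) of the failure mode that the added `yield` prevents:

    import contextlib

    @contextlib.contextmanager
    def broken_context():
        """No yield: the function is not a generator, so entering the context fails."""

    @contextlib.contextmanager
    def working_context():
        """A no-op context manager, analogous to the base-plugin step contexts."""
        yield

    try:
        with broken_context():
            pass
    except TypeError as err:
        # contextlib calls next() on the value returned by the function,
        # which here is a plain None instead of a generator
        print(f"broken_context failed: {err}")

    with working_context():
        print("working_context ran its body")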