From f75f445d165492c23cc53a43c2a11d6c77713140 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 22 Feb 2021 17:10:21 +0000 Subject: [PATCH 01/60] Initial changes --- pytorch_lightning/accelerators/__init__.py | 1 + pytorch_lightning/accelerators/ipu.py | 32 ++++ .../plugins/training_type/ipu.py | 154 ++++++++++++++++++ .../connectors/accelerator_connector.py | 20 ++- pytorch_lightning/utilities/__init__.py | 1 + pytorch_lightning/utilities/enums.py | 1 + pytorch_lightning/utilities/imports.py | 1 + 7 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 pytorch_lightning/accelerators/ipu.py create mode 100644 pytorch_lightning/plugins/training_type/ipu.py diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index 05e15fe1f1767..2a460a27e373a 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -13,4 +13,5 @@ from pytorch_lightning.accelerators.accelerator import Accelerator # noqa F401 from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa F401 from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa F401 +from pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa F401 from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa F401 diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py new file mode 100644 index 0000000000000..8374bc1bc1554 --- /dev/null +++ b/pytorch_lightning/accelerators/ipu.py @@ -0,0 +1,32 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import Callable + +from torch.optim import Optimizer + +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class IPUAccelerator(Accelerator): + + def setup_optimizers(self, trainer): + super().setup_optimizers(trainer) + + if len(self.optimizers) > 1: + raise MisconfigurationException("IPUs currently only support one optimizer.") + + def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs): + # Optimizer step is handled by the IPU accelerator. 
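+ # The closure is still invoked so that Lightning's host-side training-step bookkeeping runs;
+ # the parameter update itself is applied on device by the compiled poptorch training
+ # executor, so no host-side optimizer.step() call is issued here.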
+ lambda_closure() diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py new file mode 100644 index 0000000000000..95d78fb8f815c --- /dev/null +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -0,0 +1,154 @@ +import json +import os +from typing import Any, Iterable, Optional, Union + +import torch +from torch.utils.data import DataLoader + +from pytorch_lightning import _logger as log +from pytorch_lightning import LightningModule +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase +from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.utilities import _POPTORCH_AVAILABLE +from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +if _POPTORCH_AVAILABLE: + import poptorch + + if not poptorch.ipuHardwareIsAvailable(): + raise MisconfigurationException("IPU Accelerator requires IPUs to run.") + +# todo: No idea what's happening with grad accumulation, need to check since IPUs handle grad accum. +# todo: or even lr scheduling... + + +class LightningIPUModule(_LightningModuleWrapperBase): + + def __init__(self, pl_module: LightningModule, precision: int): + super().__init__(pl_module) + self.precision = precision + + def forward(self, *inputs, **kwargs): + if self.precision == 16: + inputs = self._move_float_tensors_to_half(inputs) + + return super().forward(*inputs, **kwargs) + + @staticmethod + def batch_to(data): + return data.half() + + def _move_float_tensors_to_half(self, batch: Any): + batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) + return batch + + +class IPUPlugin(TrainingTypePlugin): + + def __init__( + self, + mixed_precision: bool, + half: bool = False, + device_iterations: int = 1, + replication_factor: int = 1, + autoround_num_ipus: bool = True, + autoreport: bool = True, + autoreport_dir: Optional[str] = None + ): + super().__init__() + self.half = half + self.mixed_precision = mixed_precision + self.device_iterations = device_iterations + self.replication_factor = replication_factor + self.autoround_num_ipus = autoround_num_ipus + self.autoreport = autoreport + self.autoreport_dir = autoreport_dir + + if self.autoreport: + options = {"autoReport.all": self.autoreport} + if self.autoreport_dir: + if not os.path.exists(self.autoreport_dir): + os.makedirs(self.autoreport_dir) + options["autoReport.directory"] = self.autoreport_dir + os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options) + + @property + def on_gpu(self) -> bool: + return False + + @property + def root_device(self) -> torch.device: + pass + + def model_to_device(self) -> None: + pass + + @property + def is_global_zero(self) -> bool: + return True + + def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Any]: + return tensor + + def barrier(self, name: Optional[str] = None) -> None: + pass + + def broadcast(self, obj: object, src: int = 0) -> object: + return object + + @property + def lightning_module(self) -> Optional[LightningModule]: + return self.model.module if isinstance(self.model, LightningIPUModule) else self.model + + def pre_dispatch(self) -> None: + if self.half: + log.info('Using 16bit precision, converting model to FP16.') + self.model = self.model.half() + precision = 16 if self.half or self.mixed_precision else 32 + + # Separate models are instantiated for 
different stages, but they share the same weights on host. + # When validation/test models are run, they sync weights first. + # Create model for training which will run training. + + optimizer = self.lightning_module.trainer.optimizers[0] + self.model = poptorch.trainingModel( + model=LightningIPUModule(self.lightning_module, precision), + options=self._create_opts(is_train_model=True), + optimizer=optimizer + ) + + # Create model for training which will run validation. + self.validation_model = LightningIPUModule(self.lightning_module, precision) + self.validation_model = poptorch.inferenceModel( + model=self.validation_model, + options=self._create_opts(is_train_model=False), + ) + + def _create_opts(self, is_train_model): + opts = poptorch.Options() + opts.deviceIterations(self.device_iterations) + opts.replicationFactor(self.replication_factor) + gradient_accumulation = self.lightning_module.trainer.accumulate_grad_batches if is_train_model else 1 + opts.Training.gradientAccumulation(gradient_accumulation) + opts.autoRoundNumIPUs(self.autoround_num_ipus) + return opts + + def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + dataloader = self._convert_to_poptorch_loader( + dataloader=dataloader, opts=self._create_opts(is_train_model=self.lightning_module.training) + ) + return dataloader + + def _convert_to_poptorch_loader(self, dataloader, opts): + skip_keys = ['dataset_kind'] + if dataloader.batch_size: + # re-create batch sampler in new poptorch loader + skip_keys += ['batch_sampler'] + + dl_args = {k: v for k, v in dataloader.__dict__.items() if not k.startswith('_') and k not in skip_keys} + dl_args["options"] = opts + multiprocessing_context = dataloader.multiprocessing_context + dataloader = poptorch.DataLoader(**dl_args) + dataloader.multiprocessing_context = multiprocessing_context + return dataloader diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7021081d6cc90..8308555307874 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -21,6 +21,7 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator from pytorch_lightning.plugins import ( ApexMixedPrecisionPlugin, @@ -43,6 +44,7 @@ TrainingTypePlugin, ) from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment +from pytorch_lightning.plugins.training_type.ipu import IPUPlugin from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import ( _APEX_AVAILABLE, @@ -229,6 +231,10 @@ def on_cpu(self) -> bool: def on_tpu(self) -> bool: return self.tpu_cores is not None + @property + def on_ipu(self) -> bool: + return self._device_type == DeviceType.IPU + @property def tpu_id(self) -> Optional[int]: if self.on_tpu and isinstance(self.tpu_cores, list): @@ -292,7 +298,9 @@ def parallel_devices(self) -> Union[List[torch.device], int]: @property def root_gpu(self) -> Optional[int]: - return self.accelerator.root_device.index if not isinstance(self.accelerator, TPUAccelerator) else None + return self.accelerator.root_device.index if not isinstance( 
+ self.accelerator, (IPUAccelerator, TPUAccelerator) + ) else None @property def is_using_torchelastic(self) -> bool: @@ -303,6 +311,9 @@ def select_precision_plugin(self) -> PrecisionPlugin: # set precision type self.amp_type = AMPType.from_str(self.amp_type) + if self._device_type == DeviceType.IPU: + return IPUPrecisionPlugin(self.precision) + if self._distrib_type == DistributedType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): return DeepSpeedPrecisionPlugin(self.precision) @@ -401,6 +412,8 @@ def select_training_type_plugin(self) -> TrainingTypePlugin: plugin = SingleTPUPlugin(self.tpu_id) else: plugin = TPUSpawnPlugin(parallel_devices=list(range(self.tpu_cores))) + elif self.on_ipu: + plugin = IPUPlugin(mixed_precision=self.precision == 32) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu")) @@ -436,6 +449,8 @@ def select_accelerator(self) -> Accelerator: acc_cls = GPUAccelerator elif self.on_tpu: acc_cls = TPUAccelerator + elif self.on_ipu: + acc_cls = IPUAccelerator else: acc_cls = CPUAccelerator @@ -496,6 +511,9 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): # special case with TPUs elif self.distributed_backend == 'tpu': self._device_type = DeviceType.TPU + # special case with IPUs + elif self.distributed_backend == 'ipu': + self._device_type = DeviceType.IPU elif self.distributed_backend and self._distrib_type is None: self._distrib_type = DistributedType(self.distributed_backend) diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index cf3aa06f305b8..9b25838d8ab41 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -36,6 +36,7 @@ _module_available, _NATIVE_AMP_AVAILABLE, _OMEGACONF_AVAILABLE, + _POPTORCH_AVAILABLE, _RPC_AVAILABLE, _TORCH_GREATER_EQUAL_1_6, _TORCH_GREATER_EQUAL_1_7, diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index 3e4add4fb68d1..ae03beaf4fb42 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -83,4 +83,5 @@ class DeviceType(LightningEnum): """ CPU = 'CPU' GPU = 'GPU' + IPU = 'IPU' TPU = 'TPU' diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 8024997382457..a3b88a3d13366 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -64,6 +64,7 @@ def _compare_version(package: str, op, version) -> bool: _HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental") _NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast") _OMEGACONF_AVAILABLE = _module_available("omegaconf") +_POPTORCH_AVAILABLE = _module_available('poptorch') _RPC_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.rpc') _TORCHTEXT_AVAILABLE = _module_available("torchtext") _TORCHVISION_AVAILABLE = _module_available('torchvision') From dc9744b00ceeb9c53054f2f1624963963f011289 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 24 Mar 2021 22:57:47 +0000 Subject: [PATCH 02/60] Add broken example for now --- pl_examples/ipu_examples/__init__.py | 0 pl_examples/ipu_examples/mnist.py | 118 ++++++++++++++++++ pytorch_lightning/plugins/__init__.py | 2 + .../plugins/precision/ipu_precision.py | 5 + 4 files changed, 125 insertions(+) create mode 100644 
pl_examples/ipu_examples/__init__.py create mode 100644 pl_examples/ipu_examples/mnist.py create mode 100644 pytorch_lightning/plugins/precision/ipu_precision.py diff --git a/pl_examples/ipu_examples/__init__.py b/pl_examples/ipu_examples/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py new file mode 100644 index 0000000000000..db125d5157057 --- /dev/null +++ b/pl_examples/ipu_examples/mnist.py @@ -0,0 +1,118 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser +from pprint import pprint + +import torch +import torch.nn as nn +from torch.nn import functional as F + +import pytorch_lightning as pl +from pl_examples import cli_lightning_logo +from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule +from pytorch_lightning.accelerators import IPUAccelerator + + +class Block(nn.Module): + + def __init__(self, in_channels, num_filters, kernel_size, pool_size): + super(Block, self).__init__() + self.conv = nn.Conv2d(in_channels=in_channels, out_channels=num_filters, kernel_size=kernel_size) + self.pool = nn.MaxPool2d(kernel_size=pool_size) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.pool(x) + x = self.relu(x) + return x + + +class LitClassifier(pl.LightningModule): + + def __init__(self, learning_rate=1e-3): + super().__init__() + self.save_hyperparameters() + + self.layer1 = Block(1, 32, 3, 2) + self.layer2 = Block(32, 64, 3, 2) + self.layer3 = nn.Linear(1600, 128) + self.layer3_act = nn.ReLU() + self.layer3_dropout = torch.nn.Dropout(0.5) + self.layer4 = nn.Linear(128, 10) + self.softmax = nn.Softmax(1) + + def forward(self, x): + x = self.layer1(x) + x = self.layer2(x) + # Flatten layer + x = x.view(-1, 1600) + x = self.layer3_act(self.layer3(x)) + x = self.layer4(self.layer3_dropout(x)) + x = self.softmax(x) + return x + + def training_step(self, batch): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def validation_step(self, batch): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def test_step(self, batch): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def configure_optimizers(self): + return torch.optim.AdamW(self.parameters(), lr=self.hparams.learning_rate) + + @staticmethod + def add_model_specific_args(parent_parser): + parser = ArgumentParser(parents=[parent_parser], add_help=False) + parser.add_argument('--learning_rate', type=float, default=0.0001) + return parser + + +def cli_main(): + parser = ArgumentParser() + parser = pl.Trainer.add_argparse_args(parser) + parser = LitClassifier.add_model_specific_args(parser) + parser = IPUAccelerator.add_argparse_args(parser) + parser = MNISTDataModule.add_argparse_args(parser) + args = parser.parse_args() + + dm = MNISTDataModule.from_argparse_args(args) + + model = LitClassifier(args.learning_rate) + + 
accelerator = IPUAccelerator.from_argparse_args(args) + trainer = pl.Trainer.from_argparse_args(args, accelerator=accelerator) + + trainer.fit(model, datamodule=dm) + + result = trainer.test(model, datamodule=dm) + pprint(result) + + +if __name__ == '__main__': + cli_lightning_logo() + cli_main() diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index a67235baa4767..66e80c1178d15 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -12,6 +12,7 @@ from pytorch_lightning.plugins.training_type.deepspeed import DeepSpeedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.ipu import IPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.rpc import RPCPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.rpc_sequential import RPCSequentialPlugin # noqa: F401 @@ -32,6 +33,7 @@ "DeepSpeedPrecisionPlugin", "DoublePrecisionPlugin", "HorovodPlugin", + "IPUPlugin", "NativeMixedPrecisionPlugin", "PrecisionPlugin", "ShardedNativeMixedPrecisionPlugin", diff --git a/pytorch_lightning/plugins/precision/ipu_precision.py b/pytorch_lightning/plugins/precision/ipu_precision.py new file mode 100644 index 0000000000000..744ac1bd5fb82 --- /dev/null +++ b/pytorch_lightning/plugins/precision/ipu_precision.py @@ -0,0 +1,5 @@ +from pytorch_lightning.plugins import PrecisionPlugin + + +class IPUPrecisionPlugin(PrecisionPlugin): + pass From 931bb74ad74d0b29d50937cf6fed1c930c0b07da Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 7 Apr 2021 23:27:24 +0100 Subject: [PATCH 03/60] Fix reference --- pytorch_lightning/plugins/__init__.py | 2 ++ pytorch_lightning/trainer/connectors/accelerator_connector.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 66e80c1178d15..a9e6fa1bce619 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -2,6 +2,7 @@ from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.deepspeed_precision import DeepSpeedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin # noqa: F401 +from pytorch_lightning.plugins.precision.ipu_precision import IPUPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401 @@ -34,6 +35,7 @@ "DoublePrecisionPlugin", "HorovodPlugin", "IPUPlugin", + "IPUPrecisionPlugin", "NativeMixedPrecisionPlugin", "PrecisionPlugin", "ShardedNativeMixedPrecisionPlugin", diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 808df52a91f10..61d7917dfec41 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -36,6 +36,7 @@ DoublePrecisionPlugin, HorovodPlugin, IPUPlugin, + 
IPUPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin, ShardedNativeMixedPrecisionPlugin, @@ -324,7 +325,7 @@ def select_precision_plugin(self) -> PrecisionPlugin: self.amp_type = AMPType.from_str(self.amp_type) if self._device_type == DeviceType.IPU: - return IPUPrecisionPlugin(self.precision) + return IPUPrecisionPlugin() if self._distrib_type == DistributedType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): return DeepSpeedPrecisionPlugin(self.precision) From c617f02abe15a553e3f5a8176ffac409661dbcf9 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 11 May 2021 12:50:02 +0100 Subject: [PATCH 04/60] Fix format --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 56670ecd68e93..3437d2b0bcff3 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -550,7 +550,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): if isinstance(self.tpu_cores, int): self._distrib_type = DistributedType.TPU_SPAWN elif self.distributed_backend == 'ipu': - self._device_type = DeviceType.IPU + self._device_type = DeviceType.IPU elif self.distributed_backend and self._distrib_type is None: self._distrib_type = DistributedType(self.distributed_backend) From 522a81fe75dfb6e5c3503a00d28585e662978421 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 11 May 2021 17:14:20 +0100 Subject: [PATCH 05/60] Code runs --- pl_examples/ipu_examples/mnist.py | 25 +--- .../plugins/precision/ipu_precision.py | 23 +++- .../plugins/training_type/ipu.py | 114 +++++++++++++----- .../connectors/accelerator_connector.py | 4 +- 4 files changed, 112 insertions(+), 54 deletions(-) diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index db125d5157057..2f084d41b4124 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -20,9 +20,7 @@ from torch.nn import functional as F import pytorch_lightning as pl -from pl_examples import cli_lightning_logo from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule -from pytorch_lightning.accelerators import IPUAccelerator class Block(nn.Module): @@ -64,20 +62,17 @@ def forward(self, x): x = self.softmax(x) return x - def training_step(self, batch): - x, y = batch + def training_step(self, x, y, batch_idx): y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss - def validation_step(self, batch): - x, y = batch + def validation_step(self, x, y): y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss - def test_step(self, batch): - x, y = batch + def test_step(self, x, y): y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss @@ -86,17 +81,15 @@ def configure_optimizers(self): return torch.optim.AdamW(self.parameters(), lr=self.hparams.learning_rate) @staticmethod - def add_model_specific_args(parent_parser): - parser = ArgumentParser(parents=[parent_parser], add_help=False) + def add_model_specific_args(parser): parser.add_argument('--learning_rate', type=float, default=0.0001) return parser -def cli_main(): +if __name__ == '__main__': parser = ArgumentParser() parser = pl.Trainer.add_argparse_args(parser) parser = LitClassifier.add_model_specific_args(parser) - parser = IPUAccelerator.add_argparse_args(parser) parser = 
MNISTDataModule.add_argparse_args(parser) args = parser.parse_args() @@ -104,15 +97,9 @@ def cli_main(): model = LitClassifier(args.learning_rate) - accelerator = IPUAccelerator.from_argparse_args(args) - trainer = pl.Trainer.from_argparse_args(args, accelerator=accelerator) + trainer = pl.Trainer.from_argparse_args(args, max_epochs=10, accelerator='ipu') trainer.fit(model, datamodule=dm) result = trainer.test(model, datamodule=dm) pprint(result) - - -if __name__ == '__main__': - cli_lightning_logo() - cli_main() diff --git a/pytorch_lightning/plugins/precision/ipu_precision.py b/pytorch_lightning/plugins/precision/ipu_precision.py index 744ac1bd5fb82..4e88a6cf73fe1 100644 --- a/pytorch_lightning/plugins/precision/ipu_precision.py +++ b/pytorch_lightning/plugins/precision/ipu_precision.py @@ -1,5 +1,24 @@ -from pytorch_lightning.plugins import PrecisionPlugin +from typing import Any + +from torch import Tensor + +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin class IPUPrecisionPlugin(PrecisionPlugin): - pass + + def __init__(self, precision: int) -> None: + super().__init__() + self.precision = precision + + def backward( + self, + closure_loss: Tensor, + *args: Any, + **kwargs: Any, + ) -> Tensor: + # IPU internally manages bwd step. + return closure_loss + + def clip_gradients(self, *args, **kwargs) -> None: + pass diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 95d78fb8f815c..385bc09edd6b8 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -1,14 +1,17 @@ +import inspect import json import os -from typing import Any, Iterable, Optional, Union +from typing import Any, Iterable, List, Optional, Union import torch from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning import LightningModule +from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase -from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import _POPTORCH_AVAILABLE from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -44,21 +47,21 @@ def _move_float_tensors_to_half(self, batch: Any): return batch -class IPUPlugin(TrainingTypePlugin): +class IPUPlugin(ParallelPlugin): def __init__( self, - mixed_precision: bool, half: bool = False, device_iterations: int = 1, replication_factor: int = 1, autoround_num_ipus: bool = True, autoreport: bool = True, - autoreport_dir: Optional[str] = None + autoreport_dir: Optional[str] = None, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, ): - super().__init__() + super().__init__(parallel_devices, cluster_environment) self.half = half - self.mixed_precision = mixed_precision self.device_iterations = device_iterations self.replication_factor = replication_factor self.autoround_num_ipus = autoround_num_ipus @@ -94,23 +97,25 @@ def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> def barrier(self, name: Optional[str] = 
None) -> None: pass + def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + return tensor + def broadcast(self, obj: object, src: int = 0) -> object: - return object + return obj @property def lightning_module(self) -> Optional[LightningModule]: - return self.model.module if isinstance(self.model, LightningIPUModule) else self.model + model = self.model.module if isinstance(self.model, poptorch.PoplarExecutor) else self.model + return model.module if isinstance(model, LightningIPUModule) else model def pre_dispatch(self) -> None: if self.half: log.info('Using 16bit precision, converting model to FP16.') self.model = self.model.half() - precision = 16 if self.half or self.mixed_precision else 32 + precision = self.lightning_module.trainer.accelerator.precision_plugin.precision + precision = 16 if self.half else precision - # Separate models are instantiated for different stages, but they share the same weights on host. - # When validation/test models are run, they sync weights first. # Create model for training which will run training. - optimizer = self.lightning_module.trainer.optimizers[0] self.model = poptorch.trainingModel( model=LightningIPUModule(self.lightning_module, precision), @@ -118,12 +123,16 @@ def pre_dispatch(self) -> None: optimizer=optimizer ) - # Create model for training which will run validation. - self.validation_model = LightningIPUModule(self.lightning_module, precision) - self.validation_model = poptorch.inferenceModel( - model=self.validation_model, - options=self._create_opts(is_train_model=False), - ) + # Separate models are instantiated for different stages, but they share the same weights on host. + # When validation/test models are run, they sync weights first. + + # todo: not sure this is the cleanest way to do this... 
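+ # poptorch compiles each wrapper lazily on its first call, so creating a stage-specific
+ # inference wrapper for each of val/test/predict up front is cheap; the compile cost is
+ # only paid the first time a given stage actually runs.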
+ self.inference_models = {} + for x in ('val', 'test', 'predict'): + self.inference_models[x] = poptorch.inferenceModel( + model=LightningIPUModule(self.lightning_module, precision), + options=self._create_opts(is_train_model=False), + ) def _create_opts(self, is_train_model): opts = poptorch.Options() @@ -135,20 +144,63 @@ def _create_opts(self, is_train_model): return opts def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: - dataloader = self._convert_to_poptorch_loader( - dataloader=dataloader, opts=self._create_opts(is_train_model=self.lightning_module.training) - ) + if isinstance(dataloader, CombinedLoader): + dataloader.loaders = apply_to_collection( + dataloader.loaders, + DataLoader, + self.process_dataloader, + ) + return dataloader + + if not isinstance(dataloader, poptorch.DataLoader): + dataloader = self._convert_to_poptorch_loader( + dataloader=dataloader, opts=self._create_opts(is_train_model=self.lightning_module.training) + ) return dataloader - def _convert_to_poptorch_loader(self, dataloader, opts): - skip_keys = ['dataset_kind'] - if dataloader.batch_size: - # re-create batch sampler in new poptorch loader - skip_keys += ['batch_sampler'] + def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], + opts: poptorch.Options) -> Union[Iterable, DataLoader]: + skip_keys = ('sampler', 'batch_sampler', 'dataset_kind') + + attrs = {k: v for k, v in vars(dataloader).items() if not k.startswith("_")} + + params = set(inspect.signature(dataloader.__init__).parameters) + contains_dataset = True + + if type(dataloader) is not DataLoader: + contains_dataset = "dataset" in params + params.update(inspect.signature(DataLoader.__init__).parameters) + + dl_args = {name: attrs[name] for name in params if name in attrs and name not in skip_keys} - dl_args = {k: v for k, v in dataloader.__dict__.items() if not k.startswith('_') and k not in skip_keys} - dl_args["options"] = opts multiprocessing_context = dataloader.multiprocessing_context - dataloader = poptorch.DataLoader(**dl_args) + dl_args['multiprocessing_context'] = multiprocessing_context + if not contains_dataset: + dl_args.pop('dataset') + + dataloader = poptorch.DataLoader(**dl_args, options=opts) dataloader.multiprocessing_context = multiprocessing_context return dataloader + + def training_step(self, *args, **kwargs): + # todo: we shouldn't need to drop the batch idx here + # also the args are now being passed as individual args which is different, i.e + # def training_step(batch, batch_idx): + # becomes + # def training_step(x, y): + # where x and y are the batch arguments... 
+ args = args[0] # Drop the batch idx + return self.model(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + batch_idx = torch.tensor(args[1], dtype=torch.int) + args = args[0] # Drop the batch idx + return self.inference_models['val'](*args, batch_idx, **kwargs) + + def test_step(self, *args, **kwargs): + args = args[0] # Drop the batch idx + return self.inference_models['test'](*args, **kwargs) + + def predict_step(self, *args, **kwargs): + args = args[0] # Drop the batch idx + return self.inference_models['predict'](*args, **kwargs) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 3437d2b0bcff3..eb58cf061a0d9 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -347,7 +347,7 @@ def select_precision_plugin(self) -> PrecisionPlugin: self.amp_type = AMPType.from_str(self.amp_type) if self._device_type == DeviceType.IPU: - return IPUPrecisionPlugin() + return IPUPrecisionPlugin(self.precision) if self._distrib_type == DistributedType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): return DeepSpeedPrecisionPlugin(self.precision) @@ -446,7 +446,7 @@ def select_training_type_plugin(self) -> TrainingTypePlugin: elif self.on_tpu and isinstance(self.tpu_cores, list): plugin = SingleTPUPlugin(self.tpu_id) elif self.on_ipu: - plugin = IPUPlugin(mixed_precision=self.precision == 32) + plugin = IPUPlugin() else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu")) From 0c003608503f940a777bd7754adab867e734c05c Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 26 May 2021 11:31:14 +0100 Subject: [PATCH 06/60] Fixes --- pl_examples/ipu_examples/mnist.py | 75 +++++++------------ .../plugins/training_type/ipu.py | 75 ++++++++++++------- .../training_type/training_type_plugin.py | 16 ++++ .../connectors/accelerator_connector.py | 9 ++- pytorch_lightning/trainer/data_loading.py | 7 ++ pytorch_lightning/trainer/trainer.py | 5 +- 6 files changed, 106 insertions(+), 81 deletions(-) diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index 2f084d41b4124..8db1a6dfde949 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -16,88 +16,65 @@ from pprint import pprint import torch -import torch.nn as nn from torch.nn import functional as F import pytorch_lightning as pl from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule -class Block(nn.Module): - - def __init__(self, in_channels, num_filters, kernel_size, pool_size): - super(Block, self).__init__() - self.conv = nn.Conv2d(in_channels=in_channels, out_channels=num_filters, kernel_size=kernel_size) - self.pool = nn.MaxPool2d(kernel_size=pool_size) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.pool(x) - x = self.relu(x) - return x - - class LitClassifier(pl.LightningModule): - def __init__(self, learning_rate=1e-3): + def __init__( + self, + hidden_dim: int = 128, + learning_rate: float = 0.0001, + ): super().__init__() self.save_hyperparameters() - self.layer1 = Block(1, 32, 3, 2) - self.layer2 = Block(32, 64, 3, 2) - self.layer3 = nn.Linear(1600, 128) - self.layer3_act = nn.ReLU() - self.layer3_dropout = torch.nn.Dropout(0.5) - self.layer4 = nn.Linear(128, 10) - self.softmax = 
nn.Softmax(1) + self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim) + self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10) def forward(self, x): - x = self.layer1(x) - x = self.layer2(x) - # Flatten layer - x = x.view(-1, 1600) - x = self.layer3_act(self.layer3(x)) - x = self.layer4(self.layer3_dropout(x)) - x = self.softmax(x) + x = x.view(x.size(0), -1) + x = torch.relu(self.l1(x)) + x = torch.relu(self.l2(x)) return x - def training_step(self, x, y, batch_idx): + def training_step(self, batch, batch_idx): + x, y = batch y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss - def validation_step(self, x, y): + def validation_step(self, batch, batch_idx): + x, y = batch y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss - def test_step(self, x, y): + def test_step(self, batch, batch_idx): + x, y = batch y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss - def configure_optimizers(self): - return torch.optim.AdamW(self.parameters(), lr=self.hparams.learning_rate) + def on_validation_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx: int) -> None: + self.log('val_loss', outputs.mean(), prog_bar=True) - @staticmethod - def add_model_specific_args(parser): - parser.add_argument('--learning_rate', type=float, default=0.0001) - return parser + def on_test_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx: int) -> None: + self.log('test_loss', outputs.mean(), prog_bar=True) + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) -if __name__ == '__main__': - parser = ArgumentParser() - parser = pl.Trainer.add_argparse_args(parser) - parser = LitClassifier.add_model_specific_args(parser) - parser = MNISTDataModule.add_argparse_args(parser) - args = parser.parse_args() - dm = MNISTDataModule.from_argparse_args(args) +if __name__ == '__main__': + dm = MNISTDataModule(batch_size=32) - model = LitClassifier(args.learning_rate) + model = LitClassifier() - trainer = pl.Trainer.from_argparse_args(args, max_epochs=10, accelerator='ipu') + trainer = pl.Trainer(max_epochs=10, accelerator='ipu', ipu_cores=8) trainer.fit(model, datamodule=dm) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 385bc09edd6b8..dbdc7fc931942 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -63,7 +63,6 @@ def __init__( super().__init__(parallel_devices, cluster_environment) self.half = half self.device_iterations = device_iterations - self.replication_factor = replication_factor self.autoround_num_ipus = autoround_num_ipus self.autoreport = autoreport self.autoreport_dir = autoreport_dir @@ -117,11 +116,9 @@ def pre_dispatch(self) -> None: # Create model for training which will run training. optimizer = self.lightning_module.trainer.optimizers[0] - self.model = poptorch.trainingModel( - model=LightningIPUModule(self.lightning_module, precision), - options=self._create_opts(is_train_model=True), - optimizer=optimizer - ) + model = LightningIPUModule(self.lightning_module, precision) + + self.model = poptorch.trainingModel(model=model, options=self._create_opts(training=True), optimizer=optimizer) # Separate models are instantiated for different stages, but they share the same weights on host. # When validation/test models are run, they sync weights first. 
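The weight-sharing behaviour described in the comment above comes from PopTorch itself rather than from this plugin. A rough standalone sketch of the trainingModel/inferenceModel pattern the plugin builds on (assuming the Poplar SDK with an attached IPU; TinyModel and the option values are illustrative and not part of the patch):

    import torch
    import poptorch


    class TinyModel(torch.nn.Module):
        # PopTorch convention: the training graph computes and returns its own loss.
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(4, 2)
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, x, y=None):
            logits = self.linear(x)
            if y is not None:
                return logits, self.loss(logits, y)
            return logits


    model = TinyModel()
    opts = poptorch.Options()
    opts.deviceIterations(1)   # batches consumed per on-device invocation
    opts.replicationFactor(1)  # data-parallel replicas across IPUs

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    # forward, backward and the optimizer update all run on the IPU in one call
    train_model = poptorch.trainingModel(model, options=opts, optimizer=optimizer)
    # wraps the same host weights; they are synced before inference runs
    infer_model = poptorch.inferenceModel(model, options=opts)

    x = torch.randn(8, 4)
    y = torch.randint(0, 2, (8,))
    _, loss = train_model(x, y)  # one compiled training step
    preds = infer_model(x)       # uses the freshly trained weights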
@@ -130,19 +127,35 @@ def pre_dispatch(self) -> None: self.inference_models = {} for x in ('val', 'test', 'predict'): self.inference_models[x] = poptorch.inferenceModel( - model=LightningIPUModule(self.lightning_module, precision), - options=self._create_opts(is_train_model=False), + model=model, + options=self._create_opts(training=False), ) - def _create_opts(self, is_train_model): + @property + def replication_factor(self): + return len(self.parallel_devices) + + def _create_opts(self, training): opts = poptorch.Options() opts.deviceIterations(self.device_iterations) opts.replicationFactor(self.replication_factor) - gradient_accumulation = self.lightning_module.trainer.accumulate_grad_batches if is_train_model else 1 + gradient_accumulation = self.lightning_module.trainer.accumulate_grad_batches if training else 1 opts.Training.gradientAccumulation(gradient_accumulation) opts.autoRoundNumIPUs(self.autoround_num_ipus) return opts + def on_reset_train_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + return self.process_dataloader(dataloader) + + def on_reset_val_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + return self.process_dataloader(dataloader) + + def on_reset_test_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + return self.process_dataloader(dataloader) + + def on_reset_predict_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + return self.process_dataloader(dataloader) + def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: if isinstance(dataloader, CombinedLoader): dataloader.loaders = apply_to_collection( @@ -151,15 +164,17 @@ def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[I self.process_dataloader, ) return dataloader - + elif isinstance(dataloader, list): + dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader) + return dataloader if not isinstance(dataloader, poptorch.DataLoader): dataloader = self._convert_to_poptorch_loader( - dataloader=dataloader, opts=self._create_opts(is_train_model=self.lightning_module.training) + dataloader=dataloader, opts=self._create_opts(training=self.lightning_module.training) ) return dataloader def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], - opts: poptorch.Options) -> Union[Iterable, DataLoader]: + opts: 'poptorch.Options') -> Union[Iterable, DataLoader]: skip_keys = ('sampler', 'batch_sampler', 'dataset_kind') attrs = {k: v for k, v in vars(dataloader).items() if not k.startswith("_")} @@ -183,24 +198,28 @@ def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], return dataloader def training_step(self, *args, **kwargs): - # todo: we shouldn't need to drop the batch idx here - # also the args are now being passed as individual args which is different, i.e - # def training_step(batch, batch_idx): - # becomes - # def training_step(x, y): - # where x and y are the batch arguments... - args = args[0] # Drop the batch idx - return self.model(*args, **kwargs) + args, batch_idx = self._prepare_input(args) + return self.model(args, batch_idx, **kwargs) + + def _prepare_input(self, args): + args, batch_idx = args + # explicit conversion to tuple as Lists are not supported in jit as they are mutable + # todo: we probably want to apply this to all lists in the object + # todo: do we need to do additional checks for dicts? 
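+ # PopTorch splits the leading dimension of every positional input across replicas, device
+ # iterations and gradient-accumulation steps, so scalar inputs (the batch index below) are
+ # expanded into a tensor with one entry per split before reaching the compiled model.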
+ args = tuple(args) + accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + num_repeat = self.replication_factor * self.device_iterations * accumulate_grad_batches + batch_idx = torch.tensor(batch_idx, dtype=torch.int).unsqueeze(0).repeat(num_repeat) + return args, batch_idx def validation_step(self, *args, **kwargs): - batch_idx = torch.tensor(args[1], dtype=torch.int) - args = args[0] # Drop the batch idx - return self.inference_models['val'](*args, batch_idx, **kwargs) + args, batch_idx = self._prepare_input(args) + return self.inference_models['val'](args, batch_idx, **kwargs) def test_step(self, *args, **kwargs): - args = args[0] # Drop the batch idx - return self.inference_models['test'](*args, **kwargs) + args, batch_idx = self._prepare_input(args) + return self.inference_models['test'](args, batch_idx, **kwargs) def predict_step(self, *args, **kwargs): - args = args[0] # Drop the batch idx - return self.inference_models['predict'](*args, **kwargs) + args, batch_idx = self._prepare_input(args) + return self.inference_models['predict'](args, batch_idx, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index ede5717258040..b440b5685229f 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -186,6 +186,22 @@ def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[I """ return dataloader + def on_reset_train_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + """Called before resetting the train dataloader.""" + return dataloader + + def on_reset_val_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + """Called before resetting the val dataloader.""" + return dataloader + + def on_reset_test_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + """Called before resetting the test dataloader.""" + return dataloader + + def on_reset_predict_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + """Called before resetting the predict dataloader.""" + return dataloader + def init_optimizers(self, trainer: 'pl.Trainer', model: 'pl.LightningModule'): return trainer.init_optimizers(model) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index eb58cf061a0d9..c5f3e7d656f7d 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -79,6 +79,7 @@ def __init__( self, num_processes, tpu_cores, + ipu_cores, distributed_backend, auto_select_gpus, gpus, @@ -98,6 +99,7 @@ def __init__( self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + self.ipu_cores = ipu_cores self.distributed_backend = distributed_backend self.auto_select_gpus = auto_select_gpus self.gpus = gpus @@ -250,7 +252,7 @@ def on_tpu(self) -> bool: @property def on_ipu(self) -> bool: - return self._device_type == DeviceType.IPU + return self.ipu_cores is not None @property def tpu_id(self) -> Optional[int]: @@ -314,6 +316,9 @@ def parallel_devices(self) -> List[Union[torch.device, int]]: # https://github.com/PyTorchLightning/pytorch-lightning/issues/3169 if isinstance(self.tpu_cores, int): devices = list(range(self.tpu_cores)) + elif self.on_ipu: + if isinstance(self.ipu_cores, int): + devices = list(range(self.ipu_cores)) else: devices = 
[torch.device("cpu")] * self.num_processes return devices @@ -446,7 +451,7 @@ def select_training_type_plugin(self) -> TrainingTypePlugin: elif self.on_tpu and isinstance(self.tpu_cores, list): plugin = SingleTPUPlugin(self.tpu_id) elif self.on_ipu: - plugin = IPUPlugin() + plugin = IPUPlugin(parallel_devices=self.parallel_devices) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu")) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 29711b23d8546..42d7a8d4e2328 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -261,6 +261,9 @@ def reset_train_dataloader(self, model: LightningModule) -> None: # wrap the sequence of train loaders to a CombinedLoader object for computing the num_training_batches self.train_dataloader = CombinedLoader(self.train_dataloader, self._multiple_trainloader_mode) + # todo (sean): should be the accelerator, not the training type plugin + self.train_dataloader = self.accelerator.training_type_plugin.on_reset_train_dataloader(self.train_dataloader) + self.num_training_batches = len(self.train_dataloader) if has_len(self.train_dataloader) else float('inf') if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: @@ -361,6 +364,10 @@ def _reset_eval_dataloader( # add worker_init_fn for correct seeding in worker processes apply_to_collection(dataloaders, dtype=DataLoader, function=self.auto_add_worker_init_fn) + hook_name = f"on_reset_{mode}_dataloader" + # todo (sean): should be the accelerator, not the training type plugin + dataloaders = getattr(self.accelerator.training_type_plugin, hook_name)(dataloaders) + loader_num_batches = [] # determine number of batches diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8732d8c33dce7..9b84a761d9fa4 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -102,6 +102,7 @@ def __init__( gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], str, int]] = None, + ipu_cores: Optional[int] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: Optional[int] = None, overfit_batches: Union[int, float] = 0.0, @@ -318,8 +319,8 @@ def __init__( self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = AcceleratorConnector( - num_processes, tpu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, benchmark, - replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins + num_processes, tpu_cores, ipu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, + benchmark, replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) From adbdb2a023e83b861b58ddbd37ffa50dc1809b42 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 26 May 2021 11:38:16 +0100 Subject: [PATCH 07/60] Clear up files --- pl_examples/ipu_examples/mnist.py | 1 - pytorch_lightning/plugins/training_type/ipu.py | 1 - 2 files changed, 2 deletions(-) diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index 8db1a6dfde949..bc535daf94542 100644 --- a/pl_examples/ipu_examples/mnist.py +++ 
b/pl_examples/ipu_examples/mnist.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from argparse import ArgumentParser from pprint import pprint import torch diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index dbdc7fc931942..68be0ba1451bd 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -53,7 +53,6 @@ def __init__( self, half: bool = False, device_iterations: int = 1, - replication_factor: int = 1, autoround_num_ipus: bool = True, autoreport: bool = True, autoreport_dir: Optional[str] = None, From 3e733af990f137273f645608a45bcb9c30282017 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 27 May 2021 11:10:59 +0100 Subject: [PATCH 08/60] Add tests, helpers, fixes --- pl_examples/ipu_examples/mnist.py | 29 +-- .../plugins/training_type/ipu.py | 61 +++--- .../connectors/accelerator_connector.py | 10 +- pytorch_lightning/utilities/__init__.py | 1 + pytorch_lightning/utilities/imports.py | 6 + tests/accelerators/test_ipu.py | 176 ++++++++++++++++++ tests/helpers/runif.py | 7 + 7 files changed, 253 insertions(+), 37 deletions(-) create mode 100644 tests/accelerators/test_ipu.py diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index bc535daf94542..980dd3430446c 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -48,21 +48,28 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): x, y = batch - y_hat = self(x) - loss = F.cross_entropy(y_hat, y) - return loss + logits = self(x) + acc = self.accuracy(logits, y) + return acc def test_step(self, batch, batch_idx): x, y = batch - y_hat = self(x) - loss = F.cross_entropy(y_hat, y) - return loss + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def accuracy(self, logits, y): + # todo (sean): currently IPU poptorch doesn't implicit convert bools to tensor + # hence we use an explicit calculation for accuracy here. Once fixed in poptorch + # we can use the accuracy metric. 
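+ # The explicit form below is equivalent to (logits.argmax(-1) == y).float().mean().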
+ acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) + return acc - def on_validation_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx: int) -> None: - self.log('val_loss', outputs.mean(), prog_bar=True) + def validation_epoch_end(self, outputs) -> None: + self.log('val_acc', torch.stack(outputs).mean(), prog_bar=True) - def on_test_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx: int) -> None: - self.log('test_loss', outputs.mean(), prog_bar=True) + def test_epoch_end(self, outputs) -> None: + self.log('test_acc', torch.stack(outputs).mean()) def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) @@ -73,7 +80,7 @@ def configure_optimizers(self): model = LitClassifier() - trainer = pl.Trainer(max_epochs=10, accelerator='ipu', ipu_cores=8) + trainer = pl.Trainer(max_epochs=2, ipu_cores=8) trainer.fit(model, datamodule=dm) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 68be0ba1451bd..4b729b6c8c034 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -107,25 +107,32 @@ def lightning_module(self) -> Optional[LightningModule]: return model.module if isinstance(model, LightningIPUModule) else model def pre_dispatch(self) -> None: + ''' + The issue here is we assume we're training. + What if we're not training? + I say + ''' if self.half: - log.info('Using 16bit precision, converting model to FP16.') + log.info('Using full 16bit precision, converting LightningModule weights to FP16.') self.model = self.model.half() precision = self.lightning_module.trainer.accelerator.precision_plugin.precision precision = 16 if self.half else precision - # Create model for training which will run training. - optimizer = self.lightning_module.trainer.optimizers[0] model = LightningIPUModule(self.lightning_module, precision) - - self.model = poptorch.trainingModel(model=model, options=self._create_opts(training=True), optimizer=optimizer) + self.model = model # Separate models are instantiated for different stages, but they share the same weights on host. # When validation/test models are run, they sync weights first. - # todo: not sure this is the cleanest way to do this... - self.inference_models = {} + self.poptorch_wrapped_models = {} + if self.lightning_module.trainer.training: + # Create model for training which will run training. + optimizer = self.lightning_module.trainer.optimizers[0] + self.poptorch_wrapped_models['train'] = poptorch.trainingModel( + model=model, options=self._create_opts(training=True), optimizer=optimizer + ) for x in ('val', 'test', 'predict'): - self.inference_models[x] = poptorch.inferenceModel( + self.poptorch_wrapped_models[x] = poptorch.inferenceModel( model=model, options=self._create_opts(training=False), ) @@ -141,6 +148,10 @@ def _create_opts(self, training): gradient_accumulation = self.lightning_module.trainer.accumulate_grad_batches if training else 1 opts.Training.gradientAccumulation(gradient_accumulation) opts.autoRoundNumIPUs(self.autoround_num_ipus) + + # todo (sean): unsure if this is necessary but to be safe. 
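+ # PL_GLOBAL_SEED is exported by pytorch_lightning's seed_everything; forwarding it here
+ # ties on-device randomness (e.g. dropout) to the host seed for reproducible runs.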
+ if os.environ.get("PL_GLOBAL_SEED"): + opts.randomSeed(int(os.environ["PL_GLOBAL_SEED"])) return opts def on_reset_train_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: @@ -197,28 +208,32 @@ def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], return dataloader def training_step(self, *args, **kwargs): - args, batch_idx = self._prepare_input(args) - return self.model(args, batch_idx, **kwargs) + args = self._prepare_input(args) + return self.poptorch_wrapped_models['train'](*args, **kwargs) def _prepare_input(self, args): - args, batch_idx = args - # explicit conversion to tuple as Lists are not supported in jit as they are mutable - # todo: we probably want to apply this to all lists in the object - # todo: do we need to do additional checks for dicts? - args = tuple(args) + # Ensure we replicate primitives values to have enough dimensions to split across devices accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches num_repeat = self.replication_factor * self.device_iterations * accumulate_grad_batches - batch_idx = torch.tensor(batch_idx, dtype=torch.int).unsqueeze(0).repeat(num_repeat) - return args, batch_idx + + def to_tuple(x): + return tuple(x) + + def to_tensor(x): + return torch.tensor(x).unsqueeze(0).repeat(num_repeat) + + args = apply_to_collection(args, dtype=list, function=to_tuple) + args = apply_to_collection(args, dtype=(int, float), function=to_tensor) + return args def validation_step(self, *args, **kwargs): - args, batch_idx = self._prepare_input(args) - return self.inference_models['val'](args, batch_idx, **kwargs) + args = self._prepare_input(args) + return self.poptorch_wrapped_models['val'](*args, **kwargs) def test_step(self, *args, **kwargs): - args, batch_idx = self._prepare_input(args) - return self.inference_models['test'](args, batch_idx, **kwargs) + args = self._prepare_input(args) + return self.poptorch_wrapped_models['test'](*args, **kwargs) def predict_step(self, *args, **kwargs): - args, batch_idx = self._prepare_input(args) - return self.inference_models['predict'](args, batch_idx, **kwargs) + args = self._prepare_input(args) + return self.poptorch_wrapped_models['predict'](*args, **kwargs) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index c91b2b71dc693..1d50a93b0b086 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -60,6 +60,7 @@ from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _HOROVOD_AVAILABLE, + _IPU_AVAILABLE, _NATIVE_AMP_AVAILABLE, _TPU_AVAILABLE, AMPType, @@ -367,7 +368,7 @@ def select_precision_plugin(self) -> PrecisionPlugin: # set precision type self.amp_type = AMPType.from_str(self.amp_type) - if self._device_type == DeviceType.IPU: + if self.on_ipu: return IPUPrecisionPlugin(self.precision) if self._distrib_type == DistributedType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): @@ -632,8 +633,11 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): ) rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}') - num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') + num_tpu_cores = self.tpu_cores if self.tpu_cores is not None else 0 + rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: 
{num_tpu_cores} TPU cores') + + num_ipu_cores = self.ipu_cores if self.ipu_cores is not None else 0 + rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: {num_ipu_cores} IPU cores') if torch.cuda.is_available() and self._device_type != DeviceType.GPU: rank_zero_warn( diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 9920c9e41cb8f..613a5013d5198 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -43,6 +43,7 @@ _HOROVOD_AVAILABLE, _HYDRA_AVAILABLE, _HYDRA_EXPERIMENTAL_AVAILABLE, + _IPU_AVAILABLE, _IS_INTERACTIVE, _module_available, _NATIVE_AMP_AVAILABLE, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index a6f2b192a97f7..2a51b01404821 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -97,3 +97,9 @@ def _compare_version(package: str, op, version) -> bool: from pytorch_lightning.utilities.xla_device import XLADeviceUtils # noqa: E402 _TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists() + +if _POPTORCH_AVAILABLE: + import poptorch + _IPU_AVAILABLE = poptorch.ipuHardwareIsAvailable() +else: + _IPU_AVAILABLE = False diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py new file mode 100644 index 0000000000000..6a585934e13fe --- /dev/null +++ b/tests/accelerators/test_ipu.py @@ -0,0 +1,176 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
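# --- editor's sketch, not part of the patch -----------------------------------
# The imports.py change above separates "poptorch is importable" from "IPU
# hardware is actually attached". A minimal self-contained version of that
# pattern (the patch defines _POPTORCH_AVAILABLE elsewhere in imports.py; this
# sketch folds both checks into a single try/except):
try:
    import poptorch
    _POPTORCH_AVAILABLE = True
    _IPU_AVAILABLE = poptorch.ipuHardwareIsAvailable()
except ImportError:
    _POPTORCH_AVAILABLE = False
    _IPU_AVAILABLE = False
# The tests below then use @RunIf(ipu=True) to skip when the requirement is not met.
# -------------------------------------------------------------------------------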
+import os + +import pytest +import torch +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from pytorch_lightning import seed_everything, Trainer +from tests.helpers.boring_model import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.datasets import SklearnDataset +from tests.helpers.runif import RunIf +from tests.helpers.simple_models import ClassificationModel + + +class IPUModel(BoringModel): + + def training_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def validation_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def test_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def training_epoch_end(self, outputs) -> None: + pass + + def validation_epoch_end(self, outputs) -> None: + pass + + def test_epoch_end(self, outputs) -> None: + pass + + +class IPUClassificationModel(ClassificationModel): + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def accuracy(self, logits, y): + # todo (sean): currently IPU poptorch doesn't implicit convert bools to tensor + # hence we use an explicit calculation for accuracy here. Once fixed in poptorch + # we can use the accuracy metric. + acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) + return acc + + def validation_epoch_end(self, outputs) -> None: + self.log('val_acc', torch.stack(outputs).mean()) + + def test_epoch_end(self, outputs) -> None: + self.log('test_acc', torch.stack(outputs).mean()) + + +@RunIf(ipu=True) +@pytest.mark.parametrize('ipu_cores', [1, 4]) +def test_all_stages(tmpdir, ipu_cores): + model = IPUModel() + trainer = Trainer(fast_dev_run=True, accelerator='ipu', ipu_cores=ipu_cores) + trainer.fit(model) + trainer.validate(model) + trainer.test(model) + trainer.predict(model, model.val_dataloader()) + + +@RunIf(ipu=True) +@pytest.mark.parametrize('ipu_cores', [1, 4]) +def test_inference_only(tmpdir, ipu_cores): + model = IPUModel() + + trainer = Trainer(fast_dev_run=True, accelerator='ipu', ipu_cores=ipu_cores) + trainer.validate(model) + trainer.test(model) + trainer.predict(model, model.val_dataloader()) + + +def test_optimization(tmpdir): + seed_everything(42) + + # Override to drop last uneven batch, as IPU poptorch does not support uneven inputs. 
+ class DataModule(ClassifDataModule): + + def train_dataloader(self): + return DataLoader( + SklearnDataset(self.x_train, self.y_train, self._x_type, self._y_type), + batch_size=self.batch_size, + drop_last=True + ) + + def val_dataloader(self): + return DataLoader( + SklearnDataset(self.x_valid, self.y_valid, self._x_type, self._y_type), + batch_size=self.batch_size, + drop_last=True + ) + + def test_dataloader(self): + return DataLoader( + SklearnDataset(self.x_test, self.y_test, self._x_type, self._y_type), + batch_size=self.batch_size, + drop_last=True + ) + + dm = DataModule(length=1024) + model = IPUClassificationModel() + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + weights_summary=None, + deterministic=True, + ipu_cores=2, + ) + + # fit model + trainer.fit(model, dm) + assert trainer.state.finished, f"Training failed with {trainer.state}" + assert dm.trainer is not None + + # validate + result = trainer.validate(datamodule=dm) + assert dm.trainer is not None + assert result[0]['val_acc'] > 0.7 + + # test + result = trainer.test(datamodule=dm) + assert dm.trainer is not None + test_result = result[0]['test_acc'] + assert test_result > 0.6 + + # test saved model + model_path = os.path.join(tmpdir, 'model.pt') + trainer.save_checkpoint(model_path) + + model = IPUClassificationModel.load_from_checkpoint(model_path) + + trainer = Trainer(default_root_dir=tmpdir, deterministic=True) + + result = trainer.test(model, dm.test_dataloader()) + saved_result = result[0]['test_acc'] + assert saved_result > 0.6 and (saved_result == test_result) diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 630a341ec2d30..2e528cbd6430d 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -28,6 +28,7 @@ _FAIRSCALE_PIPE_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE, + _POPTORCH_AVAILABLE, _RPC_AVAILABLE, _TORCH_QUANTIZE_AVAILABLE, _TPU_AVAILABLE, @@ -63,6 +64,7 @@ def __new__( amp_apex: bool = False, amp_native: bool = False, tpu: bool = False, + ipu: bool = False, horovod: bool = False, horovod_nccl: bool = False, skip_windows: bool = False, @@ -85,6 +87,7 @@ def __new__( amp_apex: NVIDIA Apex is installed amp_native: if native PyTorch native AMP is supported tpu: if TPU is available + ipu: if IPU is available horovod: if Horovod is installed horovod_nccl: if Horovod is installed with NCCL support skip_windows: skip test for Windows platform (typically fo some limited torch functionality) @@ -139,6 +142,10 @@ def __new__( conditions.append(not _TPU_AVAILABLE) reasons.append("TPU") + if ipu: + conditions.append(not _POPTORCH_AVAILABLE) + reasons.append("IPU") + if horovod: conditions.append(not _HOROVOD_AVAILABLE) reasons.append("Horovod") From a51f23ee5a0c0103c31f9462a082b76cea4d5d05 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 27 May 2021 12:07:22 +0100 Subject: [PATCH 09/60] Small cleanups --- .../plugins/training_type/ipu.py | 87 +++++++++---------- tests/helpers/runif.py | 4 +- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 4b729b6c8c034..a3e0ef35141c9 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -4,6 +4,7 @@ from typing import Any, Iterable, List, Optional, Union import torch +from torch.nn import Module from torch.utils.data import DataLoader from pytorch_lightning import _logger as log @@ -19,11 +20,8 @@ if _POPTORCH_AVAILABLE: import poptorch - if 
not poptorch.ipuHardwareIsAvailable(): - raise MisconfigurationException("IPU Accelerator requires IPUs to run.") - -# todo: No idea what's happening with grad accumulation, need to check since IPUs handle grad accum. -# todo: or even lr scheduling... +# todo: Check gradient accumulation to ensure this works, similar to DeepSpeed IPUs manage this. +# todo: Check lr scheduling to ensure that when the LR is changed, we update the optimizer state. class LightningIPUModule(_LightningModuleWrapperBase): @@ -65,6 +63,7 @@ def __init__( self.autoround_num_ipus = autoround_num_ipus self.autoreport = autoreport self.autoreport_dir = autoreport_dir + self.poptorch_models = {} if self.autoreport: options = {"autoReport.all": self.autoreport} @@ -74,44 +73,16 @@ def __init__( options["autoReport.directory"] = self.autoreport_dir os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options) - @property - def on_gpu(self) -> bool: - return False - - @property - def root_device(self) -> torch.device: - pass - - def model_to_device(self) -> None: - pass - - @property - def is_global_zero(self) -> bool: - return True - - def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Any]: - return tensor - - def barrier(self, name: Optional[str] = None) -> None: - pass - - def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: - return tensor - - def broadcast(self, obj: object, src: int = 0) -> object: - return obj + def setup(self, model: Module) -> None: + super().setup(model) + if not poptorch.ipuHardwareIsAvailable(): + raise MisconfigurationException("IPU Accelerator requires IPUs to run.") @property def lightning_module(self) -> Optional[LightningModule]: - model = self.model.module if isinstance(self.model, poptorch.PoplarExecutor) else self.model - return model.module if isinstance(model, LightningIPUModule) else model + return self.model.module if isinstance(self.model, LightningIPUModule) else self.model def pre_dispatch(self) -> None: - ''' - The issue here is we assume we're training. - What if we're not training? - I say - ''' if self.half: log.info('Using full 16bit precision, converting LightningModule weights to FP16.') self.model = self.model.half() @@ -124,15 +95,14 @@ def pre_dispatch(self) -> None: # Separate models are instantiated for different stages, but they share the same weights on host. # When validation/test models are run, they sync weights first. - self.poptorch_wrapped_models = {} if self.lightning_module.trainer.training: # Create model for training which will run training. 
optimizer = self.lightning_module.trainer.optimizers[0] - self.poptorch_wrapped_models['train'] = poptorch.trainingModel( + self.poptorch_models['train'] = poptorch.trainingModel( model=model, options=self._create_opts(training=True), optimizer=optimizer ) for x in ('val', 'test', 'predict'): - self.poptorch_wrapped_models[x] = poptorch.inferenceModel( + self.poptorch_models[x] = poptorch.inferenceModel( model=model, options=self._create_opts(training=False), ) @@ -209,7 +179,7 @@ def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], def training_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_wrapped_models['train'](*args, **kwargs) + return self.poptorch_models['train'](*args, **kwargs) def _prepare_input(self, args): # Ensure we replicate primitives values to have enough dimensions to split across devices @@ -228,12 +198,39 @@ def to_tensor(x): def validation_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_wrapped_models['val'](*args, **kwargs) + return self.poptorch_models['val'](*args, **kwargs) def test_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_wrapped_models['test'](*args, **kwargs) + return self.poptorch_models['test'](*args, **kwargs) def predict_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_wrapped_models['predict'](*args, **kwargs) + return self.poptorch_models['predict'](*args, **kwargs) + + @property + def on_gpu(self) -> bool: + return False + + @property + def root_device(self) -> torch.device: + pass + + def model_to_device(self) -> None: + pass + + @property + def is_global_zero(self) -> bool: + return True + + def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Any]: + return tensor + + def barrier(self, name: Optional[str] = None) -> None: + pass + + def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + return tensor + + def broadcast(self, obj: object, src: int = 0) -> object: + return obj diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 2e528cbd6430d..737ddd68dff17 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -27,8 +27,8 @@ _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _FAIRSCALE_PIPE_AVAILABLE, _HOROVOD_AVAILABLE, + _IPU_AVAILABLE, _NATIVE_AMP_AVAILABLE, - _POPTORCH_AVAILABLE, _RPC_AVAILABLE, _TORCH_QUANTIZE_AVAILABLE, _TPU_AVAILABLE, @@ -143,7 +143,7 @@ def __new__( reasons.append("TPU") if ipu: - conditions.append(not _POPTORCH_AVAILABLE) + conditions.append(not _IPU_AVAILABLE) reasons.append("IPU") if horovod: From be7de87f4ec2024cdf3641f19661409e57d5d0ea Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 1 Jun 2021 12:17:24 +0100 Subject: [PATCH 10/60] Refactors based on review --- .../plugins/training_type/ipu.py | 81 +++++++++++++------ tests/accelerators/test_ipu.py | 3 + 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index a3e0ef35141c9..fd2a701464918 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -1,6 +1,7 @@ import inspect import json import os +from enum import Enum from typing import Any, Iterable, List, Optional, Union import torch @@ -20,9 +21,22 @@ if _POPTORCH_AVAILABLE: import poptorch -# todo: Check gradient accumulation to ensure this works, similar to 
DeepSpeed IPUs manage this. +# todo: Check gradient accumulation to ensure this works, similar to DeepSpeed, IPUs manage this. # todo: Check lr scheduling to ensure that when the LR is changed, we update the optimizer state. +# todo: does creating an inference model and a training model allocate double the IPU cores? +# todo: can we have one inference model for test/val/predict which takes a bool to choose a path? + + +class IPUStage(Enum): + training = torch.tensor([0]) + validation = torch.tensor([1]) + testing = torch.tensor([2]) + predicting = torch.tensor([3]) + + def __eq__(self, other): + return torch.equal(self.value, other) + class LightningIPUModule(_LightningModuleWrapperBase): @@ -30,11 +44,23 @@ def __init__(self, pl_module: LightningModule, precision: int): super().__init__(pl_module) self.precision = precision - def forward(self, *inputs, **kwargs): + def forward(self, stage, *inputs, **kwargs): if self.precision == 16: inputs = self._move_float_tensors_to_half(inputs) - return super().forward(*inputs, **kwargs) + trainer = self.module.trainer + if trainer and IPUStage.training == stage: + output = self.module.training_step(*inputs, **kwargs) + elif trainer and IPUStage.testing == stage: + output = self.module.test_step(*inputs, **kwargs) + elif trainer and IPUStage.validation == stage: + output = self.module.validation_step(*inputs, **kwargs) + elif trainer and IPUStage.predicting == stage: + output = self.module.predict_step(*inputs, **kwargs) + else: + output = self.module(*inputs, **kwargs) + + return output @staticmethod def batch_to(data): @@ -63,7 +89,8 @@ def __init__( self.autoround_num_ipus = autoround_num_ipus self.autoreport = autoreport self.autoreport_dir = autoreport_dir - self.poptorch_models = {} + self.train_model = None + self.inference_model = None if self.autoreport: options = {"autoReport.all": self.autoreport} @@ -89,23 +116,22 @@ def pre_dispatch(self) -> None: precision = self.lightning_module.trainer.accelerator.precision_plugin.precision precision = 16 if self.half else precision - model = LightningIPUModule(self.lightning_module, precision) - self.model = model - # Separate models are instantiated for different stages, but they share the same weights on host. # When validation/test models are run, they sync weights first. + model = LightningIPUModule(self.lightning_module, precision) + self.model = model if self.lightning_module.trainer.training: # Create model for training which will run training. 
optimizer = self.lightning_module.trainer.optimizers[0] - self.poptorch_models['train'] = poptorch.trainingModel( + self.train_model = poptorch.trainingModel( model=model, options=self._create_opts(training=True), optimizer=optimizer ) - for x in ('val', 'test', 'predict'): - self.poptorch_models[x] = poptorch.inferenceModel( - model=model, - options=self._create_opts(training=False), - ) + + self.inference_model = poptorch.inferenceModel( + model=model, + options=self._create_opts(training=False), + ) @property def replication_factor(self): @@ -177,36 +203,45 @@ def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], dataloader.multiprocessing_context = multiprocessing_context return dataloader - def training_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models['train'](*args, **kwargs) + @property + def _n_replicate(self): + accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + return self.replication_factor * self.device_iterations * accumulate_grad_batches def _prepare_input(self, args): - # Ensure we replicate primitives values to have enough dimensions to split across devices - accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches - num_repeat = self.replication_factor * self.device_iterations * accumulate_grad_batches def to_tuple(x): return tuple(x) def to_tensor(x): - return torch.tensor(x).unsqueeze(0).repeat(num_repeat) + return torch.tensor(x).unsqueeze(0).repeat(self._n_replicate) args = apply_to_collection(args, dtype=list, function=to_tuple) args = apply_to_collection(args, dtype=(int, float), function=to_tensor) return args + def _prepare_stage(self, stage: IPUStage): + return stage.value.repeat(self._n_replicate) + + def training_step(self, *args, **kwargs): + args = self._prepare_input(args) + stage = self._prepare_stage(IPUStage.training) + return self.train_model(stage, *args, **kwargs) + def validation_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_models['val'](*args, **kwargs) + stage = self._prepare_stage(IPUStage.validation) + return self.inference_model(stage, *args, **kwargs) def test_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_models['test'](*args, **kwargs) + stage = self._prepare_stage(IPUStage.testing) + return self.inference_model(stage, *args, **kwargs) def predict_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_models['predict'](*args, **kwargs) + stage = self._prepare_stage(IPUStage.predicting) + return self.inference_model(stage, *args, **kwargs) @property def on_gpu(self) -> bool: diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 6a585934e13fe..e8814435c0148 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -174,3 +174,6 @@ def test_dataloader(self): result = trainer.test(model, dm.test_dataloader()) saved_result = result[0]['test_acc'] assert saved_result > 0.6 and (saved_result == test_result) + + +# todo add test for precision 16 and fully half precision + device iterations From 83c8a79f63538b3c8e252f7625b5f4af63602b38 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 1 Jun 2021 15:30:30 +0100 Subject: [PATCH 11/60] Swap to special tests --- tests/accelerators/test_ipu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index e8814435c0148..c14e47e2cd7c6 100644 --- 
a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -87,7 +87,7 @@ def test_epoch_end(self, outputs) -> None: self.log('test_acc', torch.stack(outputs).mean()) -@RunIf(ipu=True) +@RunIf(ipu=True, special=True) @pytest.mark.parametrize('ipu_cores', [1, 4]) def test_all_stages(tmpdir, ipu_cores): model = IPUModel() @@ -98,7 +98,7 @@ def test_all_stages(tmpdir, ipu_cores): trainer.predict(model, model.val_dataloader()) -@RunIf(ipu=True) +@RunIf(ipu=True, special=True) @pytest.mark.parametrize('ipu_cores', [1, 4]) def test_inference_only(tmpdir, ipu_cores): model = IPUModel() From a6018e549d4d723adb01deaf6d58f1f34b6c5124 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 1 Jun 2021 15:56:48 +0100 Subject: [PATCH 12/60] Add special tests --- .azure-pipelines/ipu-tests.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.azure-pipelines/ipu-tests.yml b/.azure-pipelines/ipu-tests.yml index 763549e88200b..ffaf13dca9bd0 100644 --- a/.azure-pipelines/ipu-tests.yml +++ b/.azure-pipelines/ipu-tests.yml @@ -89,3 +89,9 @@ jobs: env: MKL_THREADING_LAYER: "GNU" displayName: 'Testing: standard' + + - bash: | + bash tests/special_tests.sh + env: + MKL_THREADING_LAYER: "GNU" + displayName: 'Testing: special' From 0e71bbef307f5b473454714f8e9d85c047bfb395 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 1 Jun 2021 16:32:52 +0100 Subject: [PATCH 13/60] Add source --- .azure-pipelines/ipu-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.azure-pipelines/ipu-tests.yml b/.azure-pipelines/ipu-tests.yml index ffaf13dca9bd0..5f45f96fb75d7 100644 --- a/.azure-pipelines/ipu-tests.yml +++ b/.azure-pipelines/ipu-tests.yml @@ -91,6 +91,9 @@ jobs: displayName: 'Testing: standard' - bash: | + source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh + source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh + bash tests/special_tests.sh env: MKL_THREADING_LAYER: "GNU" From 6e38bd178b75867df0767d631a9e8d84fcdd53c3 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 1 Jun 2021 21:54:37 +0100 Subject: [PATCH 14/60] Cleanups --- .../plugins/training_type/ipu.py | 75 ++++++------------- 1 file changed, 22 insertions(+), 53 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index fd2a701464918..018e0a2311350 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -1,7 +1,6 @@ import inspect import json import os -from enum import Enum from typing import Any, Iterable, List, Optional, Union import torch @@ -21,22 +20,6 @@ if _POPTORCH_AVAILABLE: import poptorch -# todo: Check gradient accumulation to ensure this works, similar to DeepSpeed, IPUs manage this. -# todo: Check lr scheduling to ensure that when the LR is changed, we update the optimizer state. - -# todo: does creating an inference model and a training model allocate double the IPU cores? -# todo: can we have one inference model for test/val/predict which takes a bool to choose a path? 
- - -class IPUStage(Enum): - training = torch.tensor([0]) - validation = torch.tensor([1]) - testing = torch.tensor([2]) - predicting = torch.tensor([3]) - - def __eq__(self, other): - return torch.equal(self.value, other) - class LightningIPUModule(_LightningModuleWrapperBase): @@ -44,23 +27,11 @@ def __init__(self, pl_module: LightningModule, precision: int): super().__init__(pl_module) self.precision = precision - def forward(self, stage, *inputs, **kwargs): + def forward(self, *inputs, **kwargs): if self.precision == 16: inputs = self._move_float_tensors_to_half(inputs) - trainer = self.module.trainer - if trainer and IPUStage.training == stage: - output = self.module.training_step(*inputs, **kwargs) - elif trainer and IPUStage.testing == stage: - output = self.module.test_step(*inputs, **kwargs) - elif trainer and IPUStage.validation == stage: - output = self.module.validation_step(*inputs, **kwargs) - elif trainer and IPUStage.predicting == stage: - output = self.module.predict_step(*inputs, **kwargs) - else: - output = self.module(*inputs, **kwargs) - - return output + return super().forward(*inputs, **kwargs) @staticmethod def batch_to(data): @@ -89,8 +60,7 @@ def __init__( self.autoround_num_ipus = autoround_num_ipus self.autoreport = autoreport self.autoreport_dir = autoreport_dir - self.train_model = None - self.inference_model = None + self.poptorch_models = {} if self.autoreport: options = {"autoReport.all": self.autoreport} @@ -116,22 +86,23 @@ def pre_dispatch(self) -> None: precision = self.lightning_module.trainer.accelerator.precision_plugin.precision precision = 16 if self.half else precision + model = LightningIPUModule(self.lightning_module, precision) + self.model = model + # Separate models are instantiated for different stages, but they share the same weights on host. # When validation/test models are run, they sync weights first. - model = LightningIPUModule(self.lightning_module, precision) - self.model = model if self.lightning_module.trainer.training: # Create model for training which will run training. 
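# --- editor's sketch, not part of the patch -----------------------------------
# A simplified picture of what pre_dispatch sets up after this cleanup: one
# host-side module wrapped into a poptorch training executor plus inference
# executors, all sharing the same host weights. The helper name is
# hypothetical, the options are left at their defaults, and poptorch is
# assumed to be installed.
import torch
import poptorch


def wrap_for_ipu(module: torch.nn.Module, optimizer: torch.optim.Optimizer) -> dict:
    executors = {
        'train': poptorch.trainingModel(model=module, options=poptorch.Options(), optimizer=optimizer),
    }
    for stage in ('val', 'test', 'predict'):
        # inference executors reuse the host weights and sync them before running
        executors[stage] = poptorch.inferenceModel(model=module, options=poptorch.Options())
    return executors
# -------------------------------------------------------------------------------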
optimizer = self.lightning_module.trainer.optimizers[0] - self.train_model = poptorch.trainingModel( - model=model, options=self._create_opts(training=True), optimizer=optimizer + model = poptorch.trainingModel(model=model, options=self._create_opts(training=True), optimizer=optimizer) + self.poptorch_models['train'] = model + for x in ('val', 'test', 'predict'): + model = poptorch.inferenceModel( + model=model, + options=self._create_opts(training=False), ) - - self.inference_model = poptorch.inferenceModel( - model=model, - options=self._create_opts(training=False), - ) + self.poptorch_models[x] = model @property def replication_factor(self): @@ -205,6 +176,7 @@ def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], @property def _n_replicate(self): + # Ensure we replicate primitives values to have enough dimensions to split across devices accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches return self.replication_factor * self.device_iterations * accumulate_grad_batches @@ -220,28 +192,21 @@ def to_tensor(x): args = apply_to_collection(args, dtype=(int, float), function=to_tensor) return args - def _prepare_stage(self, stage: IPUStage): - return stage.value.repeat(self._n_replicate) - def training_step(self, *args, **kwargs): args = self._prepare_input(args) - stage = self._prepare_stage(IPUStage.training) - return self.train_model(stage, *args, **kwargs) + return self.poptorch_models['train'](*args, **kwargs) def validation_step(self, *args, **kwargs): args = self._prepare_input(args) - stage = self._prepare_stage(IPUStage.validation) - return self.inference_model(stage, *args, **kwargs) + return self.poptorch_models['val'](*args, **kwargs) def test_step(self, *args, **kwargs): args = self._prepare_input(args) - stage = self._prepare_stage(IPUStage.testing) - return self.inference_model(stage, *args, **kwargs) + return self.poptorch_models['test'](*args, **kwargs) def predict_step(self, *args, **kwargs): args = self._prepare_input(args) - stage = self._prepare_stage(IPUStage.predicting) - return self.inference_model(stage, *args, **kwargs) + return self.poptorch_models['predict'](*args, **kwargs) @property def on_gpu(self) -> bool: @@ -269,3 +234,7 @@ def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_gra def broadcast(self, obj: object, src: int = 0) -> object: return obj + + def teardown(self) -> None: + for k, model in self.poptorch_models.items(): + model.destroy() From 526383fd8c7b0dd968839e2341dad96c1bd7b2d2 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 2 Jun 2021 12:08:06 +0100 Subject: [PATCH 15/60] Add logic to attach/detach model from devices --- pytorch_lightning/accelerators/accelerator.py | 48 ++++++++++++++----- .../plugins/training_type/ipu.py | 38 +++++++++++++++ .../training_type/training_type_plugin.py | 32 +++++++++++++ pytorch_lightning/trainer/evaluation_loop.py | 4 ++ pytorch_lightning/trainer/predict_loop.py | 2 + pytorch_lightning/trainer/training_loop.py | 1 + 6 files changed, 113 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 4ea017ae0c208..9c2b3dadf0961 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -179,10 +179,6 @@ def batch_to_device( return move_data_to_device(batch, device) - def on_train_start(self) -> None: - """Hook to do something upon the training start""" - pass - def training_step( self, step_kwargs: 
Dict[str, Union[Any, int]], @@ -348,14 +344,6 @@ def clip_gradients( model=self.model, ) - def on_train_epoch_end(self) -> None: - """Hook to do something on the end of an training epoch.""" - pass - - def on_train_end(self) -> None: - """Hook to do something at the end of the training""" - pass - def setup_optimizers(self, trainer: 'pl.Trainer') -> None: """ Creates optimizers and schedulers @@ -547,3 +535,39 @@ def setup_optimizers_in_pre_dispatch(self) -> bool: def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: return self.training_type_plugin.update_global_step(total_batch_idx, current_global_step) + + def on_train_epoch_end(self) -> None: + """Hook to do something on the end of an training epoch.""" + pass + + def on_train_start(self) -> None: + """Called when train begins.""" + return self.training_type_plugin.on_train_start() + + def on_validation_start(self) -> None: + """Called when validation begins.""" + return self.training_type_plugin.on_validation_start() + + def on_test_start(self) -> None: + """Called when test begins.""" + return self.training_type_plugin.on_test_start() + + def on_predict_start(self) -> None: + """Called when predict begins.""" + return self.training_type_plugin.on_predict_start() + + def on_validation_end(self) -> None: + """Called when validation ends.""" + return self.training_type_plugin.on_validation_end() + + def on_test_end(self) -> None: + """Called when test end.""" + return self.training_type_plugin.on_test_end() + + def on_predict_end(self) -> None: + """Called when predict ends.""" + return self.training_type_plugin.on_predict_end() + + def on_train_end(self) -> None: + """Called when train ends.""" + return self.training_type_plugin.on_train_end() diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 018e0a2311350..572dfdae0e492 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -238,3 +238,41 @@ def broadcast(self, obj: object, src: int = 0) -> object: def teardown(self) -> None: for k, model in self.poptorch_models.items(): model.destroy() + + def _compiled(self, model): + return model._executable is not None + + def detach_models(self): + for k, model in self.poptorch_models.items(): + if self._compiled(model) and model.isAttachedToDevice(): + model.detachFromDevice() + + def load_model(self, stage): + self.detach_models() + model = self.poptorch_models[stage] + if self._compiled(model): + model.attachToDevice() + + def on_train_start(self): + self.load_model('train') + + def on_validation_start(self): + self.load_model('val') + + def on_test_start(self): + self.load_model('test') + + def on_predict_start(self): + self.load_model('predict') + + def on_train_end(self): + self.detach_models() + + def on_validation_end(self): + self.detach_models() + + def on_test_end(self): + self.detach_models() + + def on_predict_end(self): + self.detach_models() diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index c80f00d345e8a..d35e02968e753 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -330,3 +330,35 @@ def register_plugins(cls, plugin_registry): def should_rank_save_checkpoint(self) -> bool: """Returns whether the checkpoint should be saved (rank based)""" return self.is_global_zero + + def 
on_train_start(self) -> None: + """Called when train begins.""" + pass + + def on_validation_start(self) -> None: + """Called when validation begins.""" + pass + + def on_test_start(self) -> None: + """Called when test begins.""" + pass + + def on_predict_start(self) -> None: + """Called when predict begins.""" + pass + + def on_train_end(self) -> None: + """Called when train ends.""" + pass + + def on_validation_end(self) -> None: + """Called when validation ends.""" + pass + + def on_test_end(self) -> None: + """Called when test end.""" + pass + + def on_predict_end(self): + """Called when predict ends.""" + pass diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index f048297892533..d6d2f1af48599 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -79,8 +79,10 @@ def on_evaluation_start(self, *args: Any, **kwargs: Any) -> None: self.should_track_batch_outputs_for_epoch_end: bool = self._should_track_batch_outputs_for_epoch_end() if self.trainer.testing: self.trainer.call_hook('on_test_start', *args, **kwargs) + self.trainer.accelerator.on_test_start() else: self.trainer.call_hook('on_validation_start', *args, **kwargs) + self.trainer.accelerator.on_validation_start() def on_evaluation_model_eval(self) -> None: model_ref = self.trainer.lightning_module @@ -99,8 +101,10 @@ def on_evaluation_model_train(self) -> None: def on_evaluation_end(self, *args: Any, **kwargs: Any) -> None: if self.trainer.testing: self.trainer.call_hook('on_test_end', *args, **kwargs) + self.trainer.accelerator.on_test_end() else: self.trainer.call_hook('on_validation_end', *args, **kwargs) + self.trainer.accelerator.on_validation_end() if self.trainer.state.fn != TrainerFn.FITTING: # summarize profile results diff --git a/pytorch_lightning/trainer/predict_loop.py b/pytorch_lightning/trainer/predict_loop.py index c06ced6662d81..25d4fd83d8cc5 100644 --- a/pytorch_lightning/trainer/predict_loop.py +++ b/pytorch_lightning/trainer/predict_loop.py @@ -141,6 +141,7 @@ def on_predict_start(self) -> None: # hook self.trainer.call_hook("on_predict_start") self.trainer.call_hook("on_predict_epoch_start") + self.trainer.accelerator.on_predict_start() def on_predict_epoch_end(self) -> Optional[_PREDICT_OUTPUT]: self.trainer.profiler.describe() @@ -162,3 +163,4 @@ def on_predict_end(self): # hook self.trainer.call_hook("on_predict_end") + self.trainer.accelerator.on_predict_end() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 62138790138ee..32aca773466a8 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -101,6 +101,7 @@ def should_skip_training(self) -> bool: def on_train_start(self): # hook self.trainer.call_hook("on_train_start") + self.trainer.accelerator.on_train_start() def on_train_end(self): if self._teardown_already_run: From e18039c1a2e06ee5ea8a4a412d469409dab419dc Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 2 Jun 2021 12:51:02 +0100 Subject: [PATCH 16/60] Fixes for tests --- pl_examples/ipu_examples/mnist.py | 7 +++---- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- pytorch_lightning/plugins/training_type/ipu.py | 10 +++++----- pytorch_lightning/utilities/device_dtype_mixin.py | 2 +- tests/callbacks/test_pruning.py | 2 +- tests/plugins/test_deepspeed_plugin.py | 3 ++- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pl_examples/ipu_examples/mnist.py 
b/pl_examples/ipu_examples/mnist.py index 980dd3430446c..aba24ccbeef34 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -19,6 +19,7 @@ import pytorch_lightning as pl from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule +from pytorch_lightning.plugins import IPUPlugin class LitClassifier(pl.LightningModule): @@ -80,9 +81,7 @@ def configure_optimizers(self): model = LitClassifier() - trainer = pl.Trainer(max_epochs=2, ipu_cores=8) + trainer = pl.Trainer(max_epochs=2, ipu_cores=8, plugins=IPUPlugin(device_iterations=1), profiler='simple') trainer.fit(model, datamodule=dm) - - result = trainer.test(model, datamodule=dm) - pprint(result) + trainer.test(model, datamodule=dm) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 8dd04aafa6b86..33e66423624b5 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -63,7 +63,7 @@ def forward(self, *inputs, **kwargs): @staticmethod def batch_to(data): - return data.half() + return data.convert_model_to_half() def _move_float_tensors_to_half(self, batch: Any): batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 572dfdae0e492..8cc43f22dd75f 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -46,16 +46,16 @@ class IPUPlugin(ParallelPlugin): def __init__( self, - half: bool = False, device_iterations: int = 1, autoround_num_ipus: bool = True, autoreport: bool = True, autoreport_dir: Optional[str] = None, + convert_model_to_half: bool = False, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, ): super().__init__(parallel_devices, cluster_environment) - self.half = half + self.convert_model_to_half = convert_model_to_half self.device_iterations = device_iterations self.autoround_num_ipus = autoround_num_ipus self.autoreport = autoreport @@ -80,11 +80,11 @@ def lightning_module(self) -> Optional[LightningModule]: return self.model.module if isinstance(self.model, LightningIPUModule) else self.model def pre_dispatch(self) -> None: - if self.half: + if self.convert_model_to_half: log.info('Using full 16bit precision, converting LightningModule weights to FP16.') self.model = self.model.half() precision = self.lightning_module.trainer.accelerator.precision_plugin.precision - precision = 16 if self.half else precision + precision = 16 if self.convert_model_to_half else precision model = LightningIPUModule(self.lightning_module, precision) self.model = model @@ -250,7 +250,7 @@ def detach_models(self): def load_model(self, stage): self.detach_models() model = self.poptorch_models[stage] - if self._compiled(model): + if self._compiled(model) and not model.isAttachedToDevice(): model.attachToDevice() def on_train_start(self): diff --git a/pytorch_lightning/utilities/device_dtype_mixin.py b/pytorch_lightning/utilities/device_dtype_mixin.py index 13f16d9b426ac..eeb44ed917faf 100644 --- a/pytorch_lightning/utilities/device_dtype_mixin.py +++ b/pytorch_lightning/utilities/device_dtype_mixin.py @@ -90,7 +90,7 @@ def to(self, *args, **kwargs) -> Module: >>> module.weight #doctest: +ELLIPSIS tensor([[...]], dtype=torch.float64) >>> cpu = torch.device('cpu') - >>> module.to(cpu, 
dtype=torch.half, non_blocking=True) + >>> module.to(cpu, dtype=torch.convert_model_to_half, non_blocking=True) ExampleModule() >>> module.weight #doctest: +ELLIPSIS tensor([[...]], dtype=torch.float16) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index d4957905454d8..f198b29d24e84 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -161,7 +161,7 @@ def test_pruning_callback( ) -@RunIf(special=True) +@RunIf(special=True, min_gpus=2) @pytest.mark.parametrize("parameters_to_prune", [False, True]) @pytest.mark.parametrize("use_global_unstructured", [False, True]) def test_pruning_callback_ddp(tmpdir, use_global_unstructured: bool, parameters_to_prune: bool): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 85d069b90288d..7c8753094ce0c 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -618,7 +618,8 @@ def _assert_save_model_is_equal(model, tmpdir, trainer, cls=BoringModel): if trainer.global_rank == 0: saved_model = cls.load_from_checkpoint(checkpoint_path) if model.dtype == torch.half: - saved_model = saved_model.half() # model is loaded in float32 as default, move it to float16 + saved_model = saved_model.convert_model_to_half( + ) # model is loaded in float32 as default, move it to float16 model = model.cpu() # Assert model parameters are identical after loading for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()): From 2e43fee1b1d360eea0efcfd60f720071ce82b40c Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 2 Jun 2021 14:06:46 +0100 Subject: [PATCH 17/60] Fixes for tests --- pl_examples/ipu_examples/mnist.py | 2 +- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- pytorch_lightning/utilities/device_dtype_mixin.py | 2 +- tests/plugins/test_deepspeed_plugin.py | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index aba24ccbeef34..6463b48f0c88b 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -81,7 +81,7 @@ def configure_optimizers(self): model = LitClassifier() - trainer = pl.Trainer(max_epochs=2, ipu_cores=8, plugins=IPUPlugin(device_iterations=1), profiler='simple') + trainer = pl.Trainer(max_epochs=2, ipu_cores=8) trainer.fit(model, datamodule=dm) trainer.test(model, datamodule=dm) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 33e66423624b5..8dd04aafa6b86 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -63,7 +63,7 @@ def forward(self, *inputs, **kwargs): @staticmethod def batch_to(data): - return data.convert_model_to_half() + return data.half() def _move_float_tensors_to_half(self, batch: Any): batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) diff --git a/pytorch_lightning/utilities/device_dtype_mixin.py b/pytorch_lightning/utilities/device_dtype_mixin.py index eeb44ed917faf..13f16d9b426ac 100644 --- a/pytorch_lightning/utilities/device_dtype_mixin.py +++ b/pytorch_lightning/utilities/device_dtype_mixin.py @@ -90,7 +90,7 @@ def to(self, *args, **kwargs) -> Module: >>> module.weight #doctest: +ELLIPSIS tensor([[...]], dtype=torch.float64) >>> cpu = torch.device('cpu') - >>> module.to(cpu, dtype=torch.convert_model_to_half, 
non_blocking=True) + >>> module.to(cpu, dtype=torch.half, non_blocking=True) ExampleModule() >>> module.weight #doctest: +ELLIPSIS tensor([[...]], dtype=torch.float16) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 7c8753094ce0c..85d069b90288d 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -618,8 +618,7 @@ def _assert_save_model_is_equal(model, tmpdir, trainer, cls=BoringModel): if trainer.global_rank == 0: saved_model = cls.load_from_checkpoint(checkpoint_path) if model.dtype == torch.half: - saved_model = saved_model.convert_model_to_half( - ) # model is loaded in float32 as default, move it to float16 + saved_model = saved_model.half() # model is loaded in float32 as default, move it to float16 model = model.cpu() # Assert model parameters are identical after loading for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()): From 53d31a0c7a9c283b7eb3f1b7f033c9d8c205136d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 2 Jun 2021 14:07:59 +0100 Subject: [PATCH 18/60] Move earlier --- pytorch_lightning/plugins/training_type/ipu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 8cc43f22dd75f..1e34c079ab998 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -70,8 +70,8 @@ def __init__( options["autoReport.directory"] = self.autoreport_dir os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options) - def setup(self, model: Module) -> None: - super().setup(model) + def setup_environment(self) -> None: + super().setup_environment() if not poptorch.ipuHardwareIsAvailable(): raise MisconfigurationException("IPU Accelerator requires IPUs to run.") From 62414323bab1a1d60b57512a8c629dd9099e25c0 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 2 Jun 2021 14:51:36 +0100 Subject: [PATCH 19/60] Cleanups --- pl_examples/ipu_examples/mnist.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index 6463b48f0c88b..c907f4a15af48 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pprint import pprint - import torch from torch.nn import functional as F import pytorch_lightning as pl from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule -from pytorch_lightning.plugins import IPUPlugin class LitClassifier(pl.LightningModule): @@ -60,7 +57,7 @@ def test_step(self, batch, batch_idx): return acc def accuracy(self, logits, y): - # todo (sean): currently IPU poptorch doesn't implicit convert bools to tensor + # currently IPU poptorch doesn't implicit convert bools to tensor # hence we use an explicit calculation for accuracy here. Once fixed in poptorch # we can use the accuracy metric. 
acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) From d249a131b06f8f5c36cd4c061bfb56202e7b1042 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 2 Jun 2021 16:49:26 +0100 Subject: [PATCH 20/60] Add check for nvcc --- tests/special_tests.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index cf81700291b8d..b6de1ca69ecef 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -68,7 +68,9 @@ for i in "${!files_arr[@]}"; do done < <(echo "$test_code") done -nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx +if nvcc --version; then + nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx +fi # echo test report printf '=%.s' {1..80} From d08cf39ac977dc6f79ef93579bbfe9e8593a82b2 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 2 Jun 2021 21:17:53 +0100 Subject: [PATCH 21/60] Add tests, cleanups --- pytorch_lightning/accelerators/accelerator.py | 13 ++++ .../plugins/training_type/ipu.py | 32 ++++++++- .../training_type/training_type_plugin.py | 13 ++++ pytorch_lightning/trainer/evaluation_loop.py | 4 -- pytorch_lightning/trainer/predict_loop.py | 2 - pytorch_lightning/trainer/trainer.py | 5 +- pytorch_lightning/trainer/training_loop.py | 9 +-- tests/accelerators/test_ipu.py | 67 +++++++++++++++++-- tests/plugins/test_sharded_plugin.py | 2 +- 9 files changed, 123 insertions(+), 24 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9c2b3dadf0961..e4a8b1ff4cb08 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -571,3 +571,16 @@ def on_predict_end(self) -> None: def on_train_end(self) -> None: """Called when train ends.""" return self.training_type_plugin.on_train_end() + + def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + """ + Called in the training loop before anything happens for that batch. + + If you return -1 here, you will skip training for the rest of the current epoch. + + Args: + batch: The batched data as it is returned by the training DataLoader. 
+ batch_idx: the index of the batch + dataloader_idx: the index of the dataloader + """ + return self.training_type_plugin.on_train_batch_start(batch, batch_idx, dataloader_idx) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 1e34c079ab998..499fe48bca359 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -4,10 +4,10 @@ from typing import Any, Iterable, List, Optional, Union import torch -from torch.nn import Module from torch.utils.data import DataLoader from pytorch_lightning import _logger as log +from pytorch_lightning.callbacks import GradientAccumulationScheduler from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment @@ -61,6 +61,7 @@ def __init__( self.autoreport = autoreport self.autoreport_dir = autoreport_dir self.poptorch_models = {} + self._original_accumulate_grad_batches = None if self.autoreport: options = {"autoReport.all": self.autoreport} @@ -80,6 +81,7 @@ def lightning_module(self) -> Optional[LightningModule]: return self.model.module if isinstance(self.model, LightningIPUModule) else self.model def pre_dispatch(self) -> None: + self._handle_gradient_accumulation_steps() if self.convert_model_to_half: log.info('Using full 16bit precision, converting LightningModule weights to FP16.') self.model = self.model.half() @@ -174,10 +176,28 @@ def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], dataloader.multiprocessing_context = multiprocessing_context return dataloader + def _handle_gradient_accumulation_steps(self): + """ + This functions overrides the trainer.accumulation_scheduler to generate + ``accumulate_grad_batches=1``. + Therefore, ``optimizer_step`` will be called on every batch, and the IPU will handle grad accumulation. + """ + self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + if self._original_accumulate_grad_batches > 1: + # todo (tchaton) Add support for accumulate_grad_batches being a dictionary. + self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1}) + + def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: + if self._original_accumulate_grad_batches > 1: + if total_batch_idx % self._original_accumulate_grad_batches == 0: + current_global_step += 1 + return current_global_step + return super().update_global_step(total_batch_idx, current_global_step) + @property def _n_replicate(self): - # Ensure we replicate primitives values to have enough dimensions to split across devices - accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + # Ensure we replicate values to have enough dimensions to split across devices + accumulate_grad_batches = self._original_accumulate_grad_batches return self.replication_factor * self.device_iterations * accumulate_grad_batches def _prepare_input(self, args): @@ -240,6 +260,7 @@ def teardown(self) -> None: model.destroy() def _compiled(self, model): + # Required to ensure we only attach compiled models, as they are compiled lazily. 
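# --- editor's sketch, not part of the patch -----------------------------------
# Because gradient accumulation is delegated to the IPU, the host performs an
# optimizer step on every batch; update_global_step above therefore only
# advances global_step once per original accumulation window. A standalone
# illustration (assumes the base plugin simply increments the step):
def host_global_step(total_batch_idx: int, current_global_step: int,
                     original_accumulate_grad_batches: int) -> int:
    if original_accumulate_grad_batches > 1:
        if total_batch_idx % original_accumulate_grad_batches == 0:
            current_global_step += 1
        return current_global_step
    return current_global_step + 1


assert host_global_step(total_batch_idx=4, current_global_step=1, original_accumulate_grad_batches=2) == 2
assert host_global_step(total_batch_idx=5, current_global_step=2, original_accumulate_grad_batches=2) == 2
# -------------------------------------------------------------------------------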
return model._executable is not None def detach_models(self): @@ -276,3 +297,8 @@ def on_test_end(self): def on_predict_end(self): self.detach_models() + + def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + # Update optimizer stats if LR scheduler modified the optimizer state + optimizer = self.lightning_module.trainer.optimizers[0] + self.poptorch_models['train'].setOptimizer(optimizer) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index d35e02968e753..e19413c8c664f 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -362,3 +362,16 @@ def on_test_end(self) -> None: def on_predict_end(self): """Called when predict ends.""" pass + + def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + """ + Called in the training loop before anything happens for that batch. + + If you return -1 here, you will skip training for the rest of the current epoch. + + Args: + batch: The batched data as it is returned by the training DataLoader. + batch_idx: the index of the batch + dataloader_idx: the index of the dataloader + """ + pass diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index d6d2f1af48599..f048297892533 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -79,10 +79,8 @@ def on_evaluation_start(self, *args: Any, **kwargs: Any) -> None: self.should_track_batch_outputs_for_epoch_end: bool = self._should_track_batch_outputs_for_epoch_end() if self.trainer.testing: self.trainer.call_hook('on_test_start', *args, **kwargs) - self.trainer.accelerator.on_test_start() else: self.trainer.call_hook('on_validation_start', *args, **kwargs) - self.trainer.accelerator.on_validation_start() def on_evaluation_model_eval(self) -> None: model_ref = self.trainer.lightning_module @@ -101,10 +99,8 @@ def on_evaluation_model_train(self) -> None: def on_evaluation_end(self, *args: Any, **kwargs: Any) -> None: if self.trainer.testing: self.trainer.call_hook('on_test_end', *args, **kwargs) - self.trainer.accelerator.on_test_end() else: self.trainer.call_hook('on_validation_end', *args, **kwargs) - self.trainer.accelerator.on_validation_end() if self.trainer.state.fn != TrainerFn.FITTING: # summarize profile results diff --git a/pytorch_lightning/trainer/predict_loop.py b/pytorch_lightning/trainer/predict_loop.py index 25d4fd83d8cc5..c06ced6662d81 100644 --- a/pytorch_lightning/trainer/predict_loop.py +++ b/pytorch_lightning/trainer/predict_loop.py @@ -141,7 +141,6 @@ def on_predict_start(self) -> None: # hook self.trainer.call_hook("on_predict_start") self.trainer.call_hook("on_predict_epoch_start") - self.trainer.accelerator.on_predict_start() def on_predict_epoch_end(self) -> Optional[_PREDICT_OUTPUT]: self.trainer.profiler.describe() @@ -163,4 +162,3 @@ def on_predict_end(self): # hook self.trainer.call_hook("on_predict_end") - self.trainer.accelerator.on_predict_end() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b559ed710f29e..3b4ca6b6e2f92 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1249,9 +1249,8 @@ def call_hook(self, hook_name: str, *args, **kwargs) -> Any: hook_fx = getattr(model_ref, hook_name) output = hook_fx(*args, **kwargs) - # 
if the PL module doesn't have the hook then call the accelerator - # used to auto-reduce things for the user with Results obj - elif hasattr(self.accelerator, hook_name): + # call hook in accelerator + if hasattr(self.accelerator, hook_name): accelerator_hook = getattr(self.accelerator, hook_name) output = accelerator_hook(*args, **kwargs) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 32aca773466a8..64c05b8be1547 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -101,7 +101,6 @@ def should_skip_training(self) -> bool: def on_train_start(self): # hook self.trainer.call_hook("on_train_start") - self.trainer.accelerator.on_train_start() def on_train_end(self): if self._teardown_already_run: @@ -126,9 +125,6 @@ def on_train_end(self): # summarize profile results self.trainer.profiler.describe() - # give accelerators a chance to finish - self.trainer.accelerator.on_train_end() - # reset bookkeeping self.trainer.state.stage = None @@ -631,9 +627,8 @@ def _on_train_epoch_end_hook(self, processed_epoch_output) -> None: else: model_ref.on_train_epoch_end() - # if the PL module doesn't have the hook then call the accelerator - # used to auto-reduce things for the user with Results obj - elif hasattr(self.trainer.accelerator, hook_name): + # call hook in accelerator + if hasattr(self.trainer.accelerator, hook_name): accelerator_hook = getattr(self.trainer.accelerator, hook_name) accelerator_hook() diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index c14e47e2cd7c6..f830f099a7d91 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os +from typing import Any, Optional import pytest import torch import torch.nn.functional as F from torch.utils.data import DataLoader -from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning import Callback, seed_everything, Trainer +from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule from tests.helpers.datasets import SklearnDataset @@ -91,7 +93,7 @@ def test_epoch_end(self, outputs) -> None: @pytest.mark.parametrize('ipu_cores', [1, 4]) def test_all_stages(tmpdir, ipu_cores): model = IPUModel() - trainer = Trainer(fast_dev_run=True, accelerator='ipu', ipu_cores=ipu_cores) + trainer = Trainer(fast_dev_run=True, ipu_cores=ipu_cores) trainer.fit(model) trainer.validate(model) trainer.test(model) @@ -103,7 +105,7 @@ def test_all_stages(tmpdir, ipu_cores): def test_inference_only(tmpdir, ipu_cores): model = IPUModel() - trainer = Trainer(fast_dev_run=True, accelerator='ipu', ipu_cores=ipu_cores) + trainer = Trainer(fast_dev_run=True, ipu_cores=ipu_cores) trainer.validate(model) trainer.test(model) trainer.predict(model, model.val_dataloader()) @@ -176,4 +178,61 @@ def test_dataloader(self): assert saved_result > 0.6 and (saved_result == test_result) -# todo add test for precision 16 and fully half precision + device iterations +@RunIf(ipu=True, special=True) +def test_mixed_precision(tmpdir): + + class TestCallback(Callback): + + def setup(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule', stage: Optional[str] = None) -> None: + assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 + assert trainer.accelerator.model.precision == 16 + raise SystemExit + + model = IPUModel() + trainer = Trainer(fast_dev_run=True, ipu_cores=1, precision=16, callbacks=TestCallback()) + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True, special=True) +def test_pure_half_precision(tmpdir): + + class TestCallback(Callback): + + def on_train_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None: + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 + assert trainer.accelerator.model.precision == 16 + assert trainer.accelerator.training_type_plugin.convert_model_of_to_half + for param in trainer.accelerator.model.parameters(): + assert param.dtype == torch.float16 + raise SystemExit + + model = IPUModel() + trainer = Trainer( + fast_dev_run=True, + ipu_cores=1, + precision=16, + plugins=IPUPlugin(convert_model_to_half=True), + callbacks=TestCallback() + ) + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True, special=True) +def test_device_iterations_ipu_plugin(tmpdir): + + class TestCallback(Callback): + + def setup(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule', stage: Optional[str] = None) -> None: + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.device_iterations == 20 + raise SystemExit + + model = IPUModel() + trainer = Trainer(fast_dev_run=True, ipu_cores=1, plugins=IPUPlugin(device_iterations=20), callbacks=TestCallback()) + with pytest.raises(SystemExit): + trainer.fit(model) diff --git a/tests/plugins/test_sharded_plugin.py 
b/tests/plugins/test_sharded_plugin.py index 7ab49e6826d58..3c89c71209191 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -51,7 +51,7 @@ def on_fit_start(self, trainer, pl_module): callbacks=[CB()], ) - with pytest.raises(SystemExit): + `with pytest.raises(SystemExit):` trainer.fit(model) From 7469744d765c836109780a62b6579ba9adb4815b Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 3 Jun 2021 10:16:10 +0100 Subject: [PATCH 22/60] Fix errors --- tests/accelerators/test_ipu.py | 30 +++++++++++++++++++++++++--- tests/plugins/test_sharded_plugin.py | 2 +- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index f830f099a7d91..adb377638ff5d 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from typing import Any, Optional +from typing import Optional import pytest import torch @@ -20,6 +20,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import Callback, seed_everything, Trainer +from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule @@ -200,7 +201,7 @@ def test_pure_half_precision(tmpdir): class TestCallback(Callback): - def on_train_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None: + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) assert trainer.accelerator.precision_plugin.precision == 16 @@ -227,12 +228,35 @@ def test_device_iterations_ipu_plugin(tmpdir): class TestCallback(Callback): - def setup(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule', stage: Optional[str] = None) -> None: + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) assert trainer.accelerator.training_type_plugin.device_iterations == 20 + # assert device iterations has been set correctly within the poptorch options + poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models['train'] + assert poptorch_model._options.toDict()['device_iterations'] == 20 raise SystemExit model = IPUModel() trainer = Trainer(fast_dev_run=True, ipu_cores=1, plugins=IPUPlugin(device_iterations=20), callbacks=TestCallback()) with pytest.raises(SystemExit): trainer.fit(model) + + +@RunIf(ipu=True, special=True) +def test_accumulated_batches(tmpdir): + + class TestCallback(Callback): + + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: + # ensure the accumulation_scheduler is overridden to accumulate every batch + # since ipu handle accumulation + assert trainer.accumulation_scheduler.scheduling == {0: 1} + # assert poptorch option have been set correctly + poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models['train'] + assert poptorch_model._options.Training.toDict()['gradient_accumulation'] == 2 + raise SystemExit + + model = IPUModel() + trainer = Trainer(fast_dev_run=True, ipu_cores=1, accumulate_grad_batches=2, callbacks=TestCallback()) + with pytest.raises(SystemExit): + 
trainer.fit(model) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 3c89c71209191..7ab49e6826d58 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -51,7 +51,7 @@ def on_fit_start(self, trainer, pl_module): callbacks=[CB()], ) - `with pytest.raises(SystemExit):` + with pytest.raises(SystemExit): trainer.fit(model) From f474c5bc1f453cc43845069ad892686b24337df8 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 3 Jun 2021 14:08:23 +0100 Subject: [PATCH 23/60] fix --- pytorch_lightning/trainer/trainer.py | 5 +++-- pytorch_lightning/trainer/training_loop.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 3b4ca6b6e2f92..ff1f7e7607572 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1249,10 +1249,11 @@ def call_hook(self, hook_name: str, *args, **kwargs) -> Any: hook_fx = getattr(model_ref, hook_name) output = hook_fx(*args, **kwargs) - # call hook in accelerator + # if the PL module doesn't have the hook then call the accelerator + # used to auto-reduce things for the user with Results obj if hasattr(self.accelerator, hook_name): accelerator_hook = getattr(self.accelerator, hook_name) - output = accelerator_hook(*args, **kwargs) + accelerator_hook(*args, **kwargs) if not skip: self._cache_logged_metrics() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 64c05b8be1547..8684401b706e2 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -627,7 +627,8 @@ def _on_train_epoch_end_hook(self, processed_epoch_output) -> None: else: model_ref.on_train_epoch_end() - # call hook in accelerator + # if the PL module doesn't have the hook then call the accelerator + # used to auto-reduce things for the user with Results obj if hasattr(self.trainer.accelerator, hook_name): accelerator_hook = getattr(self.trainer.accelerator, hook_name) accelerator_hook() From e178d5f2fe75da724c40327d0f8544ea95c796d6 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 3 Jun 2021 15:11:01 +0100 Subject: [PATCH 24/60] Try condition --- pytorch_lightning/trainer/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ff1f7e7607572..9bf227d9581c2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1253,7 +1253,9 @@ def call_hook(self, hook_name: str, *args, **kwargs) -> Any: # used to auto-reduce things for the user with Results obj if hasattr(self.accelerator, hook_name): accelerator_hook = getattr(self.accelerator, hook_name) - accelerator_hook(*args, **kwargs) + accelerator_output = accelerator_hook(*args, **kwargs) + if not output: + output = accelerator_output if not skip: self._cache_logged_metrics() From c70492082d5f0ec9ff66176c9a33895c76eb51da Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 3 Jun 2021 15:47:31 +0100 Subject: [PATCH 25/60] Add missing annotation --- tests/accelerators/test_ipu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index adb377638ff5d..43623f84a44fa 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -112,6 +112,7 @@ def test_inference_only(tmpdir, ipu_cores): 
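Aside: the fallback above uses a truthiness check (``if not output:``), which a later commit in this series tightens to ``if output is None:``. A toy illustration (hypothetical hook results, not from the patch) of why the distinction matters: a hook can legitimately return a falsy value such as ``0.0``, and the truthiness check would silently replace it with the accelerator's result.

    module_output = 0.0            # falsy but valid return value from the LightningModule hook
    accelerator_output = 123

    chosen = accelerator_output if not module_output else module_output
    assert chosen == 123           # truthiness check discards the module's 0.0

    chosen = accelerator_output if module_output is None else module_output
    assert chosen == 0.0           # the explicit None check keeps it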
trainer.predict(model, model.val_dataloader()) +@RunIf(ipu=True, special=True) def test_optimization(tmpdir): seed_everything(42) @@ -184,7 +185,7 @@ def test_mixed_precision(tmpdir): class TestCallback(Callback): - def setup(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule', stage: Optional[str] = None) -> None: + def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) assert trainer.accelerator.precision_plugin.precision == 16 assert trainer.accelerator.model.precision == 16 From c54a2166908c090ee6b20fa4090719bfba1c2dfc Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 3 Jun 2021 16:07:51 +0100 Subject: [PATCH 26/60] Clearer --- pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 9bf227d9581c2..b52e24fdeb3b3 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1254,7 +1254,7 @@ def call_hook(self, hook_name: str, *args, **kwargs) -> Any: if hasattr(self.accelerator, hook_name): accelerator_hook = getattr(self.accelerator, hook_name) accelerator_output = accelerator_hook(*args, **kwargs) - if not output: + if output is None: output = accelerator_output if not skip: From 2ea176655af058180130a0e241e17ba66732d999 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 3 Jun 2021 16:08:45 +0100 Subject: [PATCH 27/60] Clearer message --- pytorch_lightning/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b52e24fdeb3b3..fda7b015a38a2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1249,11 +1249,11 @@ def call_hook(self, hook_name: str, *args, **kwargs) -> Any: hook_fx = getattr(model_ref, hook_name) output = hook_fx(*args, **kwargs) - # if the PL module doesn't have the hook then call the accelerator - # used to auto-reduce things for the user with Results obj + # call the accelerator hook if hasattr(self.accelerator, hook_name): accelerator_hook = getattr(self.accelerator, hook_name) accelerator_output = accelerator_hook(*args, **kwargs) + # used to auto-reduce things for the user with Results obj if output is None: output = accelerator_output From 751f0ea4f20d866f047e0021886efd547310b9e6 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 3 Jun 2021 19:11:32 +0100 Subject: [PATCH 28/60] Fix variable --- tests/accelerators/test_ipu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 43623f84a44fa..0d5a6e89bb331 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -207,7 +207,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) assert trainer.accelerator.precision_plugin.precision == 16 assert trainer.accelerator.model.precision == 16 - assert trainer.accelerator.training_type_plugin.convert_model_of_to_half + assert trainer.accelerator.training_type_plugin.convert_model_to_half for param in trainer.accelerator.model.parameters(): assert param.dtype == torch.float16 raise SystemExit From 61d2014afaf096f14cbd8929292d2fa8a7479052 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 7 Jun 2021 12:37:00 +0100 
Subject: [PATCH 29/60] Cleanups --- pytorch_lightning/accelerators/ipu.py | 4 +- .../plugins/training_type/ipu.py | 105 +++++++++++------- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py index 8374bc1bc1554..34bee31b5a91d 100644 --- a/pytorch_lightning/accelerators/ipu.py +++ b/pytorch_lightning/accelerators/ipu.py @@ -15,13 +15,15 @@ from torch.optim import Optimizer +import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException class IPUAccelerator(Accelerator): + """ Accelerator for IPUs. """ - def setup_optimizers(self, trainer): + def setup_optimizers(self, trainer: 'pl.Trainer') -> None: super().setup_optimizers(trainer) if len(self.optimizers) > 1: diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 499fe48bca359..2527470a12166 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -43,6 +43,9 @@ def _move_float_tensors_to_half(self, batch: Any): class IPUPlugin(ParallelPlugin): + """ + Plugin for training on IPU devices. + """ def __init__( self, @@ -53,7 +56,19 @@ def __init__( convert_model_to_half: bool = False, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, - ): + ) -> None: + """ + Arguments: + + device_iterations: Number of iterations to run on device at once before returning to host. + This can be used as an optimization to speed up training. + https://docs.graphcore.ai/projects/poptorch-user-guide/en/0.1.67/batching.html + autoround_num_ipus: When selecting multiple IPUs, auto-rounds to powers of 2 as required for IPUs. + autoreport: Enable auto-reporting for IPUs using PopVision + https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html + autoreport_dir: Optional directory to store autoReport output. + convert_model_to_half: Converts the model to half precision, which can be used for pure FP16 training. + """ super().__init__(parallel_devices, cluster_environment) self.convert_model_to_half = convert_model_to_half self.device_iterations = device_iterations @@ -92,7 +107,7 @@ def pre_dispatch(self) -> None: self.model = model # Separate models are instantiated for different stages, but they share the same weights on host. - # When validation/test models are run, they sync weights first. + # When validation/test models are run, weights are synced first. if self.lightning_module.trainer.training: # Create model for training which will run training. 
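Aside: a hedged usage sketch of the options documented in the docstring above, using the Trainer flag names as they stand by the end of this series (``ipus`` replaces ``ipu_cores`` in PATCH 36); the values are illustrative only.

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import IPUPlugin

    trainer = Trainer(
        ipus=8,
        precision=16,
        plugins=IPUPlugin(
            device_iterations=4,            # run 4 iterations on device per host step
            autoreport=True,
            autoreport_dir="ipu_reports/",  # PopVision report output directory
            convert_model_to_half=True,     # pure FP16 weights
        ),
    )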
@@ -228,33 +243,6 @@ def predict_step(self, *args, **kwargs): args = self._prepare_input(args) return self.poptorch_models['predict'](*args, **kwargs) - @property - def on_gpu(self) -> bool: - return False - - @property - def root_device(self) -> torch.device: - pass - - def model_to_device(self) -> None: - pass - - @property - def is_global_zero(self) -> bool: - return True - - def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Any]: - return tensor - - def barrier(self, name: Optional[str] = None) -> None: - pass - - def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: - return tensor - - def broadcast(self, obj: object, src: int = 0) -> object: - return obj - def teardown(self) -> None: for k, model in self.poptorch_models.items(): model.destroy() @@ -263,42 +251,77 @@ def _compiled(self, model): # Required to ensure we only attach compiled models, as they are compiled lazily. return model._executable is not None - def detach_models(self): + def _detach_models(self): + """ + Detaches all stage specific models from IPU devices. + """ for k, model in self.poptorch_models.items(): if self._compiled(model) and model.isAttachedToDevice(): model.detachFromDevice() - def load_model(self, stage): - self.detach_models() + def _load_model(self, stage): + """ + Loads the stage specific accelerator model onto device if compiled and not attached to IPU devices. + Args: + stage: The stage to load + """ + self._detach_models() model = self.poptorch_models[stage] if self._compiled(model) and not model.isAttachedToDevice(): model.attachToDevice() def on_train_start(self): - self.load_model('train') + self._load_model('train') def on_validation_start(self): - self.load_model('val') + self._load_model('val') def on_test_start(self): - self.load_model('test') + self._load_model('test') def on_predict_start(self): - self.load_model('predict') + self._load_model('predict') def on_train_end(self): - self.detach_models() + self._detach_models() def on_validation_end(self): - self.detach_models() + self._detach_models() def on_test_end(self): - self.detach_models() + self._detach_models() def on_predict_end(self): - self.detach_models() + self._detach_models() def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: - # Update optimizer stats if LR scheduler modified the optimizer state + # Updates optimizer stats if LR scheduler modified the optimizer state optimizer = self.lightning_module.trainer.optimizers[0] self.poptorch_models['train'].setOptimizer(optimizer) + + @property + def on_gpu(self) -> bool: + return False + + @property + def root_device(self) -> torch.device: + pass + + def model_to_device(self) -> None: + pass + + @property + def is_global_zero(self) -> bool: + return True + + def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Any]: + return tensor + + def barrier(self, name: Optional[str] = None) -> None: + pass + + def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + return tensor + + def broadcast(self, obj: object, src: int = 0) -> object: + return obj From 62860ffa8c0e78292d7245dff1850e8336f857b9 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 7 Jun 2021 13:40:04 +0100 Subject: [PATCH 30/60] Add comment --- pl_examples/ipu_examples/mnist.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pl_examples/ipu_examples/mnist.py 
b/pl_examples/ipu_examples/mnist.py index c907f4a15af48..465b22c16d1f0 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -64,6 +64,8 @@ def accuracy(self, logits, y): return acc def validation_epoch_end(self, outputs) -> None: + # since the training step/validation step and test step are run on the IPU device + # we must log the average loss outside the step functions. self.log('val_acc', torch.stack(outputs).mean(), prog_bar=True) def test_epoch_end(self, outputs) -> None: From b5a50325a739f5b6e7ff33d0e49b02babe47c59c Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 7 Jun 2021 13:59:28 +0100 Subject: [PATCH 31/60] CHANGELOG.md --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bc8ffcf1d40e..d63afef4fdc61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added trainer stage hooks for Training Plugins and Accelerators ([#7864](https://github.com/PyTorchLightning/pytorch-lightning/pull/7864)) +- Added IPU Accelerator ([#7867](https://github.com/PyTorchLightning/pytorch-lightning/pull/7867)) + + ### Changed - Changed calling of `untoggle_optimizer(opt_idx)` out of the closure function ([#7563](https://github.com/PyTorchLightning/pytorch-lightning/pull/7563) From 72ed367c4ffd35de1af96bc6b5e5bad602df9a70 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 7 Jun 2021 14:07:14 +0100 Subject: [PATCH 32/60] Add simple selection test --- tests/accelerators/test_ipu.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 0d5a6e89bb331..e5aff0de0d015 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -20,6 +20,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import Callback, seed_everything, Trainer +from pytorch_lightning.accelerators import IPUAccelerator from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin from tests.helpers.boring_model import BoringModel @@ -90,6 +91,13 @@ def test_epoch_end(self, outputs) -> None: self.log('test_acc', torch.stack(outputs).mean()) +def test_accelerator_selected(tmpdir): + trainer = Trainer(ipu_cores=1) + assert isinstance(trainer.accelerator, IPUAccelerator) + trainer = Trainer(ipu_cores=1, accelerator='ipu') + assert isinstance(trainer.accelerator, IPUAccelerator) + + @RunIf(ipu=True, special=True) @pytest.mark.parametrize('ipu_cores', [1, 4]) def test_all_stages(tmpdir, ipu_cores): From 3fb031d180c3653002d276e0a1ebd24e9ce1cc09 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 7 Jun 2021 14:15:21 +0100 Subject: [PATCH 33/60] Remove special=True to see what happens --- tests/accelerators/test_ipu.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index e5aff0de0d015..c313e2ff29044 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -98,7 +98,7 @@ def test_accelerator_selected(tmpdir): assert isinstance(trainer.accelerator, IPUAccelerator) -@RunIf(ipu=True, special=True) +@RunIf(ipu=True) @pytest.mark.parametrize('ipu_cores', [1, 4]) def test_all_stages(tmpdir, ipu_cores): model = IPUModel() @@ -109,7 +109,7 @@ def test_all_stages(tmpdir, ipu_cores): trainer.predict(model, model.val_dataloader()) -@RunIf(ipu=True, special=True) +@RunIf(ipu=True) 
@pytest.mark.parametrize('ipu_cores', [1, 4]) def test_inference_only(tmpdir, ipu_cores): model = IPUModel() @@ -120,7 +120,7 @@ def test_inference_only(tmpdir, ipu_cores): trainer.predict(model, model.val_dataloader()) -@RunIf(ipu=True, special=True) +@RunIf(ipu=True) def test_optimization(tmpdir): seed_everything(42) @@ -188,7 +188,7 @@ def test_dataloader(self): assert saved_result > 0.6 and (saved_result == test_result) -@RunIf(ipu=True, special=True) +@RunIf(ipu=True) def test_mixed_precision(tmpdir): class TestCallback(Callback): @@ -205,7 +205,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st trainer.fit(model) -@RunIf(ipu=True, special=True) +@RunIf(ipu=True) def test_pure_half_precision(tmpdir): class TestCallback(Callback): @@ -232,7 +232,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: trainer.fit(model) -@RunIf(ipu=True, special=True) +@RunIf(ipu=True) def test_device_iterations_ipu_plugin(tmpdir): class TestCallback(Callback): @@ -251,7 +251,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: trainer.fit(model) -@RunIf(ipu=True, special=True) +@RunIf(ipu=True) def test_accumulated_batches(tmpdir): class TestCallback(Callback): From 515d4918cf263d298a55f6983f20e9b63ea1a141 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 7 Jun 2021 14:36:50 +0100 Subject: [PATCH 34/60] Fix test --- tests/accelerators/test_ipu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index c313e2ff29044..fbfd2b9c63780 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -239,14 +239,14 @@ class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) - assert trainer.accelerator.training_type_plugin.device_iterations == 20 + assert trainer.accelerator.training_type_plugin.device_iterations == 2 # assert device iterations has been set correctly within the poptorch options poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models['train'] - assert poptorch_model._options.toDict()['device_iterations'] == 20 + assert poptorch_model._options.toDict()['device_iterations'] == 2 raise SystemExit model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipu_cores=1, plugins=IPUPlugin(device_iterations=20), callbacks=TestCallback()) + trainer = Trainer(fast_dev_run=True, ipu_cores=1, plugins=IPUPlugin(device_iterations=2), callbacks=TestCallback()) with pytest.raises(SystemExit): trainer.fit(model) From ed168086179c63d5489f94ef9c08cebc36ee2f6b Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Mon, 7 Jun 2021 14:51:47 +0100 Subject: [PATCH 35/60] Update tests/accelerators/test_ipu.py Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- tests/accelerators/test_ipu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index fbfd2b9c63780..f9cff82097cad 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -91,6 +91,7 @@ def test_epoch_end(self, outputs) -> None: self.log('test_acc', torch.stack(outputs).mean()) +@RunIf(ipu=True) def test_accelerator_selected(tmpdir): trainer = Trainer(ipu_cores=1) assert isinstance(trainer.accelerator, IPUAccelerator) From 7f50295c45b605b0bddb89f10de573a83b7dda96 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 
7 Jun 2021 15:31:31 +0100 Subject: [PATCH 36/60] Convert ipu_cores -> ipus --- pl_examples/ipu_examples/mnist.py | 2 +- pytorch_lightning/trainer/trainer.py | 4 ++-- tests/accelerators/test_ipu.py | 26 +++++++++++++------------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index 465b22c16d1f0..32ae010fc2d2a 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -80,7 +80,7 @@ def configure_optimizers(self): model = LitClassifier() - trainer = pl.Trainer(max_epochs=2, ipu_cores=8) + trainer = pl.Trainer(max_epochs=2, ipus=8) trainer.fit(model, datamodule=dm) trainer.test(model, datamodule=dm) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0bdf0f73d0e0a..e7a255a65c33d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -107,7 +107,7 @@ def __init__( gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], str, int]] = None, - ipu_cores: Optional[int] = None, + ipus: Optional[int] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: Optional[int] = None, overfit_batches: Union[int, float] = 0.0, @@ -324,7 +324,7 @@ def __init__( self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = AcceleratorConnector( - num_processes, tpu_cores, ipu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, + num_processes, tpu_cores, ipus, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, benchmark, replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins ) self.logger_connector = LoggerConnector(self, log_gpu_memory) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index f9cff82097cad..be6bdfea9d11a 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -93,17 +93,17 @@ def test_epoch_end(self, outputs) -> None: @RunIf(ipu=True) def test_accelerator_selected(tmpdir): - trainer = Trainer(ipu_cores=1) + trainer = Trainer(ipus=1) assert isinstance(trainer.accelerator, IPUAccelerator) - trainer = Trainer(ipu_cores=1, accelerator='ipu') + trainer = Trainer(ipus=1, accelerator='ipu') assert isinstance(trainer.accelerator, IPUAccelerator) @RunIf(ipu=True) -@pytest.mark.parametrize('ipu_cores', [1, 4]) -def test_all_stages(tmpdir, ipu_cores): +@pytest.mark.parametrize('ipus', [1, 4]) +def test_all_stages(tmpdir, ipus): model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipu_cores=ipu_cores) + trainer = Trainer(fast_dev_run=True, ipus=ipus) trainer.fit(model) trainer.validate(model) trainer.test(model) @@ -111,11 +111,11 @@ def test_all_stages(tmpdir, ipu_cores): @RunIf(ipu=True) -@pytest.mark.parametrize('ipu_cores', [1, 4]) -def test_inference_only(tmpdir, ipu_cores): +@pytest.mark.parametrize('ipus', [1, 4]) +def test_inference_only(tmpdir, ipus): model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipu_cores=ipu_cores) + trainer = Trainer(fast_dev_run=True, ipus=ipus) trainer.validate(model) trainer.test(model) trainer.predict(model, model.val_dataloader()) @@ -157,7 +157,7 @@ def test_dataloader(self): max_epochs=1, weights_summary=None, deterministic=True, - ipu_cores=2, + ipus=2, ) # fit model @@ -201,7 +201,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st raise SystemExit model = IPUModel() - trainer = 
Trainer(fast_dev_run=True, ipu_cores=1, precision=16, callbacks=TestCallback()) + trainer = Trainer(fast_dev_run=True, ipus=1, precision=16, callbacks=TestCallback()) with pytest.raises(SystemExit): trainer.fit(model) @@ -224,7 +224,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: model = IPUModel() trainer = Trainer( fast_dev_run=True, - ipu_cores=1, + ipus=1, precision=16, plugins=IPUPlugin(convert_model_to_half=True), callbacks=TestCallback() @@ -247,7 +247,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: raise SystemExit model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipu_cores=1, plugins=IPUPlugin(device_iterations=2), callbacks=TestCallback()) + trainer = Trainer(fast_dev_run=True, ipus=1, plugins=IPUPlugin(device_iterations=2), callbacks=TestCallback()) with pytest.raises(SystemExit): trainer.fit(model) @@ -267,6 +267,6 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: raise SystemExit model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipu_cores=1, accumulate_grad_batches=2, callbacks=TestCallback()) + trainer = Trainer(fast_dev_run=True, ipus=1, accumulate_grad_batches=2, callbacks=TestCallback()) with pytest.raises(SystemExit): trainer.fit(model) From c53cf88de36121e55e9c95144fdf6f323b48c26e Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 7 Jun 2021 19:56:24 +0100 Subject: [PATCH 37/60] Add typing, fail earlier --- .../plugins/training_type/ipu.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 2527470a12166..e585f27df5fd9 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -34,10 +34,10 @@ def forward(self, *inputs, **kwargs): return super().forward(*inputs, **kwargs) @staticmethod - def batch_to(data): + def batch_to(data: torch.Tensor) -> torch.Tensor: return data.half() - def _move_float_tensors_to_half(self, batch: Any): + def _move_float_tensors_to_half(self, batch: Any) -> Any: batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) return batch @@ -70,6 +70,9 @@ def __init__( convert_model_to_half: Converts the model to half precision, which can be used for pure FP16 training. 
""" super().__init__(parallel_devices, cluster_environment) + if not poptorch.ipuHardwareIsAvailable(): + raise MisconfigurationException("IPU Accelerator requires IPUs to run.") + self.convert_model_to_half = convert_model_to_half self.device_iterations = device_iterations self.autoround_num_ipus = autoround_num_ipus @@ -86,11 +89,6 @@ def __init__( options["autoReport.directory"] = self.autoreport_dir os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options) - def setup_environment(self) -> None: - super().setup_environment() - if not poptorch.ipuHardwareIsAvailable(): - raise MisconfigurationException("IPU Accelerator requires IPUs to run.") - @property def lightning_module(self) -> Optional[LightningModule]: return self.model.module if isinstance(self.model, LightningIPUModule) else self.model @@ -125,7 +123,7 @@ def pre_dispatch(self) -> None: def replication_factor(self): return len(self.parallel_devices) - def _create_opts(self, training): + def _create_opts(self, training: bool): opts = poptorch.Options() opts.deviceIterations(self.device_iterations) opts.replicationFactor(self.replication_factor) @@ -138,16 +136,16 @@ def _create_opts(self, training): opts.randomSeed(int(os.environ["PL_GLOBAL_SEED"])) return opts - def on_reset_train_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: return self.process_dataloader(dataloader) - def on_reset_val_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: return self.process_dataloader(dataloader) - def on_reset_test_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: return self.process_dataloader(dataloader) - def on_reset_predict_dataloader(self, dataloader) -> Union[Iterable, DataLoader]: + def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: return self.process_dataloader(dataloader) def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: @@ -215,7 +213,7 @@ def _n_replicate(self): accumulate_grad_batches = self._original_accumulate_grad_batches return self.replication_factor * self.device_iterations * accumulate_grad_batches - def _prepare_input(self, args): + def _prepare_input(self, args: Any): def to_tuple(x): return tuple(x) @@ -247,7 +245,7 @@ def teardown(self) -> None: for k, model in self.poptorch_models.items(): model.destroy() - def _compiled(self, model): + def _compiled(self, model: Any): # Required to ensure we only attach compiled models, as they are compiled lazily. return model._executable is not None @@ -259,7 +257,7 @@ def _detach_models(self): if self._compiled(model) and model.isAttachedToDevice(): model.detachFromDevice() - def _load_model(self, stage): + def _load_model(self, stage: str): """ Loads the stage specific accelerator model onto device if compiled and not attached to IPU devices. 
Args: From a6dbd8a411349fac3cc68e15589115f42d2944f6 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 7 Jun 2021 19:58:28 +0100 Subject: [PATCH 38/60] simplify precision --- pytorch_lightning/plugins/training_type/ipu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index e585f27df5fd9..727a392ed6271 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -23,12 +23,12 @@ class LightningIPUModule(_LightningModuleWrapperBase): - def __init__(self, pl_module: LightningModule, precision: int): + def __init__(self, pl_module: LightningModule, precision: Union[str, int]): super().__init__(pl_module) self.precision = precision def forward(self, *inputs, **kwargs): - if self.precision == 16: + if self.precision in ("mixed", 16): inputs = self._move_float_tensors_to_half(inputs) return super().forward(*inputs, **kwargs) @@ -98,7 +98,7 @@ def pre_dispatch(self) -> None: if self.convert_model_to_half: log.info('Using full 16bit precision, converting LightningModule weights to FP16.') self.model = self.model.half() - precision = self.lightning_module.trainer.accelerator.precision_plugin.precision + precision = self.lightning_module.trainer.precision precision = 16 if self.convert_model_to_half else precision model = LightningIPUModule(self.lightning_module, precision) From 953454b0fba0ebf6a925a3e76f778b983c6fad76 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Jun 2021 12:41:28 +0100 Subject: [PATCH 39/60] Add test, add helper --- .../plugins/training_type/ipu.py | 5 +- tests/accelerators/test_ipu.py | 46 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 727a392ed6271..3347e897a0a74 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -71,7 +71,10 @@ def __init__( """ super().__init__(parallel_devices, cluster_environment) if not poptorch.ipuHardwareIsAvailable(): - raise MisconfigurationException("IPU Accelerator requires IPUs to run.") + raise MisconfigurationException( + "The IPU Accelerator requires IPU devices to run. " + "Learn more or get started with IPUs at https://www.graphcore.ai/getstarted" + ) self.convert_model_to_half = convert_model_to_half self.device_iterations = device_iterations diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index be6bdfea9d11a..cdfcf203b220d 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -270,3 +270,49 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: trainer = Trainer(fast_dev_run=True, ipus=1, accumulate_grad_batches=2, callbacks=TestCallback()) with pytest.raises(SystemExit): trainer.fit(model) + + +@RunIf(ipu=True) +def test_stages_correct(tmpdir): + """Ensure all stages correctly are traced correctly by asserting the output for each stage""" + + class StageModel(IPUModel): + + def training_step(self, batch, batch_idx): + loss = super().training_step(batch, batch_idx) + # tracing requires a loss value that depends on the model. + # force it to be a value but ensure we use the loss. 
+ return (loss - loss) + torch.tensor(1) + + def validation_step(self, batch, batch_idx): + loss = super().validation_step(batch, batch_idx) + return (loss - loss) + torch.tensor(2) + + def test_step(self, batch, batch_idx): + loss = super().validation_step(batch, batch_idx) + return (loss - loss) + torch.tensor(3) + + def predict_step(self, batch, batch_idx, dataloader_idx=None): + output = super().predict_step(batch, batch_idx) + return (output - output) + torch.tensor(4) + + class TestCallback(Callback): + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs['loss'].item() == 1 + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs.item() == 2 + + def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs.item() == 3 + + def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert torch.all(outputs == 4).item() + + model = StageModel() + trainer = Trainer(fast_dev_run=True, ipus=1, callbacks=TestCallback()) + trainer.fit(model) + trainer.test(model) + trainer.validate(model) + trainer.predict(model, model.test_dataloader()) From 24829bfb3ef554285b9cd19b1366be5d749c6a02 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Jun 2021 13:50:36 +0100 Subject: [PATCH 40/60] fix accum --- pytorch_lightning/plugins/training_type/ipu.py | 6 +++++- tests/accelerators/test_ipu.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 3347e897a0a74..37a6da94b495c 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -199,8 +199,12 @@ def _handle_gradient_accumulation_steps(self): Therefore, ``optimizer_step`` will be called on every batch, and the IPU will handle grad accumulation. """ self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + if not isinstance(self._original_accumulate_grad_batches, int): + raise MisconfigurationException( + f"IPUs currently only support accumulate_grad_batches being an integer value. " + f"Received {self._original_accumulate_grad_batches}" + ) if self._original_accumulate_grad_batches > 1: - # todo (tchaton) Add support for accumulate_grad_batches being a dictionary. 
self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1}) def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index cdfcf203b220d..5d75ea0afc46d 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -23,6 +23,7 @@ from pytorch_lightning.accelerators import IPUAccelerator from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule from tests.helpers.datasets import SklearnDataset @@ -316,3 +317,13 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da trainer.test(model) trainer.validate(model) trainer.predict(model, model.test_dataloader()) + + +@RunIf(ipu=True) +def test_accumulate_grad_batches_dict_fails(tmpdir): + model = IPUModel() + trainer = Trainer(ipus=1, accumulate_grad_batches={0: 1}) + with pytest.raises( + MisconfigurationException, match="IPUs currently only support accumulate_grad_batches being an integer value." + ): + trainer.fit(model) From d7d38c56d0726efece789fa7a5b8b93dda4a0898 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 8 Jun 2021 13:53:19 +0100 Subject: [PATCH 41/60] Update pytorch_lightning/plugins/training_type/ipu.py Co-authored-by: thomas chaton --- pytorch_lightning/plugins/training_type/ipu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 37a6da94b495c..40d25eb0caca1 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -249,7 +249,7 @@ def predict_step(self, *args, **kwargs): return self.poptorch_models['predict'](*args, **kwargs) def teardown(self) -> None: - for k, model in self.poptorch_models.items(): + for model in self.poptorch_models.values(): model.destroy() def _compiled(self, model: Any): From c333e2768447e88fd18ec18d7bbdf8c116ebd48d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Jun 2021 14:09:46 +0100 Subject: [PATCH 42/60] Use stages --- .../plugins/training_type/ipu.py | 25 ++++++++++--------- tests/accelerators/test_ipu.py | 5 ++-- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 40d25eb0caca1..857529163a325 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -12,6 +12,7 @@ from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import _POPTORCH_AVAILABLE from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -110,12 +111,12 @@ def pre_dispatch(self) -> None: # Separate models are instantiated for different stages, but they share the same weights on host. # When validation/test models are run, weights are synced first. 
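Aside: a worked example (illustrative numbers, not from the patch) of the bookkeeping above. Lightning's own accumulation scheduler is pinned to ``{0: 1}`` because poptorch performs gradient accumulation on device, while the user's original ``accumulate_grad_batches`` still feeds the replication arithmetic in ``_n_replicate``:

    accumulate_grad_batches = 2   # what the user passed to the Trainer
    device_iterations = 4
    replication_factor = 2        # e.g. ipus=2

    # micro-batches consumed from the poptorch DataLoader per host-side step
    n_replicate = replication_factor * device_iterations * accumulate_grad_batches
    assert n_replicate == 16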
- if self.lightning_module.trainer.training: + if self.lightning_module.trainer.state.stage is RunningStage.TRAINING: # Create model for training which will run training. optimizer = self.lightning_module.trainer.optimizers[0] model = poptorch.trainingModel(model=model, options=self._create_opts(training=True), optimizer=optimizer) - self.poptorch_models['train'] = model - for x in ('val', 'test', 'predict'): + self.poptorch_models[RunningStage.TRAINING] = model + for x in (RunningStage.VALIDATING, RunningStage.TESTING, RunningStage.PREDICTING): model = poptorch.inferenceModel( model=model, options=self._create_opts(training=False), @@ -234,19 +235,19 @@ def to_tensor(x): def training_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_models['train'](*args, **kwargs) + return self.poptorch_models[RunningStage.TRAINING](*args, **kwargs) def validation_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_models['val'](*args, **kwargs) + return self.poptorch_models[RunningStage.VALIDATING](*args, **kwargs) def test_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_models['test'](*args, **kwargs) + return self.poptorch_models[RunningStage.TESTING](*args, **kwargs) def predict_step(self, *args, **kwargs): args = self._prepare_input(args) - return self.poptorch_models['predict'](*args, **kwargs) + return self.poptorch_models[RunningStage.PREDICTING](*args, **kwargs) def teardown(self) -> None: for model in self.poptorch_models.values(): @@ -276,16 +277,16 @@ def _load_model(self, stage: str): model.attachToDevice() def on_train_start(self): - self._load_model('train') + self._load_model(RunningStage.TRAINING) def on_validation_start(self): - self._load_model('val') + self._load_model(RunningStage.VALIDATING) def on_test_start(self): - self._load_model('test') + self._load_model(RunningStage.TESTING) def on_predict_start(self): - self._load_model('predict') + self._load_model(RunningStage.PREDICTING) def on_train_end(self): self._detach_models() @@ -302,7 +303,7 @@ def on_predict_end(self): def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: # Updates optimizer stats if LR scheduler modified the optimizer state optimizer = self.lightning_module.trainer.optimizers[0] - self.poptorch_models['train'].setOptimizer(optimizer) + self.poptorch_models[RunningStage.TRAINING].setOptimizer(optimizer) @property def on_gpu(self) -> bool: diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 5d75ea0afc46d..37ee51ba3379f 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -23,6 +23,7 @@ from pytorch_lightning.accelerators import IPUAccelerator from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin +from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule @@ -243,7 +244,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) assert trainer.accelerator.training_type_plugin.device_iterations == 2 # assert device iterations has been set correctly within the poptorch options - poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models['train'] + 
poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING] assert poptorch_model._options.toDict()['device_iterations'] == 2 raise SystemExit @@ -263,7 +264,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: # since ipu handle accumulation assert trainer.accumulation_scheduler.scheduling == {0: 1} # assert poptorch option have been set correctly - poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models['train'] + poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING] assert poptorch_model._options.Training.toDict()['gradient_accumulation'] == 2 raise SystemExit From 9d3741a47ecd3b96c006acbd0c4d45c923ab918b Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Jun 2021 14:34:31 +0100 Subject: [PATCH 43/60] Make sure warning message returned --- pytorch_lightning/plugins/training_type/ipu.py | 2 +- tests/accelerators/test_ipu.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 857529163a325..6606eede7c47c 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -71,7 +71,7 @@ def __init__( convert_model_to_half: Converts the model to half precision, which can be used for pure FP16 training. """ super().__init__(parallel_devices, cluster_environment) - if not poptorch.ipuHardwareIsAvailable(): + if not _POPTORCH_AVAILABLE or not poptorch.ipuHardwareIsAvailable(): raise MisconfigurationException( "The IPU Accelerator requires IPU devices to run. " "Learn more or get started with IPUs at https://www.graphcore.ai/getstarted" diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 37ee51ba3379f..f50390039191e 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -93,6 +93,14 @@ def test_epoch_end(self, outputs) -> None: self.log('test_acc', torch.stack(outputs).mean()) +def test_fail_if_no_ipus(tmpdir): + with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): + Trainer(ipus=1) + + with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): + Trainer(ipus=1, accelerator='ipu') + + @RunIf(ipu=True) def test_accelerator_selected(tmpdir): trainer = Trainer(ipus=1) From fd1899a45570a3c0a901a568c5180f17fb25f698 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Jun 2021 14:47:38 +0100 Subject: [PATCH 44/60] thorw error --- .../plugins/precision/ipu_precision.py | 37 +++++++++++++++++-- tests/accelerators/test_ipu.py | 8 ++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/plugins/precision/ipu_precision.py b/pytorch_lightning/plugins/precision/ipu_precision.py index 4e88a6cf73fe1..b0da6e13e20f4 100644 --- a/pytorch_lightning/plugins/precision/ipu_precision.py +++ b/pytorch_lightning/plugins/precision/ipu_precision.py @@ -1,8 +1,25 @@ -from typing import Any +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
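Aside: a minimal sketch of the availability guard introduced in the commit above: ``poptorch`` is only touched when the import succeeded, so a machine without the SDK raises the intended ``MisconfigurationException`` rather than a ``NameError``.

    from pytorch_lightning.utilities import _POPTORCH_AVAILABLE

    if _POPTORCH_AVAILABLE:
        import poptorch

    # short-circuits before touching poptorch when the SDK is missing
    ipu_usable = _POPTORCH_AVAILABLE and poptorch.ipuHardwareIsAvailable()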
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Optional, Union from torch import Tensor +from torch.nn import Module +from torch.optim import Optimizer from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities import GradClipAlgorithmType +from pytorch_lightning.utilities.exceptions import MisconfigurationException class IPUPrecisionPlugin(PrecisionPlugin): @@ -20,5 +37,19 @@ def backward( # IPU internally manages bwd step. return closure_loss - def clip_gradients(self, *args, **kwargs) -> None: - pass + def clip_gradients( + self, + optimizer: Optimizer, + clip_val: Union[int, float], + gradient_clip_algorithm: GradClipAlgorithmType = GradClipAlgorithmType.NORM, + model: Optional[Module] = None + ) -> None: + """Clips the gradients""" + if clip_val is None: + return + + clip_val = float(clip_val) + if clip_val <= 0: + return + + raise MisconfigurationException("IPUs currently do not support clipping gradients.") diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index f50390039191e..4bb4a14a38957 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -336,3 +336,11 @@ def test_accumulate_grad_batches_dict_fails(tmpdir): MisconfigurationException, match="IPUs currently only support accumulate_grad_batches being an integer value." ): trainer.fit(model) + + +@RunIf(ipu=True) +def test_clip_gradients_fails(tmpdir): + model = IPUModel() + trainer = Trainer(ipus=1, gradient_clip_val=10) + with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."): + trainer.fit(model) From 07279546307bd6b5081e08e459ce1c0585579196 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Jun 2021 14:58:08 +0100 Subject: [PATCH 45/60] Add more tests, use fs --- pytorch_lightning/plugins/training_type/ipu.py | 7 +++++-- tests/accelerators/test_ipu.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 6606eede7c47c..dda3174b8f096 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -16,6 +16,7 @@ from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import _POPTORCH_AVAILABLE from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException if _POPTORCH_AVAILABLE: @@ -88,8 +89,10 @@ def __init__( if self.autoreport: options = {"autoReport.all": self.autoreport} if self.autoreport_dir: - if not os.path.exists(self.autoreport_dir): - os.makedirs(self.autoreport_dir) + self._fs = get_filesystem(str(self.autoreport_dir)) + + if not self._fs.exists(self.autoreport_dir): + self._fs.makedirs(self.autoreport_dir) options["autoReport.directory"] = self.autoreport_dir os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 
4bb4a14a38957..f67937c12e579 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -344,3 +344,14 @@ def test_clip_gradients_fails(tmpdir): trainer = Trainer(ipus=1, gradient_clip_val=10) with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."): trainer.fit(model) + + +@RunIf(ipu=True) +def test_autoreport(tmpdir): + """Ensure autoreport dumps to a file.""" + model = IPUModel() + autoreport_path = os.path.join(tmpdir, 'report/') + trainer = Trainer(ipus=1, fast_dev_run=True, plugins=IPUPlugin(autoreport=True, autoreport_dir=autoreport_path)) + trainer.fit(model) + assert os.path.exists(autoreport_path) + assert os.path.isfile(autoreport_path + 'profile.pop') From ce182f777592e20b985a5424fbdffc7635d68fc5 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Jun 2021 15:00:57 +0100 Subject: [PATCH 46/60] add comment --- pl_examples/ipu_examples/mnist.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index 32ae010fc2d2a..87087d67766b2 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -47,6 +47,9 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): x, y = batch logits = self(x) + # we currently return the accuracy as the validation_step/test_step is run on the IPU devices. + # Outputs from the step functions are sent to the host device, where we calculate the metrics in + # validation_epoch_end and test_epoch_end for the test_step. acc = self.accuracy(logits, y) return acc From 7e81bcd8fdad861fb7b0a5d162d2715c44679d42 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 8 Jun 2021 16:47:17 +0100 Subject: [PATCH 47/60] Clean --- .../plugins/training_type/ipu.py | 3 ++ tests/accelerators/test_ipu.py | 37 +++---------------- 2 files changed, 9 insertions(+), 31 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index dda3174b8f096..208ffb96d5c0c 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -192,6 +192,9 @@ def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], if not contains_dataset: dl_args.pop('dataset') + # Override to drop last uneven batch, as IPUs does not support uneven inputs. 
+ dl_args['drop_last'] = True + dataloader = poptorch.DataLoader(**dl_args, options=opts) dataloader.multiprocessing_context = multiprocessing_context return dataloader diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index f67937c12e579..f5db660ffeb20 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -17,17 +17,16 @@ import pytest import torch import torch.nn.functional as F -from torch.utils.data import DataLoader from pytorch_lightning import Callback, seed_everything, Trainer from pytorch_lightning.accelerators import IPUAccelerator from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.utilities import _IPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule -from tests.helpers.datasets import SklearnDataset from tests.helpers.runif import RunIf from tests.helpers.simple_models import ClassificationModel @@ -93,6 +92,7 @@ def test_epoch_end(self, outputs) -> None: self.log('test_acc', torch.stack(outputs).mean()) +@pytest.mark.skipif(_IPU_AVAILABLE, reason="test requires non-IPU machine") def test_fail_if_no_ipus(tmpdir): with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): Trainer(ipus=1) @@ -135,37 +135,12 @@ def test_inference_only(tmpdir, ipus): def test_optimization(tmpdir): seed_everything(42) - # Override to drop last uneven batch, as IPU poptorch does not support uneven inputs. - class DataModule(ClassifDataModule): - - def train_dataloader(self): - return DataLoader( - SklearnDataset(self.x_train, self.y_train, self._x_type, self._y_type), - batch_size=self.batch_size, - drop_last=True - ) - - def val_dataloader(self): - return DataLoader( - SklearnDataset(self.x_valid, self.y_valid, self._x_type, self._y_type), - batch_size=self.batch_size, - drop_last=True - ) - - def test_dataloader(self): - return DataLoader( - SklearnDataset(self.x_test, self.y_test, self._x_type, self._y_type), - batch_size=self.batch_size, - drop_last=True - ) - - dm = DataModule(length=1024) + dm = ClassifDataModule(length=1024) model = IPUClassificationModel() trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - weights_summary=None, deterministic=True, ipus=2, ) @@ -181,7 +156,7 @@ def test_dataloader(self): assert result[0]['val_acc'] > 0.7 # test - result = trainer.test(datamodule=dm) + result = trainer.test(model, datamodule=dm) assert dm.trainer is not None test_result = result[0]['test_acc'] assert test_result > 0.6 @@ -194,9 +169,9 @@ def test_dataloader(self): trainer = Trainer(default_root_dir=tmpdir, deterministic=True) - result = trainer.test(model, dm.test_dataloader()) + result = trainer.test(model, datamodule=dm) saved_result = result[0]['test_acc'] - assert saved_result > 0.6 and (saved_result == test_result) + assert saved_result == test_result @RunIf(ipu=True) From d1788d1a336ee4ad98b7ab766cf1b75f97b104fe Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Jun 2021 12:18:03 +0100 Subject: [PATCH 48/60] Address feedback, add IPU tests --- .../plugins/training_type/ipu.py | 64 ++++++-- tests/accelerators/test_ipu.py | 142 +++++++++++++++++- 2 files changed, 195 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py 
b/pytorch_lightning/plugins/training_type/ipu.py index 208ffb96d5c0c..a86e050391c57 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -14,7 +14,7 @@ from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.trainer.supporters import CombinedLoader -from pytorch_lightning.utilities import _POPTORCH_AVAILABLE +from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -58,6 +58,8 @@ def __init__( convert_model_to_half: bool = False, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, + training_opts: Optional['poptorch.Options'] = None, + inference_opts: Optional['poptorch.Options'] = None ) -> None: """ Arguments: @@ -70,6 +72,9 @@ def __init__( https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html autoreport_dir: Optional directory to store autoReport output. convert_model_to_half: Converts the model to half precision, which can be used for pure FP16 training. + training_opts: Optional ``poptorch.Options`` to override the default created options for training. + inference_opts: Optional ``poptorch.Options`` to override the default + created options for validation/testing and predicting. """ super().__init__(parallel_devices, cluster_environment) if not _POPTORCH_AVAILABLE or not poptorch.ipuHardwareIsAvailable(): @@ -85,6 +90,8 @@ def __init__( self.autoreport_dir = autoreport_dir self.poptorch_models = {} self._original_accumulate_grad_batches = None + self._training_opts = training_opts + self._inference_opts = inference_opts if self.autoreport: options = {"autoReport.all": self.autoreport} @@ -96,10 +103,6 @@ def __init__( options["autoReport.directory"] = self.autoreport_dir os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options) - @property - def lightning_module(self) -> Optional[LightningModule]: - return self.model.module if isinstance(self.model, LightningIPUModule) else self.model - def pre_dispatch(self) -> None: self._handle_gradient_accumulation_steps() if self.convert_model_to_half: @@ -117,12 +120,12 @@ def pre_dispatch(self) -> None: if self.lightning_module.trainer.state.stage is RunningStage.TRAINING: # Create model for training which will run training. optimizer = self.lightning_module.trainer.optimizers[0] - model = poptorch.trainingModel(model=model, options=self._create_opts(training=True), optimizer=optimizer) + model = poptorch.trainingModel(model=model, options=self.training_opts, optimizer=optimizer) self.poptorch_models[RunningStage.TRAINING] = model for x in (RunningStage.VALIDATING, RunningStage.TESTING, RunningStage.PREDICTING): model = poptorch.inferenceModel( model=model, - options=self._create_opts(training=False), + options=self.inference_opts, ) self.poptorch_models[x] = model @@ -138,11 +141,55 @@ def _create_opts(self, training: bool): opts.Training.gradientAccumulation(gradient_accumulation) opts.autoRoundNumIPUs(self.autoround_num_ipus) - # todo (sean): unsure if this is necessary but to be safe. 
if os.environ.get("PL_GLOBAL_SEED"): opts.randomSeed(int(os.environ["PL_GLOBAL_SEED"])) return opts + @property + def training_opts(self) -> 'poptorch.Options': + if self._training_opts is None: + self._training_opts = self._create_opts(training=True) + self._validate_opts(self._training_opts, training=True) + return self._training_opts + + @property + def inference_opts(self) -> 'poptorch.Options': + if self._inference_opts is None: + self._inference_opts = self._create_opts(training=False) + self._validate_opts(self._inference_opts, training=False) + return self._inference_opts + + def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None: + if opts is not None: + if opts.replication_factor != self.replication_factor: + rank_zero_warn( + f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} " + f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. " + f"Setting to {self.replication_factor} in the poptorch.Options.", UserWarning + ) + opts.set(replication_factor=self.replication_factor) + if not training: + if opts.Training.gradient_accumulation != 1: + rank_zero_warn( + "Inference poptorch.Options should set gradientAccumulation to 1. " + "Setting gradientAccumulation to 1 for inference options.", UserWarning + ) + opts.Training.set(gradient_accumulation=1) + else: + accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + if opts.Training.gradient_accumulation != self.lightning_module.trainer.accumulate_grad_batches: + rank_zero_warn( + f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. " + f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. " + f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. " + f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}", UserWarning + ) + opts.Training.set(gradient_accumulation=self.lightning_module.trainer.accumulate_grad_batches) + + @property + def lightning_module(self) -> Optional[LightningModule]: + return self.model.module if isinstance(self.model, LightningIPUModule) else self.model + def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: return self.process_dataloader(dataloader) @@ -191,7 +238,6 @@ def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], dl_args['multiprocessing_context'] = multiprocessing_context if not contains_dataset: dl_args.pop('dataset') - # Override to drop last uneven batch, as IPUs does not support uneven inputs. 
dl_args['drop_last'] = True diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index f5db660ffeb20..b1b5621e26d02 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -30,6 +30,9 @@ from tests.helpers.runif import RunIf from tests.helpers.simple_models import ClassificationModel +if _IPU_AVAILABLE: + import poptorch + class IPUModel(BoringModel): @@ -141,7 +144,6 @@ def test_optimization(tmpdir): trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - deterministic=True, ipus=2, ) @@ -167,7 +169,7 @@ def test_optimization(tmpdir): model = IPUClassificationModel.load_from_checkpoint(model_path) - trainer = Trainer(default_root_dir=tmpdir, deterministic=True) + trainer = Trainer(default_root_dir=tmpdir, ipus=2) result = trainer.test(model, datamodule=dm) saved_result = result[0]['test_acc'] @@ -330,3 +332,139 @@ def test_autoreport(tmpdir): trainer.fit(model) assert os.path.exists(autoreport_path) assert os.path.isfile(autoreport_path + 'profile.pop') + + +@RunIf(ipu=True) +def test_manual_poptorch_opts(tmpdir): + """Ensure if the user passes manual poptorch Options, we run with the correct object.""" + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.deviceIterations(20) + inference_opts.replicationFactor(1) + inference_opts.Training.gradientAccumulation(1) + + training_opts = poptorch.Options() + training_opts.deviceIterations(20) + training_opts.replicationFactor(1) + training_opts.Training.gradientAccumulation(1) + + trainer = Trainer( + ipus=1, fast_dev_run=True, plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.training_opts == training_opts + assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts + trainer.fit(model) + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_ipu_count(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and the number of ipus do not match, we warn and we set it for the user. + """ + + manual_ipus = 1 + expected_ipus = 2 + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.replicationFactor(manual_ipus) + + training_opts = poptorch.Options() + training_opts.replicationFactor(manual_ipus) + + trainer = Trainer( + ipus=expected_ipus, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} " + f"which differs to the ipus={expected_ipus} flag passed to the Trainer. " + f"Setting to {expected_ipus} in the poptorch.Options." + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2 + assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2 + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_inference_grad_accum(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and grad accumulation is set greater than 1 for inference, we warn and set to 1. 
+ """ + + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.Training.gradientAccumulation(4) + + training_opts = poptorch.Options() + training_opts.Training.gradientAccumulation(1) + + trainer = Trainer( + ipus=1, fast_dev_run=True, plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match="Inference poptorch.Options should set gradientAccumulation to 1. " + "Setting gradientAccumulation to 1 for inference options.", + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1 + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_train_grad_accum(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and grad accumulation differs to accumulate_grad_batches, we + """ + + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.Training.gradientAccumulation(1) + + training_opts = poptorch.Options() + training_opts.Training.gradientAccumulation(2) + + trainer = Trainer( + ipus=1, + fast_dev_run=True, + accumulate_grad_batches=1, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match=f"Training poptorch.Options set gradientAccumulation to {2}. " + f"This is different to accumulate_grad_batches which was set to {1}. " + f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. " + f"Setting poptorch.Options gradientAccumulation to {1}", + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1 + + +@RunIf(ipu=True) +def test_default_opts(tmpdir): + """ + Ensure default opts are set correctly in the IPUPlugin. + """ + + model = IPUModel() + + trainer = Trainer(ipus=1, fast_dev_run=True) + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + inference_opts = trainer.accelerator.training_type_plugin.inference_opts + training_opts = trainer.accelerator.training_type_plugin.training_opts + for opts in (inference_opts, training_opts): + assert isinstance(opts, poptorch.Options) + assert opts.Training.gradient_accumulation == 1 + assert opts.device_iterations == 1 + assert opts.replication_factor == 1 From 08e5338203208e9c93b151442df69d624ab5d40c Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Jun 2021 12:41:42 +0100 Subject: [PATCH 49/60] Fixes --- .../plugins/training_type/ipu.py | 19 +++++++++---------- tests/accelerators/test_ipu.py | 9 +-------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index a86e050391c57..d5e0a4a9900b0 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -168,23 +168,22 @@ def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None: f"Setting to {self.replication_factor} in the poptorch.Options.", UserWarning ) opts.set(replication_factor=self.replication_factor) - if not training: - if opts.Training.gradient_accumulation != 1: - rank_zero_warn( - "Inference poptorch.Options should set gradientAccumulation to 1. 
" - "Setting gradientAccumulation to 1 for inference options.", UserWarning - ) - opts.Training.set(gradient_accumulation=1) - else: + if training: accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches - if opts.Training.gradient_accumulation != self.lightning_module.trainer.accumulate_grad_batches: + if opts.Training.gradient_accumulation != accumulate_grad_batches: rank_zero_warn( f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. " f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. " f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. " f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}", UserWarning ) - opts.Training.set(gradient_accumulation=self.lightning_module.trainer.accumulate_grad_batches) + opts.Training.set(gradient_accumulation=accumulate_grad_batches) + elif opts.Training.gradient_accumulation != 1: + rank_zero_warn( + "Inference poptorch.Options should set gradientAccumulation to 1. " + "Setting gradientAccumulation to 1 for inference options.", UserWarning + ) + opts.Training.set(gradient_accumulation=1) @property def lightning_module(self) -> Optional[LightningModule]: diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index b1b5621e26d02..ededc1ac8439c 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -339,23 +339,16 @@ def test_manual_poptorch_opts(tmpdir): """Ensure if the user passes manual poptorch Options, we run with the correct object.""" model = IPUModel() inference_opts = poptorch.Options() - inference_opts.deviceIterations(20) - inference_opts.replicationFactor(1) - inference_opts.Training.gradientAccumulation(1) - training_opts = poptorch.Options() - training_opts.deviceIterations(20) - training_opts.replicationFactor(1) - training_opts.Training.gradientAccumulation(1) trainer = Trainer( ipus=1, fast_dev_run=True, plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) ) + trainer.fit(model) assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) assert trainer.accelerator.training_type_plugin.training_opts == training_opts assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts - trainer.fit(model) @RunIf(ipu=True) From 45dc6a66446e09e296e730770f1d1a11c68db03d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Jun 2021 12:45:14 +0100 Subject: [PATCH 50/60] Fix signature --- pytorch_lightning/plugins/precision/ipu_precision.py | 5 +++++ pytorch_lightning/plugins/training_type/ipu.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/precision/ipu_precision.py b/pytorch_lightning/plugins/precision/ipu_precision.py index b0da6e13e20f4..e6983966e166b 100644 --- a/pytorch_lightning/plugins/precision/ipu_precision.py +++ b/pytorch_lightning/plugins/precision/ipu_precision.py @@ -17,6 +17,7 @@ from torch.nn import Module from torch.optim import Optimizer +import pytorch_lightning as pl from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -30,7 +31,11 @@ def __init__(self, precision: int) -> None: def backward( self, + model: 'pl.LightningModule', closure_loss: Tensor, + optimizer: Optimizer, + opt_idx: int, + should_accumulate: bool, *args: Any, **kwargs: Any, ) -> 
Tensor: diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index d5e0a4a9900b0..75bd6ce092ed1 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -29,7 +29,7 @@ def __init__(self, pl_module: LightningModule, precision: Union[str, int]): super().__init__(pl_module) self.precision = precision - def forward(self, *inputs, **kwargs): + def forward(self, *inputs: Any, **kwargs: Any) -> Any: if self.precision in ("mixed", 16): inputs = self._move_float_tensors_to_half(inputs) From de040c633a3b4f54be3c1829bfcdcc6427b822f7 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Jun 2021 12:47:22 +0100 Subject: [PATCH 51/60] Add types --- pytorch_lightning/accelerators/ipu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py index 34bee31b5a91d..c9bee827af0e6 100644 --- a/pytorch_lightning/accelerators/ipu.py +++ b/pytorch_lightning/accelerators/ipu.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from collections import Callable +from typing import Any from torch.optim import Optimizer @@ -29,6 +30,6 @@ def setup_optimizers(self, trainer: 'pl.Trainer') -> None: if len(self.optimizers) > 1: raise MisconfigurationException("IPUs currently only support one optimizer.") - def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs): + def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None: # Optimizer step is handled by the IPU accelerator. lambda_closure() From 42d7ab08365699da2a8bbd3f59a4a1eeaec294f2 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Jun 2021 15:37:46 +0100 Subject: [PATCH 52/60] Remove autoround --- pytorch_lightning/plugins/training_type/ipu.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 75bd6ce092ed1..1d64636e5f43a 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -52,7 +52,6 @@ class IPUPlugin(ParallelPlugin): def __init__( self, device_iterations: int = 1, - autoround_num_ipus: bool = True, autoreport: bool = True, autoreport_dir: Optional[str] = None, convert_model_to_half: bool = False, @@ -67,7 +66,6 @@ def __init__( device_iterations: Number of iterations to run on device at once before returning to host. This can be used as an optimization to speed up training. https://docs.graphcore.ai/projects/poptorch-user-guide/en/0.1.67/batching.html - autoround_num_ipus: When selecting multiple IPUs, auto-rounds to powers of 2 as required for IPUs. autoreport: Enable auto-reporting for IPUs using PopVision https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html autoreport_dir: Optional directory to store autoReport output. 
@@ -85,7 +83,6 @@ def __init__( self.convert_model_to_half = convert_model_to_half self.device_iterations = device_iterations - self.autoround_num_ipus = autoround_num_ipus self.autoreport = autoreport self.autoreport_dir = autoreport_dir self.poptorch_models = {} @@ -139,7 +136,6 @@ def _create_opts(self, training: bool): opts.replicationFactor(self.replication_factor) gradient_accumulation = self.lightning_module.trainer.accumulate_grad_batches if training else 1 opts.Training.gradientAccumulation(gradient_accumulation) - opts.autoRoundNumIPUs(self.autoround_num_ipus) if os.environ.get("PL_GLOBAL_SEED"): opts.randomSeed(int(os.environ["PL_GLOBAL_SEED"])) From 36f36720c99474c2d2578b7b7e79d5dff0e2dd7c Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 9 Jun 2021 19:42:02 +0100 Subject: [PATCH 53/60] Add docstring --- pytorch_lightning/trainer/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index eb205004fe070..16db47e61dd5d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -284,6 +284,8 @@ def __init__( tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on [1] + ipus: How many IPUs to train on. + track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. truncated_bptt_steps: Deprecated in v1.3 to be removed in 1.5. From f9d61c52929f64d016cead4b5cd251f13e818c39 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 10 Jun 2021 21:53:28 +0100 Subject: [PATCH 54/60] ipu_cores -> ipus --- .../trainer/connectors/accelerator_connector.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 1d50a93b0b086..6b6c73ef327a8 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -83,7 +83,7 @@ def __init__( self, num_processes, tpu_cores, - ipu_cores, + ipus, distributed_backend, auto_select_gpus, gpus, @@ -103,7 +103,7 @@ def __init__( self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - self.ipu_cores = ipu_cores + self.ipus = ipus self.distributed_backend = distributed_backend self.auto_select_gpus = auto_select_gpus self.gpus = gpus @@ -256,7 +256,7 @@ def on_tpu(self) -> bool: @property def on_ipu(self) -> bool: - return self.ipu_cores is not None + return self.ipus is not None @property def tpu_id(self) -> Optional[int]: @@ -334,8 +334,8 @@ def parallel_devices(self) -> List[Union[torch.device, int]]: if isinstance(self.tpu_cores, int): devices = list(range(self.tpu_cores)) elif self.on_ipu: - if isinstance(self.ipu_cores, int): - devices = list(range(self.ipu_cores)) + if isinstance(self.ipus, int): + devices = list(range(self.ipus)) else: devices = [torch.device("cpu")] * self.num_processes return devices @@ -636,8 +636,8 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): num_tpu_cores = self.tpu_cores if self.tpu_cores is not None else 0 rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_tpu_cores} TPU cores') - num_ipu_cores = self.ipu_cores if self.ipu_cores is not None else 0 - rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: {num_ipu_cores} IPU cores') + num_ipus = self.ipus if self.ipus is not None else 0 + rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: 
{num_ipus} IPU cores') if torch.cuda.is_available() and self._device_type != DeviceType.GPU: rank_zero_warn( From cf48ff86f0050d353bc93d8df843f3856e8a45a9 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 11 Jun 2021 11:06:55 +0100 Subject: [PATCH 55/60] Add test, remove unnecessary precision set --- .../plugins/precision/ipu_precision.py | 4 ---- .../trainer/connectors/accelerator_connector.py | 2 +- tests/accelerators/test_ipu.py | 15 +++++++++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/plugins/precision/ipu_precision.py b/pytorch_lightning/plugins/precision/ipu_precision.py index e6983966e166b..21510f2914180 100644 --- a/pytorch_lightning/plugins/precision/ipu_precision.py +++ b/pytorch_lightning/plugins/precision/ipu_precision.py @@ -25,10 +25,6 @@ class IPUPrecisionPlugin(PrecisionPlugin): - def __init__(self, precision: int) -> None: - super().__init__() - self.precision = precision - def backward( self, model: 'pl.LightningModule', diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6b6c73ef327a8..feceb386b9aa9 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -369,7 +369,7 @@ def select_precision_plugin(self) -> PrecisionPlugin: self.amp_type = AMPType.from_str(self.amp_type) if self.on_ipu: - return IPUPrecisionPlugin(self.precision) + return IPUPrecisionPlugin() if self._distrib_type == DistributedType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): return DeepSpeedPrecisionPlugin(self.precision) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index ededc1ac8439c..4feb117ebf543 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -183,7 +183,6 @@ class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) - assert trainer.accelerator.precision_plugin.precision == 16 assert trainer.accelerator.model.precision == 16 raise SystemExit @@ -201,7 +200,6 @@ class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) - assert trainer.accelerator.precision_plugin.precision == 16 assert trainer.accelerator.model.precision == 16 assert trainer.accelerator.training_type_plugin.convert_model_to_half for param in trainer.accelerator.model.parameters(): @@ -461,3 +459,16 @@ def test_default_opts(tmpdir): assert opts.Training.gradient_accumulation == 1 assert opts.device_iterations == 1 assert opts.replication_factor == 1 + + +@RunIf(ipu=True) +def test_clip_val_fail(tmpdir): + """ + Ensure if clipping value is greater than 0 or not None, we throw an exception. 
+ """ + + model = IPUModel() + + trainer = Trainer(ipus=1, fast_dev_run=True, gradient_clip_val=10) + with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."): + trainer.fit(model) From 02a75b518c4a9b47946171ecf8c2bbab04b23d19 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 11 Jun 2021 11:10:29 +0100 Subject: [PATCH 56/60] Add optimizer test --- tests/accelerators/test_ipu.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 4feb117ebf543..3f26148767126 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -469,6 +469,24 @@ def test_clip_val_fail(tmpdir): model = IPUModel() - trainer = Trainer(ipus=1, fast_dev_run=True, gradient_clip_val=10) + trainer = Trainer(ipus=1, gradient_clip_val=10) with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."): trainer.fit(model) + + +@RunIf(ipu=True) +def test_multi_optimizers_fail(tmpdir): + """ + Ensure if there are multiple optimizers, we throw an exception + """ + + class TestModel(IPUModel): + + def configure_optimizers(self): + return [torch.optim.Adam(self.parameters()), torch.optim.Adam(self.parameters())] + + model = TestModel() + + trainer = Trainer(ipus=1) + with pytest.raises(MisconfigurationException, match="IPUs currently only support one optimizer."): + trainer.fit(model) From d18fc559f44e49a66b5fa6eba13aa75ada92e681 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 11 Jun 2021 12:58:18 +0100 Subject: [PATCH 57/60] Add precision back with test --- pytorch_lightning/plugins/precision/ipu_precision.py | 4 ++++ .../trainer/connectors/accelerator_connector.py | 2 +- tests/accelerators/test_ipu.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/precision/ipu_precision.py b/pytorch_lightning/plugins/precision/ipu_precision.py index 21510f2914180..e6983966e166b 100644 --- a/pytorch_lightning/plugins/precision/ipu_precision.py +++ b/pytorch_lightning/plugins/precision/ipu_precision.py @@ -25,6 +25,10 @@ class IPUPrecisionPlugin(PrecisionPlugin): + def __init__(self, precision: int) -> None: + super().__init__() + self.precision = precision + def backward( self, model: 'pl.LightningModule', diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index feceb386b9aa9..6b6c73ef327a8 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -369,7 +369,7 @@ def select_precision_plugin(self) -> PrecisionPlugin: self.amp_type = AMPType.from_str(self.amp_type) if self.on_ipu: - return IPUPrecisionPlugin() + return IPUPrecisionPlugin(self.precision) if self._distrib_type == DistributedType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): return DeepSpeedPrecisionPlugin(self.precision) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 3f26148767126..67a70b1af8999 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -183,6 +183,7 @@ class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 assert 
trainer.accelerator.model.precision == 16 raise SystemExit @@ -200,6 +201,7 @@ class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 assert trainer.accelerator.model.precision == 16 assert trainer.accelerator.training_type_plugin.convert_model_to_half for param in trainer.accelerator.model.parameters(): @@ -490,3 +492,13 @@ def configure_optimizers(self): trainer = Trainer(ipus=1) with pytest.raises(MisconfigurationException, match="IPUs currently only support one optimizer."): trainer.fit(model) + + +@RunIf(ipu=True) +def test_precision_plugin(tmpdir): + """ + Ensure precision plugin value is set correctly. + """ + + plugin = IPUPrecisionPlugin(precision=16) + assert plugin.precision == 16 From 043884af85baad959545048f0bb29c4d1abaa147 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 11 Jun 2021 15:17:15 +0100 Subject: [PATCH 58/60] Address code review --- .../plugins/training_type/ipu.py | 23 ++++-- .../connectors/accelerator_connector.py | 2 +- tests/accelerators/test_ipu.py | 71 +++++++++++-------- 3 files changed, 58 insertions(+), 38 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 1d64636e5f43a..9cacbed585aa9 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import inspect import json import os @@ -94,9 +107,7 @@ def __init__( options = {"autoReport.all": self.autoreport} if self.autoreport_dir: self._fs = get_filesystem(str(self.autoreport_dir)) - - if not self._fs.exists(self.autoreport_dir): - self._fs.makedirs(self.autoreport_dir) + self._fs.makedirs(self.autoreport_dir, exist_ok=True) options["autoReport.directory"] = self.autoreport_dir os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options) @@ -161,7 +172,7 @@ def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None: rank_zero_warn( f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} " f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. " - f"Setting to {self.replication_factor} in the poptorch.Options.", UserWarning + f"Setting to {self.replication_factor} in the poptorch.Options." ) opts.set(replication_factor=self.replication_factor) if training: @@ -171,13 +182,13 @@ def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None: f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. " f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. " f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. 
" - f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}", UserWarning + f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}" ) opts.Training.set(gradient_accumulation=accumulate_grad_batches) elif opts.Training.gradient_accumulation != 1: rank_zero_warn( "Inference poptorch.Options should set gradientAccumulation to 1. " - "Setting gradientAccumulation to 1 for inference options.", UserWarning + "Setting gradientAccumulation to 1 for inference options." ) opts.Training.set(gradient_accumulation=1) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6b6c73ef327a8..8f5de9a6302aa 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -637,7 +637,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_tpu_cores} TPU cores') num_ipus = self.ipus if self.ipus is not None else 0 - rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPU cores') + rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs') if torch.cuda.is_available() and self._device_type != DeviceType.GPU: rank_zero_warn( diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 67a70b1af8999..bf5f0ff6ca125 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -98,17 +98,17 @@ def test_epoch_end(self, outputs) -> None: @pytest.mark.skipif(_IPU_AVAILABLE, reason="test requires non-IPU machine") def test_fail_if_no_ipus(tmpdir): with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): - Trainer(ipus=1) + Trainer(default_root_dir=tmpdir, ipus=1) with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): - Trainer(ipus=1, accelerator='ipu') + Trainer(default_root_dir=tmpdir, ipus=1, accelerator='ipu') @RunIf(ipu=True) def test_accelerator_selected(tmpdir): - trainer = Trainer(ipus=1) + trainer = Trainer(default_root_dir=tmpdir, ipus=1) assert isinstance(trainer.accelerator, IPUAccelerator) - trainer = Trainer(ipus=1, accelerator='ipu') + trainer = Trainer(default_root_dir=tmpdir, ipus=1, accelerator='ipu') assert isinstance(trainer.accelerator, IPUAccelerator) @@ -116,7 +116,7 @@ def test_accelerator_selected(tmpdir): @pytest.mark.parametrize('ipus', [1, 4]) def test_all_stages(tmpdir, ipus): model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipus=ipus) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=ipus) trainer.fit(model) trainer.validate(model) trainer.test(model) @@ -128,7 +128,7 @@ def test_all_stages(tmpdir, ipus): def test_inference_only(tmpdir, ipus): model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipus=ipus) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=ipus) trainer.validate(model) trainer.test(model) trainer.predict(model, model.val_dataloader()) @@ -188,7 +188,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st raise SystemExit model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipus=1, precision=16, callbacks=TestCallback()) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, precision=16, callbacks=TestCallback()) with pytest.raises(SystemExit): trainer.fit(model) @@ -210,6 +210,7 @@ def on_train_start(self, trainer: 
Trainer, pl_module: LightningModule) -> None: model = IPUModel() trainer = Trainer( + default_root_dir=tmpdir, fast_dev_run=True, ipus=1, precision=16, @@ -234,7 +235,13 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: raise SystemExit model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipus=1, plugins=IPUPlugin(device_iterations=2), callbacks=TestCallback()) + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + ipus=1, + plugins=IPUPlugin(device_iterations=2), + callbacks=TestCallback() + ) with pytest.raises(SystemExit): trainer.fit(model) @@ -254,7 +261,9 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: raise SystemExit model = IPUModel() - trainer = Trainer(fast_dev_run=True, ipus=1, accumulate_grad_batches=2, callbacks=TestCallback()) + trainer = Trainer( + default_root_dir=tmpdir, fast_dev_run=True, ipus=1, accumulate_grad_batches=2, callbacks=TestCallback() + ) with pytest.raises(SystemExit): trainer.fit(model) @@ -298,7 +307,7 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da assert torch.all(outputs == 4).item() model = StageModel() - trainer = Trainer(fast_dev_run=True, ipus=1, callbacks=TestCallback()) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, callbacks=TestCallback()) trainer.fit(model) trainer.test(model) trainer.validate(model) @@ -308,7 +317,7 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da @RunIf(ipu=True) def test_accumulate_grad_batches_dict_fails(tmpdir): model = IPUModel() - trainer = Trainer(ipus=1, accumulate_grad_batches={0: 1}) + trainer = Trainer(default_root_dir=tmpdir, ipus=1, accumulate_grad_batches={0: 1}) with pytest.raises( MisconfigurationException, match="IPUs currently only support accumulate_grad_batches being an integer value." 
): @@ -318,7 +327,7 @@ def test_accumulate_grad_batches_dict_fails(tmpdir): @RunIf(ipu=True) def test_clip_gradients_fails(tmpdir): model = IPUModel() - trainer = Trainer(ipus=1, gradient_clip_val=10) + trainer = Trainer(default_root_dir=tmpdir, ipus=1, gradient_clip_val=10) with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."): trainer.fit(model) @@ -328,7 +337,12 @@ def test_autoreport(tmpdir): """Ensure autoreport dumps to a file.""" model = IPUModel() autoreport_path = os.path.join(tmpdir, 'report/') - trainer = Trainer(ipus=1, fast_dev_run=True, plugins=IPUPlugin(autoreport=True, autoreport_dir=autoreport_path)) + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(autoreport=True, autoreport_dir=autoreport_path) + ) trainer.fit(model) assert os.path.exists(autoreport_path) assert os.path.isfile(autoreport_path + 'profile.pop') @@ -342,7 +356,10 @@ def test_manual_poptorch_opts(tmpdir): training_opts = poptorch.Options() trainer = Trainer( - ipus=1, fast_dev_run=True, plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) ) trainer.fit(model) @@ -368,6 +385,7 @@ def test_manual_poptorch_opts_ipu_count(tmpdir): training_opts.replicationFactor(manual_ipus) trainer = Trainer( + default_root_dir=tmpdir, ipus=expected_ipus, fast_dev_run=True, plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) @@ -399,7 +417,10 @@ def test_manual_poptorch_opts_inference_grad_accum(tmpdir): training_opts.Training.gradientAccumulation(1) trainer = Trainer( - ipus=1, fast_dev_run=True, plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) ) with pytest.warns( UserWarning, @@ -426,6 +447,7 @@ def test_manual_poptorch_opts_train_grad_accum(tmpdir): training_opts.Training.gradientAccumulation(2) trainer = Trainer( + default_root_dir=tmpdir, ipus=1, fast_dev_run=True, accumulate_grad_batches=1, @@ -451,7 +473,7 @@ def test_default_opts(tmpdir): model = IPUModel() - trainer = Trainer(ipus=1, fast_dev_run=True) + trainer = Trainer(default_root_dir=tmpdir, ipus=1, fast_dev_run=True) trainer.fit(model) assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) inference_opts = trainer.accelerator.training_type_plugin.inference_opts @@ -464,20 +486,7 @@ def test_default_opts(tmpdir): @RunIf(ipu=True) -def test_clip_val_fail(tmpdir): - """ - Ensure if clipping value is greater than 0 or not None, we throw an exception. 
- """ - - model = IPUModel() - - trainer = Trainer(ipus=1, gradient_clip_val=10) - with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."): - trainer.fit(model) - - -@RunIf(ipu=True) -def test_multi_optimizers_fail(tmpdir): +def test_multi_optimizers_fails(tmpdir): """ Ensure if there are multiple optimizers, we throw an exception """ @@ -489,7 +498,7 @@ def configure_optimizers(self): model = TestModel() - trainer = Trainer(ipus=1) + trainer = Trainer(default_root_dir=tmpdir, ipus=1) with pytest.raises(MisconfigurationException, match="IPUs currently only support one optimizer."): trainer.fit(model) From b2493913ec965f190d51e1f9316e128774bbe91a Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 11 Jun 2021 15:24:13 +0100 Subject: [PATCH 59/60] Change to probs --- pl_examples/ipu_examples/mnist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py index 87087d67766b2..37cb63c076e2e 100644 --- a/pl_examples/ipu_examples/mnist.py +++ b/pl_examples/ipu_examples/mnist.py @@ -46,11 +46,11 @@ def training_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx): x, y = batch - logits = self(x) + probs = self(x) # we currently return the accuracy as the validation_step/test_step is run on the IPU devices. # Outputs from the step functions are sent to the host device, where we calculate the metrics in # validation_epoch_end and test_epoch_end for the test_step. - acc = self.accuracy(logits, y) + acc = self.accuracy(probs, y) return acc def test_step(self, batch, batch_idx): From b0dd20609b7e0ed3ea12f63e9a68a32509bcd3cb Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 11 Jun 2021 15:26:43 +0100 Subject: [PATCH 60/60] Move some of the asserts earlier --- tests/accelerators/test_ipu.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index bf5f0ff6ca125..52496e28b2230 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -182,13 +182,13 @@ def test_mixed_precision(tmpdir): class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: - assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) - assert trainer.accelerator.precision_plugin.precision == 16 assert trainer.accelerator.model.precision == 16 raise SystemExit model = IPUModel() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, precision=16, callbacks=TestCallback()) + assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 with pytest.raises(SystemExit): trainer.fit(model) @@ -199,9 +199,6 @@ def test_pure_half_precision(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: - assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) - assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) - assert trainer.accelerator.precision_plugin.precision == 16 assert trainer.accelerator.model.precision == 16 assert trainer.accelerator.training_type_plugin.convert_model_to_half for param in trainer.accelerator.model.parameters(): @@ -217,6 +214,11 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: plugins=IPUPlugin(convert_model_to_half=True), 
         callbacks=TestCallback()
     )
+
+    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
+    assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin)
+    assert trainer.accelerator.precision_plugin.precision == 16
+
     with pytest.raises(SystemExit):
         trainer.fit(model)
@@ -227,7 +229,6 @@ def test_device_iterations_ipu_plugin(tmpdir):
     class TestCallback(Callback):

         def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
-            assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
             assert trainer.accelerator.training_type_plugin.device_iterations == 2
             # assert device iterations has been set correctly within the poptorch options
             poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING]
             raise SystemExit
@@ -242,6 +243,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
         plugins=IPUPlugin(device_iterations=2),
         callbacks=TestCallback()
     )
+    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
     with pytest.raises(SystemExit):
         trainer.fit(model)
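
Taken together, the patches above add an `ipus=` Trainer flag, an `IPUPlugin` accepting optional `training_opts`/`inference_opts`, and an `IPUPrecisionPlugin`. A minimal usage sketch of how these pieces are meant to be combined from user code, assuming poptorch and IPU hardware are available and that `MyLightningModule` stands in for a user-defined LightningModule (it is a placeholder, not part of these patches):

import poptorch

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import IPUPlugin

# Optional poptorch.Options; if omitted, IPUPlugin builds defaults in _create_opts().
training_opts = poptorch.Options()
training_opts.deviceIterations(2)
training_opts.replicationFactor(1)  # should match the ipus= flag below
training_opts.Training.gradientAccumulation(1)  # should match accumulate_grad_batches

inference_opts = poptorch.Options()
inference_opts.deviceIterations(2)  # inference keeps gradientAccumulation at 1

model = MyLightningModule()  # placeholder for a user-defined LightningModule
trainer = Trainer(
    ipus=1,  # formerly ipu_cores; selects the IPUAccelerator and IPUPlugin
    precision=16,  # handled by IPUPrecisionPlugin
    accumulate_grad_batches=1,
    plugins=IPUPlugin(training_opts=training_opts, inference_opts=inference_opts),
)
trainer.fit(model)

If the supplied options disagree with the Trainer flags (replication factor vs. `ipus=`, training gradient accumulation vs. `accumulate_grad_batches`, or inference gradient accumulation other than 1), the `_validate_opts` checks added above warn and rewrite the options to match the Trainer configuration.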