diff --git a/CHANGELOG.md b/CHANGELOG.md index 54f9ad91db2b1..517f4d67d4233 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,6 +68,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added trainer stage hooks for Training Plugins and Accelerators ([#7864](https://github.com/PyTorchLightning/pytorch-lightning/pull/7864)) +- Added IPU Accelerator ([#7867](https://github.com/PyTorchLightning/pytorch-lightning/pull/7867)) + + - Added a warning if `Trainer(log_every_n_steps)` is a value too high for the training dataloader ([#7734](https://github.com/PyTorchLightning/pytorch-lightning/pull/7734)) diff --git a/pl_examples/ipu_examples/__init__.py b/pl_examples/ipu_examples/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pl_examples/ipu_examples/mnist.py b/pl_examples/ipu_examples/mnist.py new file mode 100644 index 0000000000000..37cb63c076e2e --- /dev/null +++ b/pl_examples/ipu_examples/mnist.py @@ -0,0 +1,89 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch.nn import functional as F + +import pytorch_lightning as pl +from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule + + +class LitClassifier(pl.LightningModule): + + def __init__( + self, + hidden_dim: int = 128, + learning_rate: float = 0.0001, + ): + super().__init__() + self.save_hyperparameters() + + self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim) + self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10) + + def forward(self, x): + x = x.view(x.size(0), -1) + x = torch.relu(self.l1(x)) + x = torch.relu(self.l2(x)) + return x + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + probs = self(x) + # we currently return the accuracy because validation_step/test_step are run on the IPU devices. + # Outputs from the step functions are sent to the host device, where we calculate the metrics in + # validation_epoch_end (and in test_epoch_end for the test_step). + acc = self.accuracy(probs, y) + return acc + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def accuracy(self, logits, y): + # currently IPU poptorch doesn't implicitly convert bools to tensors + # hence we use an explicit calculation for accuracy here. Once fixed in poptorch + # we can use the accuracy metric. + acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) + return acc + + def validation_epoch_end(self, outputs) -> None: + # since the training, validation and test steps are run on the IPU device, + # we must log the averaged accuracy outside the step functions.
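+ # `outputs` is the list of per-batch accuracies returned by `validation_step` above.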
+ self.log('val_acc', torch.stack(outputs).mean(), prog_bar=True) + + def test_epoch_end(self, outputs) -> None: + self.log('test_acc', torch.stack(outputs).mean()) + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + + +if __name__ == '__main__': + dm = MNISTDataModule(batch_size=32) + + model = LitClassifier() + + trainer = pl.Trainer(max_epochs=2, ipus=8) + + trainer.fit(model, datamodule=dm) + trainer.test(model, datamodule=dm) diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index 05e15fe1f1767..2a460a27e373a 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -13,4 +13,5 @@ from pytorch_lightning.accelerators.accelerator import Accelerator # noqa F401 from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa F401 from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa F401 +from pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa F401 from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa F401 diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py new file mode 100644 index 0000000000000..c9bee827af0e6 --- /dev/null +++ b/pytorch_lightning/accelerators/ipu.py @@ -0,0 +1,35 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Callable + +from torch.optim import Optimizer + +import pytorch_lightning as pl +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class IPUAccelerator(Accelerator): + """ Accelerator for IPUs. """ + + def setup_optimizers(self, trainer: 'pl.Trainer') -> None: + super().setup_optimizers(trainer) + + if len(self.optimizers) > 1: + raise MisconfigurationException("IPUs currently only support one optimizer.") + + def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None: + # Optimizer step is handled by the IPU accelerator.
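+ # The closure runs the training step; poptorch's trainingModel already applies the parameter update on device, so no optimizer.step() call is needed here.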
+ lambda_closure() diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 58d43dc54cb7f..cc95671ebf2cc 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -9,6 +9,7 @@ from pytorch_lightning.plugins.precision.fully_sharded_native_amp import ( # noqa: F401 FullyShardedNativeMixedPrecisionPlugin, ) +from pytorch_lightning.plugins.precision.ipu_precision import IPUPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401 @@ -20,6 +21,7 @@ from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.ipu import IPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.rpc import RPCPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.rpc_sequential import RPCSequentialPlugin # noqa: F401 @@ -41,6 +43,8 @@ "DeepSpeedPrecisionPlugin", "DoublePrecisionPlugin", "HorovodPlugin", + "IPUPlugin", + "IPUPrecisionPlugin", "NativeMixedPrecisionPlugin", "PrecisionPlugin", "ShardedNativeMixedPrecisionPlugin", diff --git a/pytorch_lightning/plugins/precision/ipu_precision.py b/pytorch_lightning/plugins/precision/ipu_precision.py new file mode 100644 index 0000000000000..e6983966e166b --- /dev/null +++ b/pytorch_lightning/plugins/precision/ipu_precision.py @@ -0,0 +1,60 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Optional, Union + +from torch import Tensor +from torch.nn import Module +from torch.optim import Optimizer + +import pytorch_lightning as pl +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities import GradClipAlgorithmType +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class IPUPrecisionPlugin(PrecisionPlugin): + + def __init__(self, precision: int) -> None: + super().__init__() + self.precision = precision + + def backward( + self, + model: 'pl.LightningModule', + closure_loss: Tensor, + optimizer: Optimizer, + opt_idx: int, + should_accumulate: bool, + *args: Any, + **kwargs: Any, + ) -> Tensor: + # IPU internally manages bwd step. 
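+ # so we skip calling closure_loss.backward() and hand the loss back unchanged.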
+ return closure_loss + + def clip_gradients( + self, + optimizer: Optimizer, + clip_val: Union[int, float], + gradient_clip_algorithm: GradClipAlgorithmType = GradClipAlgorithmType.NORM, + model: Optional[Module] = None + ) -> None: + """Clips the gradients""" + if clip_val is None: + return + + clip_val = float(clip_val) + if clip_val <= 0: + return + + raise MisconfigurationException("IPUs currently do not support clipping gradients.") diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py new file mode 100644 index 0000000000000..9cacbed585aa9 --- /dev/null +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -0,0 +1,391 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import json +import os +from typing import Any, Iterable, List, Optional, Union + +import torch +from torch.utils.data import DataLoader + +from pytorch_lightning import _logger as log +from pytorch_lightning.callbacks import GradientAccumulationScheduler +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.trainer.supporters import CombinedLoader +from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.cloud_io import get_filesystem +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +if _POPTORCH_AVAILABLE: + import poptorch + + +class LightningIPUModule(_LightningModuleWrapperBase): + + def __init__(self, pl_module: LightningModule, precision: Union[str, int]): + super().__init__(pl_module) + self.precision = precision + + def forward(self, *inputs: Any, **kwargs: Any) -> Any: + if self.precision in ("mixed", 16): + inputs = self._move_float_tensors_to_half(inputs) + + return super().forward(*inputs, **kwargs) + + @staticmethod + def batch_to(data: torch.Tensor) -> torch.Tensor: + return data.half() + + def _move_float_tensors_to_half(self, batch: Any) -> Any: + batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) + return batch + + +class IPUPlugin(ParallelPlugin): + """ + Plugin for training on IPU devices. 
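+ Builds separate poptorch models for each running stage (training/validation/testing/predicting) which share the same weights on the host.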
+ """ + + def __init__( + self, + device_iterations: int = 1, + autoreport: bool = True, + autoreport_dir: Optional[str] = None, + convert_model_to_half: bool = False, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + training_opts: Optional['poptorch.Options'] = None, + inference_opts: Optional['poptorch.Options'] = None + ) -> None: + """ + Arguments: + + device_iterations: Number of iterations to run on device at once before returning to host. + This can be used as an optimization to speed up training. + https://docs.graphcore.ai/projects/poptorch-user-guide/en/0.1.67/batching.html + autoreport: Enable auto-reporting for IPUs using PopVision + https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html + autoreport_dir: Optional directory to store autoReport output. + convert_model_to_half: Converts the model to half precision, which can be used for pure FP16 training. + training_opts: Optional ``poptorch.Options`` to override the default created options for training. + inference_opts: Optional ``poptorch.Options`` to override the default + created options for validation/testing and predicting. + """ + super().__init__(parallel_devices, cluster_environment) + if not _POPTORCH_AVAILABLE or not poptorch.ipuHardwareIsAvailable(): + raise MisconfigurationException( + "The IPU Accelerator requires IPU devices to run. " + "Learn more or get started with IPUs at https://www.graphcore.ai/getstarted" + ) + + self.convert_model_to_half = convert_model_to_half + self.device_iterations = device_iterations + self.autoreport = autoreport + self.autoreport_dir = autoreport_dir + self.poptorch_models = {} + self._original_accumulate_grad_batches = None + self._training_opts = training_opts + self._inference_opts = inference_opts + + if self.autoreport: + options = {"autoReport.all": self.autoreport} + if self.autoreport_dir: + self._fs = get_filesystem(str(self.autoreport_dir)) + self._fs.makedirs(self.autoreport_dir, exist_ok=True) + options["autoReport.directory"] = self.autoreport_dir + os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options) + + def pre_dispatch(self) -> None: + self._handle_gradient_accumulation_steps() + if self.convert_model_to_half: + log.info('Using full 16bit precision, converting LightningModule weights to FP16.') + self.model = self.model.half() + precision = self.lightning_module.trainer.precision + precision = 16 if self.convert_model_to_half else precision + + model = LightningIPUModule(self.lightning_module, precision) + self.model = model + + # Separate models are instantiated for different stages, but they share the same weights on host. + # When validation/test models are run, weights are synced first. + + if self.lightning_module.trainer.state.stage is RunningStage.TRAINING: + # Create model for training which will run training. 
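+ # poptorch.trainingModel compiles the wrapped module together with the single optimizer.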
+ optimizer = self.lightning_module.trainer.optimizers[0] + model = poptorch.trainingModel(model=model, options=self.training_opts, optimizer=optimizer) + self.poptorch_models[RunningStage.TRAINING] = model + for x in (RunningStage.VALIDATING, RunningStage.TESTING, RunningStage.PREDICTING): + model = poptorch.inferenceModel( + model=model, + options=self.inference_opts, + ) + self.poptorch_models[x] = model + + @property + def replication_factor(self): + return len(self.parallel_devices) + + def _create_opts(self, training: bool): + opts = poptorch.Options() + opts.deviceIterations(self.device_iterations) + opts.replicationFactor(self.replication_factor) + gradient_accumulation = self.lightning_module.trainer.accumulate_grad_batches if training else 1 + opts.Training.gradientAccumulation(gradient_accumulation) + + if os.environ.get("PL_GLOBAL_SEED"): + opts.randomSeed(int(os.environ["PL_GLOBAL_SEED"])) + return opts + + @property + def training_opts(self) -> 'poptorch.Options': + if self._training_opts is None: + self._training_opts = self._create_opts(training=True) + self._validate_opts(self._training_opts, training=True) + return self._training_opts + + @property + def inference_opts(self) -> 'poptorch.Options': + if self._inference_opts is None: + self._inference_opts = self._create_opts(training=False) + self._validate_opts(self._inference_opts, training=False) + return self._inference_opts + + def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None: + if opts is not None: + if opts.replication_factor != self.replication_factor: + rank_zero_warn( + f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} " + f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. " + f"Setting to {self.replication_factor} in the poptorch.Options." + ) + opts.set(replication_factor=self.replication_factor) + if training: + accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + if opts.Training.gradient_accumulation != accumulate_grad_batches: + rank_zero_warn( + f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. " + f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. " + f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. " + f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}" + ) + opts.Training.set(gradient_accumulation=accumulate_grad_batches) + elif opts.Training.gradient_accumulation != 1: + rank_zero_warn( + "Inference poptorch.Options should set gradientAccumulation to 1. " + "Setting gradientAccumulation to 1 for inference options." 
+ ) + opts.Training.set(gradient_accumulation=1) + + @property + def lightning_module(self) -> Optional[LightningModule]: + return self.model.module if isinstance(self.model, LightningIPUModule) else self.model + + def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + return self.process_dataloader(dataloader) + + def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + return self.process_dataloader(dataloader) + + def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + return self.process_dataloader(dataloader) + + def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + return self.process_dataloader(dataloader) + + def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: + if isinstance(dataloader, CombinedLoader): + dataloader.loaders = apply_to_collection( + dataloader.loaders, + DataLoader, + self.process_dataloader, + ) + return dataloader + elif isinstance(dataloader, list): + dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader) + return dataloader + if not isinstance(dataloader, poptorch.DataLoader): + dataloader = self._convert_to_poptorch_loader( + dataloader=dataloader, opts=self._create_opts(training=self.lightning_module.training) + ) + return dataloader + + def _convert_to_poptorch_loader(self, dataloader: Union[Iterable, DataLoader], + opts: 'poptorch.Options') -> Union[Iterable, DataLoader]: + skip_keys = ('sampler', 'batch_sampler', 'dataset_kind') + + attrs = {k: v for k, v in vars(dataloader).items() if not k.startswith("_")} + + params = set(inspect.signature(dataloader.__init__).parameters) + contains_dataset = True + + if type(dataloader) is not DataLoader: + contains_dataset = "dataset" in params + params.update(inspect.signature(DataLoader.__init__).parameters) + + dl_args = {name: attrs[name] for name in params if name in attrs and name not in skip_keys} + + multiprocessing_context = dataloader.multiprocessing_context + dl_args['multiprocessing_context'] = multiprocessing_context + if not contains_dataset: + dl_args.pop('dataset') + # Override to drop last uneven batch, as IPUs does not support uneven inputs. + dl_args['drop_last'] = True + + dataloader = poptorch.DataLoader(**dl_args, options=opts) + dataloader.multiprocessing_context = multiprocessing_context + return dataloader + + def _handle_gradient_accumulation_steps(self): + """ + This functions overrides the trainer.accumulation_scheduler to generate + ``accumulate_grad_batches=1``. + Therefore, ``optimizer_step`` will be called on every batch, and the IPU will handle grad accumulation. + """ + self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + if not isinstance(self._original_accumulate_grad_batches, int): + raise MisconfigurationException( + f"IPUs currently only support accumulate_grad_batches being an integer value. 
" + f"Received {self._original_accumulate_grad_batches}" + ) + if self._original_accumulate_grad_batches > 1: + self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1}) + + def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: + if self._original_accumulate_grad_batches > 1: + if total_batch_idx % self._original_accumulate_grad_batches == 0: + current_global_step += 1 + return current_global_step + return super().update_global_step(total_batch_idx, current_global_step) + + @property + def _n_replicate(self): + # Ensure we replicate values to have enough dimensions to split across devices + accumulate_grad_batches = self._original_accumulate_grad_batches + return self.replication_factor * self.device_iterations * accumulate_grad_batches + + def _prepare_input(self, args: Any): + + def to_tuple(x): + return tuple(x) + + def to_tensor(x): + return torch.tensor(x).unsqueeze(0).repeat(self._n_replicate) + + args = apply_to_collection(args, dtype=list, function=to_tuple) + args = apply_to_collection(args, dtype=(int, float), function=to_tensor) + return args + + def training_step(self, *args, **kwargs): + args = self._prepare_input(args) + return self.poptorch_models[RunningStage.TRAINING](*args, **kwargs) + + def validation_step(self, *args, **kwargs): + args = self._prepare_input(args) + return self.poptorch_models[RunningStage.VALIDATING](*args, **kwargs) + + def test_step(self, *args, **kwargs): + args = self._prepare_input(args) + return self.poptorch_models[RunningStage.TESTING](*args, **kwargs) + + def predict_step(self, *args, **kwargs): + args = self._prepare_input(args) + return self.poptorch_models[RunningStage.PREDICTING](*args, **kwargs) + + def teardown(self) -> None: + for model in self.poptorch_models.values(): + model.destroy() + + def _compiled(self, model: Any): + # Required to ensure we only attach compiled models, as they are compiled lazily. + return model._executable is not None + + def _detach_models(self): + """ + Detaches all stage specific models from IPU devices. + """ + for k, model in self.poptorch_models.items(): + if self._compiled(model) and model.isAttachedToDevice(): + model.detachFromDevice() + + def _load_model(self, stage: str): + """ + Loads the stage specific accelerator model onto device if compiled and not attached to IPU devices. 
+ Args: + stage: The stage to load + """ + self._detach_models() + model = self.poptorch_models[stage] + if self._compiled(model) and not model.isAttachedToDevice(): + model.attachToDevice() + + def on_train_start(self): + self._load_model(RunningStage.TRAINING) + + def on_validation_start(self): + self._load_model(RunningStage.VALIDATING) + + def on_test_start(self): + self._load_model(RunningStage.TESTING) + + def on_predict_start(self): + self._load_model(RunningStage.PREDICTING) + + def on_train_end(self): + self._detach_models() + + def on_validation_end(self): + self._detach_models() + + def on_test_end(self): + self._detach_models() + + def on_predict_end(self): + self._detach_models() + + def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + # Updates optimizer stats if LR scheduler modified the optimizer state + optimizer = self.lightning_module.trainer.optimizers[0] + self.poptorch_models[RunningStage.TRAINING].setOptimizer(optimizer) + + @property + def on_gpu(self) -> bool: + return False + + @property + def root_device(self) -> torch.device: + pass + + def model_to_device(self) -> None: + pass + + @property + def is_global_zero(self) -> bool: + return True + + def reduce(self, tensor: Union[torch.Tensor, Any], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Any]: + return tensor + + def barrier(self, name: Optional[str] = None) -> None: + pass + + def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + return tensor + + def broadcast(self, obj: object, src: int = 0) -> object: + return obj diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 4d692ec517d19..8f5de9a6302aa 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -21,6 +21,7 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator from pytorch_lightning.plugins import ( ApexMixedPrecisionPlugin, @@ -36,6 +37,8 @@ DoublePrecisionPlugin, FullyShardedNativeMixedPrecisionPlugin, HorovodPlugin, + IPUPlugin, + IPUPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin, ShardedNativeMixedPrecisionPlugin, @@ -57,6 +60,7 @@ from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _HOROVOD_AVAILABLE, + _IPU_AVAILABLE, _NATIVE_AMP_AVAILABLE, _TPU_AVAILABLE, AMPType, @@ -79,6 +83,7 @@ def __init__( self, num_processes, tpu_cores, + ipus, distributed_backend, auto_select_gpus, gpus, @@ -98,6 +103,7 @@ def __init__( self.num_processes = num_processes self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + self.ipus = ipus self.distributed_backend = distributed_backend self.auto_select_gpus = auto_select_gpus self.gpus = gpus @@ -248,6 +254,10 @@ def on_cpu(self) -> bool: def on_tpu(self) -> bool: return self.tpu_cores is not None + @property + def on_ipu(self) -> bool: + return self.ipus is not None + @property def tpu_id(self) -> Optional[int]: if self.on_tpu and isinstance(self.tpu_cores, list): @@ -323,13 +333,18 @@ def parallel_devices(self) -> List[Union[torch.device, int]]: # https://github.com/PyTorchLightning/pytorch-lightning/issues/3169 if isinstance(self.tpu_cores, int): 
devices = list(range(self.tpu_cores)) + elif self.on_ipu: + if isinstance(self.ipus, int): + devices = list(range(self.ipus)) else: devices = [torch.device("cpu")] * self.num_processes return devices @property def root_gpu(self) -> Optional[int]: - return self.accelerator.root_device.index if not isinstance(self.accelerator, TPUAccelerator) else None + return self.accelerator.root_device.index if not isinstance( + self.accelerator, (IPUAccelerator, TPUAccelerator) + ) else None @property def is_training_type_in_plugins(self) -> bool: @@ -353,6 +368,9 @@ def select_precision_plugin(self) -> PrecisionPlugin: # set precision type self.amp_type = AMPType.from_str(self.amp_type) + if self.on_ipu: + return IPUPrecisionPlugin(self.precision) + if self._distrib_type == DistributedType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): return DeepSpeedPrecisionPlugin(self.precision) @@ -459,6 +477,8 @@ def select_training_type_plugin(self) -> TrainingTypePlugin: plugin = HorovodPlugin(parallel_devices=self.parallel_devices) elif self.on_tpu and isinstance(self.tpu_cores, list): plugin = SingleTPUPlugin(self.tpu_id) + elif self.on_ipu: + plugin = IPUPlugin(parallel_devices=self.parallel_devices) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu")) @@ -499,6 +519,8 @@ def select_accelerator(self) -> Accelerator: acc_cls = GPUAccelerator elif self.on_tpu: acc_cls = TPUAccelerator + elif self.on_ipu: + acc_cls = IPUAccelerator else: acc_cls = CPUAccelerator # as precision_plugin is dependent on training_type_plugin, make sure @@ -562,6 +584,8 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): self._device_type = DeviceType.TPU if isinstance(self.tpu_cores, int): self._distrib_type = DistributedType.TPU_SPAWN + elif self.distributed_backend == 'ipu': + self._device_type = DeviceType.IPU elif self.distributed_backend and self._distrib_type is None: self._distrib_type = DistributedType(self.distributed_backend) @@ -609,8 +633,11 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): ) rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}') - num_cores = self.tpu_cores if self.tpu_cores is not None else 0 - rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') + num_tpu_cores = self.tpu_cores if self.tpu_cores is not None else 0 + rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_tpu_cores} TPU cores') + + num_ipus = self.ipus if self.ipus is not None else 0 + rank_zero_info(f'IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs') if torch.cuda.is_available() and self._device_type != DeviceType.GPU: rank_zero_warn( diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c042a6c6d6ade..2e817cc1b4bbe 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -107,6 +107,7 @@ def __init__( gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], str, int]] = None, + ipus: Optional[int] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: Optional[int] = None, overfit_batches: Union[int, float] = 0.0, @@ -283,6 +284,8 @@ def __init__( tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on [1] + ipus: How many IPUs to train on. 
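+ The value is used as the replication factor, e.g. ``Trainer(ipus=8)`` replicates the model over 8 IPU devices.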
+ track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. truncated_bptt_steps: Deprecated in v1.3 to be removed in 1.5. @@ -323,8 +326,8 @@ def __init__( self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = AcceleratorConnector( - num_processes, tpu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, benchmark, - replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins + num_processes, tpu_cores, ipus, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, + benchmark, replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 6664be43bef88..613a5013d5198 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -43,10 +43,12 @@ _HOROVOD_AVAILABLE, _HYDRA_AVAILABLE, _HYDRA_EXPERIMENTAL_AVAILABLE, + _IPU_AVAILABLE, _IS_INTERACTIVE, _module_available, _NATIVE_AMP_AVAILABLE, _OMEGACONF_AVAILABLE, + _POPTORCH_AVAILABLE, _RPC_AVAILABLE, _TORCH_GREATER_EQUAL_1_5, _TORCH_GREATER_EQUAL_1_6, diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index 98e10a9126a44..3cb4d24d126fa 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -97,6 +97,7 @@ class DeviceType(LightningEnum): """ CPU = 'CPU' GPU = 'GPU' + IPU = 'IPU' TPU = 'TPU' diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index f40d092f68e9f..2a51b01404821 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -85,6 +85,7 @@ def _compare_version(package: str, op, version) -> bool: _KINETO_AVAILABLE = _TORCH_GREATER_EQUAL_1_8_1 and torch.profiler.kineto_available() _NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast") _OMEGACONF_AVAILABLE = _module_available("omegaconf") +_POPTORCH_AVAILABLE = _module_available('poptorch') _RPC_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.rpc') _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != 'none']) _TORCHTEXT_AVAILABLE = _module_available("torchtext") @@ -96,3 +97,9 @@ def _compare_version(package: str, op, version) -> bool: from pytorch_lightning.utilities.xla_device import XLADeviceUtils # noqa: E402 _TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists() + +if _POPTORCH_AVAILABLE: + import poptorch + _IPU_AVAILABLE = poptorch.ipuHardwareIsAvailable() +else: + _IPU_AVAILABLE = False diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py new file mode 100644 index 0000000000000..52496e28b2230 --- /dev/null +++ b/tests/accelerators/test_ipu.py @@ -0,0 +1,515 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Optional + +import pytest +import torch +import torch.nn.functional as F + +from pytorch_lightning import Callback, seed_everything, Trainer +from pytorch_lightning.accelerators import IPUAccelerator +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin +from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.utilities import _IPU_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.helpers.boring_model import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.runif import RunIf +from tests.helpers.simple_models import ClassificationModel + +if _IPU_AVAILABLE: + import poptorch + + +class IPUModel(BoringModel): + + def training_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def validation_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def test_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def training_epoch_end(self, outputs) -> None: + pass + + def validation_epoch_end(self, outputs) -> None: + pass + + def test_epoch_end(self, outputs) -> None: + pass + + +class IPUClassificationModel(ClassificationModel): + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def accuracy(self, logits, y): + # todo (sean): currently IPU poptorch doesn't implicit convert bools to tensor + # hence we use an explicit calculation for accuracy here. Once fixed in poptorch + # we can use the accuracy metric. 
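+ # fraction of correct predictions, computed with float ops that trace cleanly on the IPU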
+ acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) + return acc + + def validation_epoch_end(self, outputs) -> None: + self.log('val_acc', torch.stack(outputs).mean()) + + def test_epoch_end(self, outputs) -> None: + self.log('test_acc', torch.stack(outputs).mean()) + + +@pytest.mark.skipif(_IPU_AVAILABLE, reason="test requires non-IPU machine") +def test_fail_if_no_ipus(tmpdir): + with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): + Trainer(default_root_dir=tmpdir, ipus=1) + + with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): + Trainer(default_root_dir=tmpdir, ipus=1, accelerator='ipu') + + +@RunIf(ipu=True) +def test_accelerator_selected(tmpdir): + trainer = Trainer(default_root_dir=tmpdir, ipus=1) + assert isinstance(trainer.accelerator, IPUAccelerator) + trainer = Trainer(default_root_dir=tmpdir, ipus=1, accelerator='ipu') + assert isinstance(trainer.accelerator, IPUAccelerator) + + +@RunIf(ipu=True) +@pytest.mark.parametrize('ipus', [1, 4]) +def test_all_stages(tmpdir, ipus): + model = IPUModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=ipus) + trainer.fit(model) + trainer.validate(model) + trainer.test(model) + trainer.predict(model, model.val_dataloader()) + + +@RunIf(ipu=True) +@pytest.mark.parametrize('ipus', [1, 4]) +def test_inference_only(tmpdir, ipus): + model = IPUModel() + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=ipus) + trainer.validate(model) + trainer.test(model) + trainer.predict(model, model.val_dataloader()) + + +@RunIf(ipu=True) +def test_optimization(tmpdir): + seed_everything(42) + + dm = ClassifDataModule(length=1024) + model = IPUClassificationModel() + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + ipus=2, + ) + + # fit model + trainer.fit(model, dm) + assert trainer.state.finished, f"Training failed with {trainer.state}" + assert dm.trainer is not None + + # validate + result = trainer.validate(datamodule=dm) + assert dm.trainer is not None + assert result[0]['val_acc'] > 0.7 + + # test + result = trainer.test(model, datamodule=dm) + assert dm.trainer is not None + test_result = result[0]['test_acc'] + assert test_result > 0.6 + + # test saved model + model_path = os.path.join(tmpdir, 'model.pt') + trainer.save_checkpoint(model_path) + + model = IPUClassificationModel.load_from_checkpoint(model_path) + + trainer = Trainer(default_root_dir=tmpdir, ipus=2) + + result = trainer.test(model, datamodule=dm) + saved_result = result[0]['test_acc'] + assert saved_result == test_result + + +@RunIf(ipu=True) +def test_mixed_precision(tmpdir): + + class TestCallback(Callback): + + def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: + assert trainer.accelerator.model.precision == 16 + raise SystemExit + + model = IPUModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, precision=16, callbacks=TestCallback()) + assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_pure_half_precision(tmpdir): + + class TestCallback(Callback): + + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: + assert trainer.accelerator.model.precision == 16 + assert trainer.accelerator.training_type_plugin.convert_model_to_half + 
for param in trainer.accelerator.model.parameters(): + assert param.dtype == torch.float16 + raise SystemExit + + model = IPUModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + ipus=1, + precision=16, + plugins=IPUPlugin(convert_model_to_half=True), + callbacks=TestCallback() + ) + + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert isinstance(trainer.accelerator.precision_plugin, IPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 + + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_device_iterations_ipu_plugin(tmpdir): + + class TestCallback(Callback): + + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: + assert trainer.accelerator.training_type_plugin.device_iterations == 2 + # assert device iterations have been set correctly within the poptorch options + poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING] + assert poptorch_model._options.toDict()['device_iterations'] == 2 + raise SystemExit + + model = IPUModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + ipus=1, + plugins=IPUPlugin(device_iterations=2), + callbacks=TestCallback() + ) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_accumulated_batches(tmpdir): + + class TestCallback(Callback): + + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: + # ensure the accumulation_scheduler is overridden to accumulate every batch, + # since the IPU handles gradient accumulation + assert trainer.accumulation_scheduler.scheduling == {0: 1} + # assert the poptorch options have been set correctly + poptorch_model = trainer.accelerator.training_type_plugin.poptorch_models[RunningStage.TRAINING] + assert poptorch_model._options.Training.toDict()['gradient_accumulation'] == 2 + raise SystemExit + + model = IPUModel() + trainer = Trainer( + default_root_dir=tmpdir, fast_dev_run=True, ipus=1, accumulate_grad_batches=2, callbacks=TestCallback() + ) + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_stages_correct(tmpdir): + """Ensure all stages are traced correctly by asserting the output for each stage""" + + class StageModel(IPUModel): + + def training_step(self, batch, batch_idx): + loss = super().training_step(batch, batch_idx) + # tracing requires a loss value that depends on the model. + # force it to be a fixed value but ensure we still use the loss.
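+ # (loss - loss) keeps the returned tensor connected to the model's graph while pinning its value.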
+ return (loss - loss) + torch.tensor(1) + + def validation_step(self, batch, batch_idx): + loss = super().validation_step(batch, batch_idx) + return (loss - loss) + torch.tensor(2) + + def test_step(self, batch, batch_idx): + loss = super().validation_step(batch, batch_idx) + return (loss - loss) + torch.tensor(3) + + def predict_step(self, batch, batch_idx, dataloader_idx=None): + output = super().predict_step(batch, batch_idx) + return (output - output) + torch.tensor(4) + + class TestCallback(Callback): + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs['loss'].item() == 1 + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs.item() == 2 + + def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs.item() == 3 + + def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert torch.all(outputs == 4).item() + + model = StageModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, ipus=1, callbacks=TestCallback()) + trainer.fit(model) + trainer.test(model) + trainer.validate(model) + trainer.predict(model, model.test_dataloader()) + + +@RunIf(ipu=True) +def test_accumulate_grad_batches_dict_fails(tmpdir): + model = IPUModel() + trainer = Trainer(default_root_dir=tmpdir, ipus=1, accumulate_grad_batches={0: 1}) + with pytest.raises( + MisconfigurationException, match="IPUs currently only support accumulate_grad_batches being an integer value." + ): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_clip_gradients_fails(tmpdir): + model = IPUModel() + trainer = Trainer(default_root_dir=tmpdir, ipus=1, gradient_clip_val=10) + with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_autoreport(tmpdir): + """Ensure autoreport dumps to a file.""" + model = IPUModel() + autoreport_path = os.path.join(tmpdir, 'report/') + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(autoreport=True, autoreport_dir=autoreport_path) + ) + trainer.fit(model) + assert os.path.exists(autoreport_path) + assert os.path.isfile(autoreport_path + 'profile.pop') + + +@RunIf(ipu=True) +def test_manual_poptorch_opts(tmpdir): + """Ensure if the user passes manual poptorch Options, we run with the correct object.""" + model = IPUModel() + inference_opts = poptorch.Options() + training_opts = poptorch.Options() + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + trainer.fit(model) + + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.training_opts == training_opts + assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_ipu_count(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and the number of ipus do not match, we warn and we set it for the user. 
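+ The replication factor stored in both options should then match the ``ipus`` flag.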
+ """ + + manual_ipus = 1 + expected_ipus = 2 + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.replicationFactor(manual_ipus) + + training_opts = poptorch.Options() + training_opts.replicationFactor(manual_ipus) + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=expected_ipus, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} " + f"which differs to the ipus={expected_ipus} flag passed to the Trainer. " + f"Setting to {expected_ipus} in the poptorch.Options." + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2 + assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2 + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_inference_grad_accum(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and grad accumulation is set greater than 1 for inference, we warn and set to 1. + """ + + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.Training.gradientAccumulation(4) + + training_opts = poptorch.Options() + training_opts.Training.gradientAccumulation(1) + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match="Inference poptorch.Options should set gradientAccumulation to 1. " + "Setting gradientAccumulation to 1 for inference options.", + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1 + + +@RunIf(ipu=True) +def test_manual_poptorch_opts_train_grad_accum(tmpdir): + """ + Ensure if the user passes manual poptorch Options + and grad accumulation differs from accumulate_grad_batches, we warn and override it to match accumulate_grad_batches. + """ + + model = IPUModel() + inference_opts = poptorch.Options() + inference_opts.Training.gradientAccumulation(1) + + training_opts = poptorch.Options() + training_opts.Training.gradientAccumulation(2) + + trainer = Trainer( + default_root_dir=tmpdir, + ipus=1, + fast_dev_run=True, + accumulate_grad_batches=1, + plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts) + ) + with pytest.warns( + UserWarning, + match=f"Training poptorch.Options set gradientAccumulation to {2}. " + f"This is different to accumulate_grad_batches which was set to {1}. " + f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. " + f"Setting poptorch.Options gradientAccumulation to {1}", + ): + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + assert trainer.accelerator.training_type_plugin.training_opts.Training.gradient_accumulation == 1 + + +@RunIf(ipu=True) +def test_default_opts(tmpdir): + """ + Ensure default opts are set correctly in the IPUPlugin.
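+ With no user-provided options, gradient accumulation, device iterations and the replication factor should all default to 1.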
+ """ + + model = IPUModel() + + trainer = Trainer(default_root_dir=tmpdir, ipus=1, fast_dev_run=True) + trainer.fit(model) + assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin) + inference_opts = trainer.accelerator.training_type_plugin.inference_opts + training_opts = trainer.accelerator.training_type_plugin.training_opts + for opts in (inference_opts, training_opts): + assert isinstance(opts, poptorch.Options) + assert opts.Training.gradient_accumulation == 1 + assert opts.device_iterations == 1 + assert opts.replication_factor == 1 + + +@RunIf(ipu=True) +def test_multi_optimizers_fails(tmpdir): + """ + Ensure if there are multiple optimizers, we throw an exception + """ + + class TestModel(IPUModel): + + def configure_optimizers(self): + return [torch.optim.Adam(self.parameters()), torch.optim.Adam(self.parameters())] + + model = TestModel() + + trainer = Trainer(default_root_dir=tmpdir, ipus=1) + with pytest.raises(MisconfigurationException, match="IPUs currently only support one optimizer."): + trainer.fit(model) + + +@RunIf(ipu=True) +def test_precision_plugin(tmpdir): + """ + Ensure precision plugin value is set correctly. + """ + + plugin = IPUPrecisionPlugin(precision=16) + assert plugin.precision == 16 diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 630a341ec2d30..737ddd68dff17 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -27,6 +27,7 @@ _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _FAIRSCALE_PIPE_AVAILABLE, _HOROVOD_AVAILABLE, + _IPU_AVAILABLE, _NATIVE_AMP_AVAILABLE, _RPC_AVAILABLE, _TORCH_QUANTIZE_AVAILABLE, @@ -63,6 +64,7 @@ def __new__( amp_apex: bool = False, amp_native: bool = False, tpu: bool = False, + ipu: bool = False, horovod: bool = False, horovod_nccl: bool = False, skip_windows: bool = False, @@ -85,6 +87,7 @@ def __new__( amp_apex: NVIDIA Apex is installed amp_native: if native PyTorch native AMP is supported tpu: if TPU is available + ipu: if IPU is available horovod: if Horovod is installed horovod_nccl: if Horovod is installed with NCCL support skip_windows: skip test for Windows platform (typically fo some limited torch functionality) @@ -139,6 +142,10 @@ def __new__( conditions.append(not _TPU_AVAILABLE) reasons.append("TPU") + if ipu: + conditions.append(not _IPU_AVAILABLE) + reasons.append("IPU") + if horovod: conditions.append(not _HOROVOD_AVAILABLE) reasons.append("Horovod")