63 changes: 22 additions & 41 deletions pytorch_lightning/plugins/training_type/ipu.py
@@ -26,7 +26,7 @@
from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.trainer.supporters import CombinedLoader
from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn
from pytorch_lightning.utilities import _POPTORCH_AVAILABLE
from pytorch_lightning.utilities.apply_func import apply_to_collection
from pytorch_lightning.utilities.cloud_io import get_filesystem
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -129,10 +129,18 @@ def pre_dispatch(self) -> None:
self._handle_gradient_accumulation_steps()

@property
def replication_factor(self):
def replication_factor(self) -> int:
if not self.lightning_module:
# The plugin has been passed in by the user and has not been connected to the Trainer.
# Check if the user has passed in custom poptorch.Options to infer the number of IPUs being used.
# In this scenario we prioritize the training options.
if self._training_opts:
return self._training_opts.replication_factor
if self._inference_opts:
return self._inference_opts.replication_factor
return len(self.parallel_devices)

def _create_opts(self, training: bool):
def _create_opts(self, training: bool) -> 'poptorch.Options':
opts = poptorch.Options()
opts.deviceIterations(self.device_iterations)
opts.replicationFactor(self.replication_factor)
@@ -147,71 +155,44 @@ def _create_opts(self, training: bool):
def training_opts(self) -> 'poptorch.Options':
if self._training_opts is None:
self._training_opts = self._create_opts(training=True)
self._validate_opts(self._training_opts, training=True)
return self._training_opts

@property
def inference_opts(self) -> 'poptorch.Options':
if self._inference_opts is None:
self._inference_opts = self._create_opts(training=False)
self._validate_opts(self._inference_opts, training=False)
return self._inference_opts

def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None:
if opts is not None:
if opts.replication_factor != self.replication_factor:
rank_zero_warn(
f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} "
f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. "
f"Setting to {self.replication_factor} in the poptorch.Options."
)
opts.set(replication_factor=self.replication_factor)
if training:
accumulate_grad_batches = self.accumulate_grad_batches
if opts.Training.gradient_accumulation != accumulate_grad_batches:
rank_zero_warn(
f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. "
f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. "
f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}"
)
opts.Training.set(gradient_accumulation=accumulate_grad_batches)
elif opts.Training.gradient_accumulation != 1:
rank_zero_warn(
"Inference poptorch.Options should set gradientAccumulation to 1. "
"Setting gradientAccumulation to 1 for inference options."
)
opts.Training.set(gradient_accumulation=1)

@property
def lightning_module(self) -> Optional['pl.LightningModule']:
return self.model.module if isinstance(self.model, LightningIPUModule) else self.model

def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
return self.process_dataloader(dataloader)
return self._process_dataloader(dataloader, is_training=True)

def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
return self.process_dataloader(dataloader)
return self._process_dataloader(dataloader, is_training=False)

def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
return self.process_dataloader(dataloader)
return self._process_dataloader(dataloader, is_training=False)

def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
return self.process_dataloader(dataloader)
return self._process_dataloader(dataloader, is_training=False)

def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
def _process_dataloader(
self,
dataloader: Union[Iterable, DataLoader],
is_training: bool,
) -> Union[Iterable, DataLoader]:
if isinstance(dataloader, CombinedLoader):
dataloader.loaders = apply_to_collection(
dataloader.loaders,
DataLoader,
self.process_dataloader,
dataloader.loaders, DataLoader, self._process_dataloader, is_training
)
return dataloader
if isinstance(dataloader, list):
dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader)
dataloader = apply_to_collection(dataloader, DataLoader, self._process_dataloader, is_training)
return dataloader
if not isinstance(dataloader, poptorch.DataLoader):
is_training = self.lightning_module.trainer.training
opts = self.training_opts if is_training else self.inference_opts
dataloader = self._convert_to_poptorch_loader(dataloader=dataloader, opts=opts)
return dataloader
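With this change, `replication_factor` can be read before the plugin is attached to a Trainer: it falls back to any user-supplied poptorch.Options, preferring the training options, and dataloader processing now selects training vs. inference options explicitly instead of consulting the trainer state. A minimal usage sketch of the new behaviour (assumes an IPU machine with poptorch installed; the option values are illustrative, not part of the diff):

import poptorch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import IPUPlugin

# Illustrative options: replicate training over 2 IPUs, run inference on 1.
training_opts = poptorch.Options()
training_opts.replicationFactor(2)
inference_opts = poptorch.Options()
inference_opts.replicationFactor(1)

plugin = IPUPlugin(training_opts=training_opts, inference_opts=inference_opts)
# Before the plugin is connected to a Trainer, the replication factor is
# inferred from the options above, preferring the training options.
assert plugin.replication_factor == 2

trainer = Trainer(plugins=plugin, fast_dev_run=True)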
13 changes: 10 additions & 3 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -259,7 +259,7 @@ def on_tpu(self) -> bool:

@property
def on_ipu(self) -> bool:
return self.ipus is not None
return self.ipus is not None or isinstance(self._training_type_plugin, IPUPlugin)

@property
def tpu_id(self) -> Optional[int]:
@@ -327,6 +327,14 @@ def num_gpus(self) -> int:
return 0
return len(gpus)

@property
def num_ipus(self) -> int:
if isinstance(self.ipus, int):
return self.ipus
if isinstance(self._training_type_plugin, IPUPlugin):
return self._training_type_plugin.replication_factor
return 0

@property
def parallel_devices(self) -> List[Union[torch.device, int]]:
if self.on_gpu:
@@ -337,8 +345,7 @@ def parallel_devices(self) -> List[Union[torch.device, int]]:
if isinstance(self.tpu_cores, int):
devices = list(range(self.tpu_cores))
elif self.on_ipu:
if isinstance(self.ipus, int):
devices = list(range(self.ipus))
devices = list(range(self.num_ipus))
else:
devices = [torch.device("cpu")] * self.num_processes
return devices
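The connector now treats a manually passed IPUPlugin as an IPU run and, when no `ipus` flag is given, derives the device count from the plugin's replication factor. A hedged sketch of the expected resolution (assumes poptorch and IPU hardware are available; the replication factor is illustrative):

opts = poptorch.Options()
opts.replicationFactor(4)
trainer = Trainer(plugins=IPUPlugin(training_opts=opts))
# `on_ipu` is True via the plugin check, and `num_ipus` falls back to the
# plugin's replication factor, which also feeds `trainer.ipus` and
# `parallel_devices`.
assert trainer.ipus == 4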
2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/properties.py
@@ -137,7 +137,7 @@ def tpu_cores(self) -> int:

@property
def ipus(self) -> int:
return self.accelerator_connector.ipus
return self.accelerator_connector.num_ipus

@property
def num_gpus(self) -> int:
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/trainer.py
@@ -23,7 +23,7 @@
import torch

import pytorch_lightning as pl
from pytorch_lightning.accelerators import Accelerator
from pytorch_lightning.accelerators import Accelerator, IPUAccelerator
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.core.datamodule import LightningDataModule
from pytorch_lightning.core.memory import ModelSummary
@@ -1207,7 +1207,7 @@ def _log_device_info(self) -> None:
" `Trainer(tpu_cores=8)` or script `--tpu_cores=8`."
)

if _IPU_AVAILABLE and self._device_type != DeviceType.IPU:
if _IPU_AVAILABLE and self._device_type != DeviceType.IPU and not isinstance(self.accelerator, IPUAccelerator):
rank_zero_warn(
"IPU available but not used. Set the `ipus` flag in your trainer"
" `Trainer(ipus=8)` or script `--ipus=8`."
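Previously the "IPU available but not used" warning keyed only off the `ipus` flag, so a run configured purely through an IPUAccelerator or IPUPlugin was still warned about. A sketch of both cases, mirroring the new tests in tests/accelerators/test_ipu.py below (assumes IPUs are available):

import pytest

# Warns: IPUs are present but nothing in the Trainer uses them.
with pytest.warns(UserWarning, match="IPU available but not used"):
    Trainer()

# No warning: an IPU plugin marks the run as IPU-backed even without `ipus=`.
with pytest.warns(None) as record:
    Trainer(plugins=IPUPlugin(training_opts=poptorch.Options()))
assert len(record) == 0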
174 changes: 60 additions & 114 deletions tests/accelerators/test_ipu.py
@@ -23,6 +23,7 @@
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.trainer.supporters import CombinedLoader
from pytorch_lightning.utilities import _IPU_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers.boring_model import BoringModel
@@ -112,6 +113,19 @@ def test_accelerator_selected(tmpdir):
assert isinstance(trainer.accelerator, IPUAccelerator)


@RunIf(ipu=True)
def test_warning_if_ipus_not_used(tmpdir):
with pytest.warns(UserWarning, match="IPU available but not used. Set the `ipus` flag in your trainer"):
Trainer(default_root_dir=tmpdir)


@RunIf(ipu=True)
def test_no_warning_plugin(tmpdir):
with pytest.warns(None) as record:
Trainer(default_root_dir=tmpdir, plugins=IPUPlugin(training_opts=poptorch.Options()))
assert len(record) == 0


@RunIf(ipu=True)
@pytest.mark.parametrize('ipus', [1, 4])
def test_all_stages(tmpdir, ipus):
@@ -364,140 +378,72 @@ def test_manual_poptorch_opts(tmpdir):


@RunIf(ipu=True)
def test_manual_poptorch_opts_ipu_count(tmpdir):
"""
Ensure if the user passes manual poptorch Options
and the number of ipus do not match, we warn and we set it for the user.
"""

manual_ipus = 1
expected_ipus = 2
model = IPUModel()
inference_opts = poptorch.Options()
inference_opts.replicationFactor(manual_ipus)

training_opts = poptorch.Options()
training_opts.replicationFactor(manual_ipus)

trainer = Trainer(
default_root_dir=tmpdir,
ipus=expected_ipus,
fast_dev_run=True,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
with pytest.warns(
UserWarning,
match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} "
f"which differs to the ipus={expected_ipus} flag passed to the Trainer. "
f"Setting to {expected_ipus} in the poptorch.Options."
):
trainer.fit(model)
assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2
assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2


@RunIf(ipu=True)
def test_manual_poptorch_opts_inference_grad_accum(tmpdir):
"""
Ensure if the user passes manual poptorch Options
and grad accumulation is set greater than 1 for inference, we warn and set to 1.
"""

model = IPUModel()
inference_opts = poptorch.Options()
inference_opts.Training.gradientAccumulation(4)

training_opts = poptorch.Options()
training_opts.Training.gradientAccumulation(1)

trainer = Trainer(
default_root_dir=tmpdir,
ipus=1,
fast_dev_run=True,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
with pytest.warns(
UserWarning,
match="Inference poptorch.Options should set gradientAccumulation to 1. "
"Setting gradientAccumulation to 1 for inference options.",
):
trainer.fit(model)
assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1


@RunIf(ipu=True)
def test_manual_poptorch_opts_train_grad_accum(tmpdir):
def test_manual_poptorch_opts_custom(tmpdir):
"""
Ensure if the user passes manual poptorch Options
and grad accumulation differs to accumulate_grad_batches, we
Ensure if the user passes manual poptorch Options with custom parameters set,
we respect them in our poptorch options and the dataloaders.
"""

model = IPUModel()
inference_opts = poptorch.Options()
inference_opts.Training.gradientAccumulation(1)

training_opts = poptorch.Options()
training_opts.deviceIterations(8)
training_opts.replicationFactor(2)
training_opts.Training.gradientAccumulation(2)

trainer = Trainer(
default_root_dir=tmpdir,
ipus=1,
fast_dev_run=True,
accumulate_grad_batches=1,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
with pytest.warns(
UserWarning,
match=f"Training poptorch.Options set gradientAccumulation to {2}. "
f"This is different to accumulate_grad_batches which was set to {1}. "
f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
f"Setting poptorch.Options gradientAccumulation to {1}",
):
trainer.fit(model)
assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1


@RunIf(ipu=True)
def test_manual_poptorch_opts_custom(tmpdir):
"""
Ensure if the user passes manual poptorch Options with custom parameters set,
we respect them in our poptorch options.
"""

model = IPUModel()
inference_opts = poptorch.Options()
inference_opts.deviceIterations(16)
inference_opts.replicationFactor(2)
inference_opts.replicationFactor(1)
inference_opts.Training.gradientAccumulation(1)

training_opts = poptorch.Options()
training_opts.deviceIterations(8)
training_opts.replicationFactor(2)
training_opts.Training.gradientAccumulation(2)
class TestCallback(Callback):

trainer = Trainer(
default_root_dir=tmpdir,
ipus=2,
fast_dev_run=True,
accumulate_grad_batches=2,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
# ensure dataloaders were correctly set up during training.
plugin = trainer.accelerator.training_type_plugin
assert isinstance(plugin, IPUPlugin)
assert plugin.training_opts.replication_factor == 2
assert plugin.inference_opts.replication_factor == 1

val_dataloader = trainer.val_dataloaders[0]
train_dataloader = trainer.train_dataloader
assert isinstance(train_dataloader, CombinedLoader)
train_dataloader = train_dataloader.loaders
assert isinstance(val_dataloader, poptorch.DataLoader)
assert isinstance(train_dataloader, poptorch.DataLoader)
assert train_dataloader.options.replication_factor == 2
assert val_dataloader.options.replication_factor == 1

plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
# ensure we default to the training options replication factor
assert plugin.replication_factor == 2
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin, callbacks=TestCallback())
trainer.fit(model)

plugin = trainer.accelerator.training_type_plugin
assert isinstance(plugin, IPUPlugin)
inference_opts = plugin.inference_opts
training_opts = plugin.training_opts
assert inference_opts.device_iterations == 16
assert inference_opts.replication_factor == 2
assert inference_opts.Training.gradient_accumulation == 1

training_opts = plugin.training_opts
assert training_opts.device_iterations == 8
assert training_opts.replication_factor == 2
assert training_opts.Training.gradient_accumulation == 2

inference_opts = plugin.inference_opts
assert inference_opts.device_iterations == 16
assert inference_opts.replication_factor == 1
assert inference_opts.Training.gradient_accumulation == 1


@RunIf(ipu=True)
def test_replication_factor(tmpdir):
"""
Ensure the `ipus` flag is respected and reflected in `trainer.ipus`
when a default IPUPlugin is passed in manually.
"""

plugin = IPUPlugin()
trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin)
assert trainer.ipus == 2


@RunIf(ipu=True)
def test_default_opts(tmpdir):