
Commit 6d55896

Sean Naren authored, with co-authors carmocca, awaelchli, and pre-commit-ci[bot]
[IPU] Allow poptorch.Options to override Trainer (#8233)
* Add test for poptorch Options
* Hacks to get manual plugin support
* Revert changes
* Fix tests + ensure logic follow suit
* Update pytorch_lightning/plugins/training_type/ipu.py

Co-authored-by: Adrian Wälchli <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Cleaner
* Cleaner

Co-authored-by: Carlos Mocholí <[email protected]>
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 5cef977 commit 6d55896

5 files changed: +95 / -161 lines
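
At a glance, the change lets a user construct their own `poptorch.Options` and hand them to `IPUPlugin` directly; the Trainer then infers the IPU count from those options instead of requiring the `ipus=` flag. A minimal usage sketch (assumes an IPU environment with poptorch installed; `MyModel` is a hypothetical LightningModule):

import poptorch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import IPUPlugin

# User-controlled options: 8 device iterations and 2 replicas for training,
# 16 device iterations and a single replica for inference.
training_opts = poptorch.Options()
training_opts.deviceIterations(8)
training_opts.replicationFactor(2)

inference_opts = poptorch.Options()
inference_opts.deviceIterations(16)
inference_opts.replicationFactor(1)

# No `ipus=...` flag: the plugin reports replication_factor == 2 from the
# training options, and the Trainer picks that up as the device count.
trainer = Trainer(plugins=IPUPlugin(training_opts=training_opts, inference_opts=inference_opts))
trainer.fit(MyModel())  # MyModel is a placeholder LightningModule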

pytorch_lightning/plugins/training_type/ipu.py

Lines changed: 22 additions & 41 deletions
@@ -26,7 +26,7 @@
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.trainer.supporters import CombinedLoader
-from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn
+from pytorch_lightning.utilities import _POPTORCH_AVAILABLE
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -129,10 +129,18 @@ def pre_dispatch(self) -> None:
         self._handle_gradient_accumulation_steps()

     @property
-    def replication_factor(self):
+    def replication_factor(self) -> int:
+        if not self.lightning_module:
+            # The plugin has been passed in by the user and has not been connected to the Trainer.
+            # Check if the user has passed in custom poptorch.Options to infer number of IPUs being used.
+            # In this scenario we prioritize the training options.
+            if self._training_opts:
+                return self._training_opts.replication_factor
+            if self._inference_opts:
+                return self._inference_opts.replication_factor
         return len(self.parallel_devices)

-    def _create_opts(self, training: bool):
+    def _create_opts(self, training: bool) -> 'poptorch.Options':
         opts = poptorch.Options()
         opts.deviceIterations(self.device_iterations)
         opts.replicationFactor(self.replication_factor)
@@ -147,71 +155,44 @@ def _create_opts(self, training: bool):
     def training_opts(self) -> 'poptorch.Options':
         if self._training_opts is None:
             self._training_opts = self._create_opts(training=True)
-        self._validate_opts(self._training_opts, training=True)
         return self._training_opts

     @property
     def inference_opts(self) -> 'poptorch.Options':
         if self._inference_opts is None:
             self._inference_opts = self._create_opts(training=False)
-        self._validate_opts(self._inference_opts, training=False)
         return self._inference_opts

-    def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None:
-        if opts is not None:
-            if opts.replication_factor != self.replication_factor:
-                rank_zero_warn(
-                    f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} "
-                    f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. "
-                    f"Setting to {self.replication_factor} in the poptorch.Options."
-                )
-                opts.set(replication_factor=self.replication_factor)
-            if training:
-                accumulate_grad_batches = self.accumulate_grad_batches
-                if opts.Training.gradient_accumulation != accumulate_grad_batches:
-                    rank_zero_warn(
-                        f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. "
-                        f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. "
-                        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
-                        f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}"
-                    )
-                    opts.Training.set(gradient_accumulation=accumulate_grad_batches)
-            elif opts.Training.gradient_accumulation != 1:
-                rank_zero_warn(
-                    "Inference poptorch.Options should set gradientAccumulation to 1. "
-                    "Setting gradientAccumulation to 1 for inference options."
-                )
-                opts.Training.set(gradient_accumulation=1)
-
     @property
     def lightning_module(self) -> Optional['pl.LightningModule']:
         return self.model.module if isinstance(self.model, LightningIPUModule) else self.model

     def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=True)

     def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)

     def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)

     def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)

-    def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+    def _process_dataloader(
+        self,
+        dataloader: Union[Iterable, DataLoader],
+        is_training: bool,
+    ) -> Union[Iterable, DataLoader]:
         if isinstance(dataloader, CombinedLoader):
             dataloader.loaders = apply_to_collection(
-                dataloader.loaders,
-                DataLoader,
-                self.process_dataloader,
+                dataloader.loaders, DataLoader, self._process_dataloader, is_training
             )
             return dataloader
         if isinstance(dataloader, list):
-            dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader)
+            dataloader = apply_to_collection(dataloader, DataLoader, self._process_dataloader, is_training)
             return dataloader
         if not isinstance(dataloader, poptorch.DataLoader):
-            is_training = self.lightning_module.trainer.training
             opts = self.training_opts if is_training else self.inference_opts
             dataloader = self._convert_to_poptorch_loader(dataloader=dataloader, opts=opts)
         return dataloader
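
The key pieces above are the `replication_factor` fallback and the explicit `is_training` flag threaded through dataloader processing. Before the plugin is attached to a Trainer, user-supplied options decide the IPU count, with the training options taking priority. A condensed restatement of that precedence as a standalone helper (hypothetical name, for illustration only, not part of the plugin API):

from typing import Optional

def resolve_replication_factor(
    connected_to_trainer: bool,
    training_opts: Optional["poptorch.Options"],
    inference_opts: Optional["poptorch.Options"],
    parallel_devices: list,
) -> int:
    # Plugin not yet connected: custom poptorch.Options decide the IPU count,
    # and the training options win over the inference options.
    if not connected_to_trainer:
        if training_opts:
            return training_opts.replication_factor
        if inference_opts:
            return inference_opts.replication_factor
    # Otherwise fall back to the device list the Trainer built from `ipus=...`.
    return len(parallel_devices)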

pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 10 additions & 3 deletions
@@ -259,7 +259,7 @@ def on_tpu(self) -> bool:

     @property
     def on_ipu(self) -> bool:
-        return self.ipus is not None
+        return self.ipus is not None or isinstance(self._training_type_plugin, IPUPlugin)

     @property
     def tpu_id(self) -> Optional[int]:
@@ -327,6 +327,14 @@ def num_gpus(self) -> int:
             return 0
         return len(gpus)

+    @property
+    def num_ipus(self) -> int:
+        if isinstance(self.ipus, int):
+            return self.ipus
+        if isinstance(self._training_type_plugin, IPUPlugin):
+            return self._training_type_plugin.replication_factor
+        return 0
+
     @property
     def parallel_devices(self) -> List[Union[torch.device, int]]:
         if self.on_gpu:
@@ -337,8 +345,7 @@ def parallel_devices(self) -> List[Union[torch.device, int]]:
             if isinstance(self.tpu_cores, int):
                 devices = list(range(self.tpu_cores))
         elif self.on_ipu:
-            if isinstance(self.ipus, int):
-                devices = list(range(self.ipus))
+            devices = list(range(self.num_ipus))
         else:
             devices = [torch.device("cpu")] * self.num_processes
         return devices
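
The connector change means a run counts as an IPU run when either the `ipus=` flag is set or an `IPUPlugin` was passed in manually, and the new `num_ipus` property resolves the device count accordingly. A small sketch of that resolution order as a standalone function (illustrative only, assuming `IPUPlugin` is importable):

from pytorch_lightning.plugins import IPUPlugin

def resolve_num_ipus(ipus_flag, training_type_plugin) -> int:
    # An explicit `ipus=` flag takes precedence.
    if isinstance(ipus_flag, int):
        return ipus_flag
    # Otherwise a manually passed IPUPlugin supplies its replication factor.
    if isinstance(training_type_plugin, IPUPlugin):
        return training_type_plugin.replication_factor
    # Not an IPU run.
    return 0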

pytorch_lightning/trainer/properties.py

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ def tpu_cores(self) -> int:

     @property
     def ipus(self) -> int:
-        return self.accelerator_connector.ipus
+        return self.accelerator_connector.num_ipus

     @property
     def num_gpus(self) -> int:

pytorch_lightning/trainer/trainer.py

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@
 import torch

 import pytorch_lightning as pl
-from pytorch_lightning.accelerators import Accelerator
+from pytorch_lightning.accelerators import Accelerator, IPUAccelerator
 from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.core.memory import ModelSummary
@@ -1209,7 +1209,7 @@ def _log_device_info(self) -> None:
                 " `Trainer(tpu_cores=8)` or script `--tpu_cores=8`."
             )

-        if _IPU_AVAILABLE and self._device_type != DeviceType.IPU:
+        if _IPU_AVAILABLE and self._device_type != DeviceType.IPU and not isinstance(self.accelerator, IPUAccelerator):
             rank_zero_warn(
                 "IPU available but not used. Set the `ipus` flag in your trainer"
                 " `Trainer(ipus=8)` or script `--ipus=8`."

tests/accelerators/test_ipu.py

Lines changed: 60 additions & 114 deletions
@@ -23,6 +23,7 @@
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin
 from pytorch_lightning.trainer.states import RunningStage
+from pytorch_lightning.trainer.supporters import CombinedLoader
 from pytorch_lightning.utilities import _IPU_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
@@ -112,6 +113,19 @@ def test_accelerator_selected(tmpdir):
     assert isinstance(trainer.accelerator, IPUAccelerator)


+@RunIf(ipu=True)
+def test_warning_if_ipus_not_used(tmpdir):
+    with pytest.warns(UserWarning, match="IPU available but not used. Set the `ipus` flag in your trainer"):
+        Trainer(default_root_dir=tmpdir)
+
+
+@RunIf(ipu=True)
+def test_no_warning_plugin(tmpdir):
+    with pytest.warns(None) as record:
+        Trainer(default_root_dir=tmpdir, plugins=IPUPlugin(training_opts=poptorch.Options()))
+    assert len(record) == 0
+
+
 @RunIf(ipu=True)
 @pytest.mark.parametrize('ipus', [1, 4])
 def test_all_stages(tmpdir, ipus):
@@ -364,140 +378,72 @@ def test_manual_poptorch_opts(tmpdir):


 @RunIf(ipu=True)
-def test_manual_poptorch_opts_ipu_count(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and the number of ipus do not match, we warn and we set it for the user.
-    """
-
-    manual_ipus = 1
-    expected_ipus = 2
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.replicationFactor(manual_ipus)
-
-    training_opts = poptorch.Options()
-    training_opts.replicationFactor(manual_ipus)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=expected_ipus,
-        fast_dev_run=True,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} "
-        f"which differs to the ipus={expected_ipus} flag passed to the Trainer. "
-        f"Setting to {expected_ipus} in the poptorch.Options."
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2
-    assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_inference_grad_accum(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and grad accumulation is set greater than 1 for inference, we warn and set to 1.
-    """
-
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.Training.gradientAccumulation(4)
-
-    training_opts = poptorch.Options()
-    training_opts.Training.gradientAccumulation(1)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=1,
-        fast_dev_run=True,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match="Inference poptorch.Options should set gradientAccumulation to 1. "
-        "Setting gradientAccumulation to 1 for inference options.",
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_train_grad_accum(tmpdir):
+def test_manual_poptorch_opts_custom(tmpdir):
     """
-    Ensure if the user passes manual poptorch Options
-    and grad accumulation differs to accumulate_grad_batches, we
+    Ensure if the user passes manual poptorch Options with custom parameters set,
+    we respect them in our poptorch options and the dataloaders.
     """

     model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.Training.gradientAccumulation(1)
-
     training_opts = poptorch.Options()
+    training_opts.deviceIterations(8)
+    training_opts.replicationFactor(2)
     training_opts.Training.gradientAccumulation(2)

-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=1,
-        fast_dev_run=True,
-        accumulate_grad_batches=1,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match=f"Training poptorch.Options set gradientAccumulation to {2}. "
-        f"This is different to accumulate_grad_batches which was set to {1}. "
-        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
-        f"Setting poptorch.Options gradientAccumulation to {1}",
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_custom(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options with custom parameters set,
-    we respect them in our poptorch options.
-    """
-
-    model = IPUModel()
     inference_opts = poptorch.Options()
     inference_opts.deviceIterations(16)
-    inference_opts.replicationFactor(2)
+    inference_opts.replicationFactor(1)
     inference_opts.Training.gradientAccumulation(1)

-    training_opts = poptorch.Options()
-    training_opts.deviceIterations(8)
-    training_opts.replicationFactor(2)
-    training_opts.Training.gradientAccumulation(2)
+    class TestCallback(Callback):

-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=2,
-        fast_dev_run=True,
-        accumulate_grad_batches=2,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
+        def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
+            # ensure dataloaders were correctly set up during training.
+            plugin = trainer.accelerator.training_type_plugin
+            assert isinstance(plugin, IPUPlugin)
+            assert plugin.training_opts.replication_factor == 2
+            assert plugin.inference_opts.replication_factor == 1
+
+            val_dataloader = trainer.val_dataloaders[0]
+            train_dataloader = trainer.train_dataloader
+            assert isinstance(train_dataloader, CombinedLoader)
+            train_dataloader = train_dataloader.loaders
+            assert isinstance(val_dataloader, poptorch.DataLoader)
+            assert isinstance(train_dataloader, poptorch.DataLoader)
+            assert train_dataloader.options.replication_factor == 2
+            assert val_dataloader.options.replication_factor == 1
+
+    plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
+    # ensure we default to the training options replication factor
+    assert plugin.replication_factor == 2
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin, callbacks=TestCallback())
     trainer.fit(model)
+
     plugin = trainer.accelerator.training_type_plugin
     assert isinstance(plugin, IPUPlugin)
-    inference_opts = plugin.inference_opts
-    training_opts = plugin.training_opts
-    assert inference_opts.device_iterations == 16
-    assert inference_opts.replication_factor == 2
-    assert inference_opts.Training.gradient_accumulation == 1

+    training_opts = plugin.training_opts
     assert training_opts.device_iterations == 8
     assert training_opts.replication_factor == 2
     assert training_opts.Training.gradient_accumulation == 2

+    inference_opts = plugin.inference_opts
+    assert inference_opts.device_iterations == 16
+    assert inference_opts.replication_factor == 1
+    assert inference_opts.Training.gradient_accumulation == 1
+
+
+@RunIf(ipu=True)
+def test_replication_factor(tmpdir):
+    """
+    Ensure if the user passes manual poptorch Options with custom parameters set,
+    we set them correctly in the dataloaders.
+    """
+
+    plugin = IPUPlugin()
+    trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin)
+    assert trainer.ipus == 2
+

 @RunIf(ipu=True)
 def test_default_opts(tmpdir):