From 7a21b505f3af11d0667d11e34e97114672640188 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 15:20:15 +0530 Subject: [PATCH 01/17] add exceptions and test --- pytorch_lightning/accelerators/accelerator.py | 12 ++++----- .../trainer/connectors/data_connector.py | 26 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index ea9cb03d18366..c4cb9e016e425 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -73,9 +73,9 @@ def setup(self, trainer: 'Trainer', model: LightningModule) -> None: trainer: the trainer instance to connect to model: the model to train """ - self.connect_training_type_plugin(self.training_type_plugin, model) + self.connect_training_type_plugin(model) self.setup_optimizers(trainer) - self.connect_precision_plugin(self.precision_plugin) + self.connect_precision_plugin() def start_training(self, trainer: 'Trainer') -> None: self.training_type_plugin.start_training(trainer) @@ -332,16 +332,16 @@ def setup_optimizers(self, trainer: 'Trainer') -> None: self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_training_type_plugin(self, plugin: TrainingTypePlugin, model: LightningModule) -> None: + def connect_training_type_plugin(self, model: LightningModule) -> None: """Attaches the training type plugin to the accelerator. Also transfers ownership of the model to this plugin """ - plugin.connect(model) + self.training_type_plugin.connect(model) - def connect_precision_plugin(self, plugin: PrecisionPlugin) -> None: + def connect_precision_plugin(self) -> None: """Attaches the precision plugin to the accelerator""" - model, optimizers, schedulers = plugin.connect(self.model, self.optimizers, self.lr_schedulers) + model, optimizers, schedulers = self.precision_plugin.connect(self.model, self.optimizers, self.lr_schedulers) self.model = model self.optimizers = optimizers self.schedulers = schedulers diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index 97484e5f473fd..6a604f1168e4b 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -121,22 +121,20 @@ def attach_datamodule(self, model, datamodule: Optional[LightningDataModule]) -> if datamodule: # Override loader hooks - if is_overridden('train_dataloader', datamodule): - model.train_dataloader = datamodule.train_dataloader - if is_overridden('val_dataloader', datamodule): - model.val_dataloader = datamodule.val_dataloader - if is_overridden('test_dataloader', datamodule): - model.test_dataloader = datamodule.test_dataloader - if is_overridden('predict_dataloader', datamodule): - model.predict_dataloader = datamodule.predict_dataloader + dl_methods = ['train_dataloader', 'val_dataloader', 'test_dataloader', 'predict_dataloader'] + for method in dl_methods: + if is_overridden(method, datamodule): + setattr(model, method, getattr(datamodule, method)) # Override data transfer hooks if dataset-specific to_device logic has been defined in datamodule - if is_overridden('on_before_batch_transfer', datamodule): - model.on_before_batch_transfer = datamodule.on_before_batch_transfer - if is_overridden('transfer_batch_to_device', datamodule): - model.transfer_batch_to_device = datamodule.transfer_batch_to_device - if 
is_overridden('on_after_batch_transfer', datamodule): - model.on_after_batch_transfer = datamodule.on_after_batch_transfer + batch_transfer_hooks = ['on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer'] + for hook in batch_transfer_hooks: + if is_overridden(hook, datamodule): + setattr(model, hook, getattr(datamodule, hook)) + + # Raise Misconfiguration exception since these hooks are not supported in DP mode + if self.trainer.accelerator_connector.use_dp and is_overridden(model, hook): + raise MisconfigurationException(f'Overriding `{hook}` is not supported in DP mode.') self.trainer.datamodule = datamodule datamodule.trainer = self.trainer From 85e1e978af70a4a75468ab4df9c9bedb8d82bb68 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 15:34:08 +0530 Subject: [PATCH 02/17] hook --- pytorch_lightning/trainer/connectors/data_connector.py | 2 +- tests/accelerators/test_dp.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index 6a604f1168e4b..c7605eea19f9e 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -133,7 +133,7 @@ def attach_datamodule(self, model, datamodule: Optional[LightningDataModule]) -> setattr(model, hook, getattr(datamodule, hook)) # Raise Misconfiguration exception since these hooks are not supported in DP mode - if self.trainer.accelerator_connector.use_dp and is_overridden(model, hook): + if self.trainer.accelerator_connector.use_dp and is_overridden(hook, model): raise MisconfigurationException(f'Overriding `{hook}` is not supported in DP mode.') self.trainer.datamodule = datamodule diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 8aeb687f1c927..d751c2a5f38ee 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -17,6 +17,7 @@ import pytorch_lightning as pl import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils +from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.core import memory from tests.helpers import BoringModel From cd5570a8975a2fd2e37ceef5be18a1a7e47fe0e6 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 15:55:17 +0530 Subject: [PATCH 03/17] fix --- tests/accelerators/test_dp.py | 52 +++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index d751c2a5f38ee..0b2f86590ceb0 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import pytest import torch import torch.nn.functional as F @@ -20,6 +21,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.core import memory +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel from tests.helpers.datamodules import ClassifDataModule from tests.helpers.runif import RunIf @@ -124,3 +126,53 @@ def test_dp_test(tmpdir): new_weights = model.layer_0.weight.clone().detach().cpu() assert torch.all(torch.eq(old_weights, new_weights)) + + +@RunIf(min_gpus=2) +def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir): + """ + Test that an exception is raised when overriding batch_transfer_hooks in DP model. + """ + + class CustomModel(BoringModel): + + def transfer_batch_to_device(self, batch, device): + batch = batch.cuda() + return batch + + def transform_hook(self, batch, dataloader_idx): + batch += 1 + return batch + + tutils.set_random_master_port() + trainer_options = dict( + default_root_dir=tmpdir, + max_steps=7, + gpus=[0, 1], + accelerator='dp', + ) + + # Override transfer_batch_to_device hook only + trainer = Trainer(**trainer_options) + model = CustomModel() + + with pytest.raises(MisconfigurationException, match='Overriding `transfer_batch_to_device` is not *. in DP'): + trainer.fit(model) + + # Override on_before_batch_transfer hook only + trainer = Trainer(**trainer_options) + model = CustomModel() + model.transfer_batch_to_device = BoringModel().transfer_batch_to_device + model.on_before_batch_transfer_hook_rank = model.transform_hook + + with pytest.raises(MisconfigurationException, match='Overriding `on_before_batch_transfer` is not *. in DP'): + trainer.fit(model) + + # Override on_after_batch_transfer hook only + trainer = Trainer(**trainer_options) + model = CustomModel() + model.transfer_batch_to_device = BoringModel().transfer_batch_to_device + model.on_after_batch_transfer = model.transform_hook + + with pytest.raises(MisconfigurationException, match='Overriding `on_after_batch_transfer` is not *. 
in DP'): + trainer.fit(model) From 880418685156c9cbc315f25269a6aef10d22ab21 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 16:46:55 +0530 Subject: [PATCH 04/17] clean up --- .../trainer/connectors/data_connector.py | 12 ++++++---- pytorch_lightning/trainer/trainer.py | 23 ++++++++----------- tests/accelerators/test_dp.py | 2 +- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index c7605eea19f9e..2a8b84f97e146 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -82,6 +82,7 @@ def attach_data(self, model, train_dataloader, val_dataloaders, datamodule): # set up the passed in dataloaders (if needed) self.attach_dataloaders(model, train_dataloader, val_dataloaders) self.attach_datamodule(model, datamodule) + self._validate_data_hooks(model) def __enforce_datamodule_dataloader_override(self, train_dataloader, val_dataloaders, datamodule): # If you supply a datamodule you can't supply train_dataloader or val_dataloaders @@ -90,6 +91,13 @@ def __enforce_datamodule_dataloader_override(self, train_dataloader, val_dataloa 'You cannot pass train_dataloader or val_dataloaders to trainer.fit if you supply a datamodule' ) + def _validate_data_hooks(self, model): + # Raise Misconfiguration exception since these hooks are not supported in DP mode + batch_transfer_hooks = ['on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer'] + for hook in batch_transfer_hooks: + if self.trainer.accelerator_connector.use_dp and is_overridden(hook, model): + raise MisconfigurationException(f'Overriding `{hook}` is not supported in DP mode.') + def attach_dataloaders( self, model, @@ -132,10 +140,6 @@ def attach_datamodule(self, model, datamodule: Optional[LightningDataModule]) -> if is_overridden(hook, datamodule): setattr(model, hook, getattr(datamodule, hook)) - # Raise Misconfiguration exception since these hooks are not supported in DP mode - if self.trainer.accelerator_connector.use_dp and is_overridden(hook, model): - raise MisconfigurationException(f'Overriding `{hook}` is not supported in DP mode.') - self.trainer.datamodule = datamodule datamodule.trainer = self.trainer diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4a1e5ec8986c6..a71aed0a0ec78 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -879,19 +879,24 @@ def test( 'You cannot pass test_dataloaders to trainer.test if you supply a datamodule' ) - # Attach datamodule to get setup/prepare_data added to model before the call to it below - self.data_connector.attach_datamodule(model or self.lightning_module, datamodule) + if datamodule is not None: + # Attach datamodule to get setup/prepare_data added to model before the call to it below + self.data_connector.attach_datamodule(model or self.lightning_module, datamodule) + + # attach dataloaders + if test_dataloaders is not None: + self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders) if model is not None: results = self.__test_given_model(model, test_dataloaders) else: - results = self.__test_using_best_weights(ckpt_path, test_dataloaders) + results = self.__test_using_ckpt_weights(ckpt_path, test_dataloaders) self.teardown('test') self._running_stage = None return results - def __test_using_best_weights(self, ckpt_path, test_dataloaders): + def 
__test_using_ckpt_weights(self, ckpt_path): model = self.lightning_module # if user requests the best checkpoint but we don't have it, error @@ -918,10 +923,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt['state_dict']) - # attach dataloaders - if test_dataloaders is not None: - self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders) - # run tests self.tested_ckpt_path = ckpt_path results = self.fit(model) @@ -933,11 +934,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): return results - def __test_given_model(self, model, test_dataloaders): - - # attach data - if test_dataloaders is not None: - self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders) + def __test_given_model(self, model): # run test # sets up testing so we short circuit to eval diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 0b2f86590ceb0..30810f67c59eb 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -163,7 +163,7 @@ def transform_hook(self, batch, dataloader_idx): trainer = Trainer(**trainer_options) model = CustomModel() model.transfer_batch_to_device = BoringModel().transfer_batch_to_device - model.on_before_batch_transfer_hook_rank = model.transform_hook + model.on_before_batch_transfer = model.transform_hook with pytest.raises(MisconfigurationException, match='Overriding `on_before_batch_transfer` is not *. in DP'): trainer.fit(model) From e43650d8cccaf2324ce0a088ce406bf36976c456 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 16:59:37 +0530 Subject: [PATCH 05/17] clean up --- pytorch_lightning/trainer/trainer.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a71aed0a0ec78..c4ab7e08463e5 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -883,20 +883,16 @@ def test( # Attach datamodule to get setup/prepare_data added to model before the call to it below self.data_connector.attach_datamodule(model or self.lightning_module, datamodule) - # attach dataloaders - if test_dataloaders is not None: - self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders) - if model is not None: results = self.__test_given_model(model, test_dataloaders) else: - results = self.__test_using_ckpt_weights(ckpt_path, test_dataloaders) + results = self.__test_using_best_weights(ckpt_path, test_dataloaders) self.teardown('test') self._running_stage = None return results - def __test_using_ckpt_weights(self, ckpt_path): + def __test_using_best_weights(self, ckpt_path, test_dataloaders): model = self.lightning_module # if user requests the best checkpoint but we don't have it, error @@ -923,6 +919,10 @@ def __test_using_ckpt_weights(self, ckpt_path): ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt['state_dict']) + # attach dataloaders + if test_dataloaders is not None: + self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders) + # run tests self.tested_ckpt_path = ckpt_path results = self.fit(model) @@ -934,7 +934,11 @@ def __test_using_ckpt_weights(self, ckpt_path): return results - def __test_given_model(self, model): + def __test_given_model(self, model, test_dataloaders): + + # attach dataloaders + if test_dataloaders is 
not None: + self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders) # run test # sets up testing so we short circuit to eval From 2a98e5e0b101340acf7e58dcff489381227fa0f7 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 17:03:02 +0530 Subject: [PATCH 06/17] regex --- tests/accelerators/test_dp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 30810f67c59eb..9d25be4d7deb0 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -156,7 +156,7 @@ def transform_hook(self, batch, dataloader_idx): trainer = Trainer(**trainer_options) model = CustomModel() - with pytest.raises(MisconfigurationException, match='Overriding `transfer_batch_to_device` is not *. in DP'): + with pytest.raises(MisconfigurationException, match=r'Overriding `transfer_batch_to_device` is not *. in DP'): trainer.fit(model) # Override on_before_batch_transfer hook only @@ -165,7 +165,7 @@ def transform_hook(self, batch, dataloader_idx): model.transfer_batch_to_device = BoringModel().transfer_batch_to_device model.on_before_batch_transfer = model.transform_hook - with pytest.raises(MisconfigurationException, match='Overriding `on_before_batch_transfer` is not *. in DP'): + with pytest.raises(MisconfigurationException, match=r'Overriding `on_before_batch_transfer` is not *. in DP'): trainer.fit(model) # Override on_after_batch_transfer hook only @@ -174,5 +174,5 @@ def transform_hook(self, batch, dataloader_idx): model.transfer_batch_to_device = BoringModel().transfer_batch_to_device model.on_after_batch_transfer = model.transform_hook - with pytest.raises(MisconfigurationException, match='Overriding `on_after_batch_transfer` is not *. in DP'): + with pytest.raises(MisconfigurationException, match=r'Overriding `on_after_batch_transfer` is not *. in DP'): trainer.fit(model) From b902903941bf47f6070d1cca058f46d3a5fa3874 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 17:10:54 +0530 Subject: [PATCH 07/17] regex --- tests/accelerators/test_dp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 9d25be4d7deb0..a202b3f84b2a8 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -156,7 +156,7 @@ def transform_hook(self, batch, dataloader_idx): trainer = Trainer(**trainer_options) model = CustomModel() - with pytest.raises(MisconfigurationException, match=r'Overriding `transfer_batch_to_device` is not *. in DP'): + with pytest.raises(MisconfigurationException, match=r'Overriding `transfer_batch_to_device` is not .* in DP'): trainer.fit(model) # Override on_before_batch_transfer hook only @@ -165,7 +165,7 @@ def transform_hook(self, batch, dataloader_idx): model.transfer_batch_to_device = BoringModel().transfer_batch_to_device model.on_before_batch_transfer = model.transform_hook - with pytest.raises(MisconfigurationException, match=r'Overriding `on_before_batch_transfer` is not *. 
in DP'): + with pytest.raises(MisconfigurationException, match=r'Overriding `on_before_batch_transfer` is not .* in DP'): trainer.fit(model) # Override on_after_batch_transfer hook only From ad0d5192eed2fcad0f7d53770816af8d8cf3441e Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 17:26:38 +0530 Subject: [PATCH 08/17] docs --- pytorch_lightning/core/hooks.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index e0b33c1219e8b..558615947ff24 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -585,7 +585,6 @@ def transfer_batch_to_device(self, batch: Any, device: Optional[torch.device] = any other device than the one passed in as argument (unless you know what you are doing). Note: - This hook only runs on single GPU training and DDP (no data-parallel). If you need multi-GPU support for your custom batch objects, you need to define your custom :class:`~torch.nn.parallel.DistributedDataParallel` or :class:`~pytorch_lightning.overrides.data_parallel.LightningDistributedDataParallel` and @@ -609,6 +608,10 @@ def transfer_batch_to_device(self, batch, device): batch = super().transfer_batch_to_device(data, device) return batch + Raises: + MisconfigurationException: + If using data-parallel(``accelerator='dp'``) in ``Trainer``. + See Also: - :meth:`move_data_to_device` - :meth:`apply_to_collection` @@ -622,9 +625,6 @@ def on_before_batch_transfer(self, batch, dataloader_idx): .. warning:: dataloader_idx always returns 0, and will be updated to support the true idx in the future. - Note: - This hook only runs on single GPU training and DDP (no data-parallel). - Args: batch: A batch of data that needs to be altered or augmented. dataloader_idx: DataLoader idx for batch @@ -637,6 +637,9 @@ def on_before_batch_transfer(self, batch, dataloader_idx): def on_before_batch_transfer(self, batch, dataloader_idx): batch['x'] = transforms(batch['x']) return batch + Raises: + MisconfigurationException: + If using data-parallel(``accelerator='dp'``) in ``Trainer``. See Also: - :meth:`on_after_batch_transfer` @@ -650,9 +653,6 @@ def on_after_batch_transfer(self, batch, dataloader_idx): .. warning:: ``dataloader_idx`` always returns 0, and will be updated to support the true ``idx`` in the future. - Note: - This hook only runs on single GPU training and DDP (no data-parallel). - Args: batch: A batch of data that needs to be altered or augmented. dataloader_idx: DataLoader idx for batch (Default: 0) @@ -666,6 +666,10 @@ def on_after_batch_transfer(self, batch, dataloader_idx): batch['x'] = gpu_transforms(batch['x']) return batch + Raises: + MisconfigurationException: + If using data-parallel(``accelerator='dp'``) in ``Trainer``. 
+ See Also: - :meth:`on_before_batch_transfer` - :meth:`transfer_batch_to_device` From d0c7f95ffc4b3b055f93e50c560cd0e726dba087 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 18:17:47 +0530 Subject: [PATCH 09/17] rev --- pytorch_lightning/accelerators/accelerator.py | 12 ++++----- pytorch_lightning/trainer/trainer.py | 9 +++---- tests/accelerators/test_dp.py | 25 ++++++++++--------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index c4cb9e016e425..ea9cb03d18366 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -73,9 +73,9 @@ def setup(self, trainer: 'Trainer', model: LightningModule) -> None: trainer: the trainer instance to connect to model: the model to train """ - self.connect_training_type_plugin(model) + self.connect_training_type_plugin(self.training_type_plugin, model) self.setup_optimizers(trainer) - self.connect_precision_plugin() + self.connect_precision_plugin(self.precision_plugin) def start_training(self, trainer: 'Trainer') -> None: self.training_type_plugin.start_training(trainer) @@ -332,16 +332,16 @@ def setup_optimizers(self, trainer: 'Trainer') -> None: self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies - def connect_training_type_plugin(self, model: LightningModule) -> None: + def connect_training_type_plugin(self, plugin: TrainingTypePlugin, model: LightningModule) -> None: """Attaches the training type plugin to the accelerator. Also transfers ownership of the model to this plugin """ - self.training_type_plugin.connect(model) + plugin.connect(model) - def connect_precision_plugin(self) -> None: + def connect_precision_plugin(self, plugin: PrecisionPlugin) -> None: """Attaches the precision plugin to the accelerator""" - model, optimizers, schedulers = self.precision_plugin.connect(self.model, self.optimizers, self.lr_schedulers) + model, optimizers, schedulers = plugin.connect(self.model, self.optimizers, self.lr_schedulers) self.model = model self.optimizers = optimizers self.schedulers = schedulers diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c4ab7e08463e5..7bfc3d41f9a8d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -879,9 +879,8 @@ def test( 'You cannot pass test_dataloaders to trainer.test if you supply a datamodule' ) - if datamodule is not None: - # Attach datamodule to get setup/prepare_data added to model before the call to it below - self.data_connector.attach_datamodule(model or self.lightning_module, datamodule) + # Attach datamodule to get setup/prepare_data added to model before the call to it below + self.data_connector.attach_datamodule(model or self.lightning_module, datamodule, 'test') if model is not None: results = self.__test_given_model(model, test_dataloaders) @@ -936,7 +935,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): def __test_given_model(self, model, test_dataloaders): - # attach dataloaders + # attach data if test_dataloaders is not None: self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders) @@ -990,7 +989,7 @@ def predict( if datamodule is not None: # Attach datamodule to get setup/prepare_data added to model before the call to it below - self.data_connector.attach_datamodule(model, datamodule) + self.data_connector.attach_datamodule(model, datamodule, 'predict') # 
attach data if dataloaders is not None: diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index a202b3f84b2a8..64d9bedefbdc0 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -137,11 +137,7 @@ def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir): class CustomModel(BoringModel): def transfer_batch_to_device(self, batch, device): - batch = batch.cuda() - return batch - - def transform_hook(self, batch, dataloader_idx): - batch += 1 + batch = batch.to(device) return batch tutils.set_random_master_port() @@ -152,27 +148,32 @@ def transform_hook(self, batch, dataloader_idx): accelerator='dp', ) - # Override transfer_batch_to_device hook only trainer = Trainer(**trainer_options) model = CustomModel() with pytest.raises(MisconfigurationException, match=r'Overriding `transfer_batch_to_device` is not .* in DP'): trainer.fit(model) - # Override on_before_batch_transfer hook only + class CustomModel(BoringModel): + + def on_before_batch_transfer(self, batch, dataloader_idx): + batch += 1 + return batch + trainer = Trainer(**trainer_options) model = CustomModel() - model.transfer_batch_to_device = BoringModel().transfer_batch_to_device - model.on_before_batch_transfer = model.transform_hook with pytest.raises(MisconfigurationException, match=r'Overriding `on_before_batch_transfer` is not .* in DP'): trainer.fit(model) - # Override on_after_batch_transfer hook only + class CustomModel(BoringModel): + + def on_after_batch_transfer(self, batch, dataloader_idx): + batch += 1 + return batch + trainer = Trainer(**trainer_options) model = CustomModel() - model.transfer_batch_to_device = BoringModel().transfer_batch_to_device - model.on_after_batch_transfer = model.transform_hook with pytest.raises(MisconfigurationException, match=r'Overriding `on_after_batch_transfer` is not *. in DP'): trainer.fit(model) From 812f909e99b72a9668f67f3395f1daced4510a9e Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 22:11:33 +0530 Subject: [PATCH 10/17] comment and docs --- pytorch_lightning/accelerators/gpu.py | 11 ++++++++++- pytorch_lightning/core/hooks.py | 13 ++++++++++++- .../trainer/connectors/data_connector.py | 1 + tests/accelerators/test_dp.py | 2 +- 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 785a0cc8ddea8..f4698903bf1a7 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -1,10 +1,11 @@ import logging import os -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import torch from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.plugins import DataParallelPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException if TYPE_CHECKING: @@ -43,3 +44,11 @@ def set_nvidia_flags() -> None: all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) _log.info(f"LOCAL_RANK: {os.getenv('LOCAL_RANK', 0)} - CUDA_VISIBLE_DEVICES: [{devices}]") + + def to_device(self, batch: Any) -> Any: + # no need to transfer batch to device in DP mode + # TODO: Add supported to allow batch transfer to device in Lightning for DP mode. 
+ if not isinstance(self.training_type_plugin, DataParallelPlugin): + batch = super().to_device(batch) + + return batch diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 558615947ff24..60b75aafc7780 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -581,8 +581,10 @@ def transfer_batch_to_device(self, batch: Any, device: Optional[torch.device] = For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...). Note: + This hook only runs on single GPU training and DDP (no data-parallel). This hook should only transfer the data and not modify it, nor should it move the data to any other device than the one passed in as argument (unless you know what you are doing). + Data-Parallel support will come in near future. Note: If you need multi-GPU support for your custom batch objects, you need to define your custom @@ -623,7 +625,11 @@ def on_before_batch_transfer(self, batch, dataloader_idx): """ Override to alter or apply batch augmentations to your batch before it is transferred to the device. - .. warning:: dataloader_idx always returns 0, and will be updated to support the true idx in the future. + .. warning:: ``dataloader_idx`` always returns 0, and will be updated to support the true ``idx`` in the future. + + Note: + This hook only runs on single GPU training and DDP (no data-parallel). + Data-Parallel support will come in near future. Args: batch: A batch of data that needs to be altered or augmented. @@ -637,6 +643,7 @@ def on_before_batch_transfer(self, batch, dataloader_idx): def on_before_batch_transfer(self, batch, dataloader_idx): batch['x'] = transforms(batch['x']) return batch + Raises: MisconfigurationException: If using data-parallel(``accelerator='dp'``) in ``Trainer``. @@ -653,6 +660,10 @@ def on_after_batch_transfer(self, batch, dataloader_idx): .. warning:: ``dataloader_idx`` always returns 0, and will be updated to support the true ``idx`` in the future. + Note: + This hook only runs on single GPU training and DDP (no data-parallel). + Data-Parallel support will come in near future. + Args: batch: A batch of data that needs to be altered or augmented. dataloader_idx: DataLoader idx for batch (Default: 0) diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index 2a8b84f97e146..314669e00bf98 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -93,6 +93,7 @@ def __enforce_datamodule_dataloader_override(self, train_dataloader, val_dataloa def _validate_data_hooks(self, model): # Raise Misconfiguration exception since these hooks are not supported in DP mode + # TODO: Remove this blocker once batch transfer to device is integrated in Lightning for DP mode. batch_transfer_hooks = ['on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer'] for hook in batch_transfer_hooks: if self.trainer.accelerator_connector.use_dp and is_overridden(hook, model): diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 64d9bedefbdc0..8ba1261a1b307 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -175,5 +175,5 @@ def on_after_batch_transfer(self, batch, dataloader_idx): trainer = Trainer(**trainer_options) model = CustomModel() - with pytest.raises(MisconfigurationException, match=r'Overriding `on_after_batch_transfer` is not *. 
in DP'): + with pytest.raises(MisconfigurationException, match=r'Overriding `on_after_batch_transfer` is not .* in DP'): trainer.fit(model) From 0a6b75d8aa238538c15d72044825dc8b35f42d9c Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 20 Feb 2021 23:06:09 +0530 Subject: [PATCH 11/17] chlog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27e5f4be2d04a..903a9dc089a11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -110,6 +110,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed error message for AMP + CPU incompatibility ([#6107](https://github.com/PyTorchLightning/pytorch-lightning/pull/6107)) +- Disabled batch transfer in DP mode ([#6093](https://github.com/PyTorchLightning/pytorch-lightning/pull/6093)) + + ## [1.2.0] - 2021-02-18 ### Added From 095602ee94064510059bd6b0cf4bb63050853457 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 22 Feb 2021 11:51:24 +0530 Subject: [PATCH 12/17] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos MocholĂ­ --- pytorch_lightning/accelerators/gpu.py | 2 +- pytorch_lightning/core/hooks.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index f4698903bf1a7..8f4fd02a12522 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -47,7 +47,7 @@ def set_nvidia_flags() -> None: def to_device(self, batch: Any) -> Any: # no need to transfer batch to device in DP mode - # TODO: Add supported to allow batch transfer to device in Lightning for DP mode. + # TODO: Add support to allow batch transfer to device in Lightning for DP mode. if not isinstance(self.training_type_plugin, DataParallelPlugin): batch = super().to_device(batch) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 60b75aafc7780..0d0fa2bdcb5cd 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -612,7 +612,7 @@ def transfer_batch_to_device(self, batch, device): Raises: MisconfigurationException: - If using data-parallel(``accelerator='dp'``) in ``Trainer``. + If using data-parallel, ``Trainer(accelerator='dp')``. See Also: - :meth:`move_data_to_device` @@ -625,7 +625,7 @@ def on_before_batch_transfer(self, batch, dataloader_idx): """ Override to alter or apply batch augmentations to your batch before it is transferred to the device. - .. warning:: ``dataloader_idx`` always returns 0, and will be updated to support the true ``idx`` in the future. + .. warning:: ``dataloader_idx`` always returns 0, and will be updated to support the true index in the future. Note: This hook only runs on single GPU training and DDP (no data-parallel). @@ -646,7 +646,7 @@ def on_before_batch_transfer(self, batch, dataloader_idx): Raises: MisconfigurationException: - If using data-parallel(``accelerator='dp'``) in ``Trainer``. + If using data-parallel, ``Trainer(accelerator='dp')``. See Also: - :meth:`on_after_batch_transfer` @@ -679,7 +679,7 @@ def on_after_batch_transfer(self, batch, dataloader_idx): Raises: MisconfigurationException: - If using data-parallel(``accelerator='dp'``) in ``Trainer``. + If using data-parallel, ``Trainer(accelerator='dp')``. 
See Also: - :meth:`on_before_batch_transfer` From 38fc6950bfc599c3e7a1ec566d3d62c650b267c4 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 22 Feb 2021 16:15:19 +0530 Subject: [PATCH 13/17] Apply suggestions from code review Co-authored-by: chaton --- pytorch_lightning/trainer/connectors/data_connector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index 314669e00bf98..7bafdedf80602 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -94,7 +94,7 @@ def __enforce_datamodule_dataloader_override(self, train_dataloader, val_dataloa def _validate_data_hooks(self, model): # Raise Misconfiguration exception since these hooks are not supported in DP mode # TODO: Remove this blocker once batch transfer to device is integrated in Lightning for DP mode. - batch_transfer_hooks = ['on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer'] + batch_transfer_hooks = ('on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer') for hook in batch_transfer_hooks: if self.trainer.accelerator_connector.use_dp and is_overridden(hook, model): raise MisconfigurationException(f'Overriding `{hook}` is not supported in DP mode.') @@ -130,13 +130,13 @@ def attach_datamodule(self, model, datamodule: Optional[LightningDataModule]) -> if datamodule: # Override loader hooks - dl_methods = ['train_dataloader', 'val_dataloader', 'test_dataloader', 'predict_dataloader'] + dl_methods = ('train_dataloader', 'val_dataloader', 'test_dataloader', 'predict_dataloader') for method in dl_methods: if is_overridden(method, datamodule): setattr(model, method, getattr(datamodule, method)) # Override data transfer hooks if dataset-specific to_device logic has been defined in datamodule - batch_transfer_hooks = ['on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer'] + batch_transfer_hooks = ('on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer') for hook in batch_transfer_hooks: if is_overridden(hook, datamodule): setattr(model, hook, getattr(datamodule, hook)) From 40e1d47c56d56d59733826321dd16baa222aa15b Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 22 Feb 2021 13:36:26 +0100 Subject: [PATCH 14/17] Monkey-patch device count --- tests/accelerators/test_dp.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 8ba1261a1b307..6e6f69f909752 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -128,11 +128,11 @@ def test_dp_test(tmpdir): assert torch.all(torch.eq(old_weights, new_weights)) -@RunIf(min_gpus=2) -def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir): +def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir, monkeypatch): """ Test that an exception is raised when overriding batch_transfer_hooks in DP model. 
""" + monkeypatch.setattr("torch.cuda.device_count", lambda: 2) class CustomModel(BoringModel): @@ -140,7 +140,6 @@ def transfer_batch_to_device(self, batch, device): batch = batch.to(device) return batch - tutils.set_random_master_port() trainer_options = dict( default_root_dir=tmpdir, max_steps=7, From 8c7c85a7cfac7a9d34579130db773e753c9c2a70 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 22 Feb 2021 21:30:55 +0530 Subject: [PATCH 15/17] docs --- pytorch_lightning/core/hooks.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 0d0fa2bdcb5cd..52ff3ff8206c3 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -581,16 +581,12 @@ def transfer_batch_to_device(self, batch: Any, device: Optional[torch.device] = For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...). Note: - This hook only runs on single GPU training and DDP (no data-parallel). This hook should only transfer the data and not modify it, nor should it move the data to any other device than the one passed in as argument (unless you know what you are doing). - Data-Parallel support will come in near future. Note: - If you need multi-GPU support for your custom batch objects, you need to define your custom - :class:`~torch.nn.parallel.DistributedDataParallel` or - :class:`~pytorch_lightning.overrides.data_parallel.LightningDistributedDataParallel` and - override :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_ddp`. + This hook only runs on single GPU training and DDP (no data-parallel). + Data-Parallel support will come in near future. Args: batch: A batch of data that needs to be transferred to a new device. 
From 82cfd5f9974c10e166d5a8284e155eea086436be Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 6 Mar 2021 22:20:06 +0530 Subject: [PATCH 16/17] pep --- tests/accelerators/test_dp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 9e9e0942b06db..025605906fd50 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -194,4 +194,4 @@ def test_dp_training_step_dict(tmpdir): gpus=2, accelerator='dp', ) - trainer.fit(model) \ No newline at end of file + trainer.fit(model) From c90ba12825c3058abbb786cd358788766f8b90fb Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 6 Mar 2021 22:37:52 +0530 Subject: [PATCH 17/17] api_change --- pytorch_lightning/trainer/trainer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b1c8cc2d61547..dd0cd8c627965 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -875,8 +875,7 @@ def test( # Attach datamodule to get setup/prepare_data added to model before the call to it below self.data_connector.attach_datamodule(model, datamodule) results = ( - self.__evaluate_given_model(model, dataloaders=test_dataloaders) - if model_provided else + self.__evaluate_given_model(model, dataloaders=test_dataloaders) if model_provided else self.__evaluate_using_weights(model, ckpt_path=ckpt_path, dataloaders=test_dataloaders) ) @@ -991,7 +990,7 @@ def predict( if datamodule is not None: # Attach datamodule to get setup/prepare_data added to model before the call to it below - self.data_connector.attach_datamodule(model, datamodule, 'predict') + self.data_connector.attach_datamodule(model, datamodule) # attach data if dataloaders is not None:
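For reference, a minimal sketch of the user-facing behaviour this series introduces, kept outside the patches themselves: overriding any of the batch-transfer hooks (`on_before_batch_transfer`, `transfer_batch_to_device`, `on_after_batch_transfer`) on a model and then fitting with the DataParallel accelerator now raises a `MisconfigurationException` from the data connector. The snippet assumes the 1.2-era API used throughout the diffs (`Trainer(gpus=2, accelerator='dp')`), a machine with two visible GPUs, and an illustrative model/dataset (`MyModel`, `RandomDataset`) that are not part of the patch series.

    import torch
    from torch.utils.data import DataLoader, Dataset

    import pytorch_lightning as pl
    from pytorch_lightning import Trainer
    from pytorch_lightning.utilities.exceptions import MisconfigurationException


    class RandomDataset(Dataset):
        def __init__(self, size=32, length=64):
            self.data = torch.randn(length, size)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]


    class MyModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def transfer_batch_to_device(self, batch, device):
            # overriding this hook (or on_before/on_after_batch_transfer) is what
            # DataConnector._validate_data_hooks now rejects when use_dp is True
            return batch.to(device)

        def training_step(self, batch, batch_idx):
            return self.layer(batch).sum()

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)


    if __name__ == "__main__":
        model = MyModel()
        trainer = Trainer(max_steps=1, gpus=2, accelerator='dp')
        try:
            trainer.fit(model, DataLoader(RandomDataset(), batch_size=8))
        except MisconfigurationException as err:
            # "Overriding `transfer_batch_to_device` is not supported in DP mode."
            print(err)

The same exception is raised when the hook is overridden on a `LightningDataModule`, since `attach_datamodule` copies the overridden batch-transfer hooks onto the model before `_validate_data_hooks` runs.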