
Commit 15d5fd8

Merge branch 'master' into bugfix_5181
2 parents fae08de + 8001987 commit 15d5fd8

14 files changed: 69 additions & 43 deletions


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -89,6 +89,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `SingleTPU` calling `all_gather` ([#6296](https://github.com/PyTorchLightning/pytorch-lightning/pull/6296))


+- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))
+
+
 ## [1.2.1] - 2021-02-23

 ### Fixed

pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 5 additions & 5 deletions
@@ -536,12 +536,12 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()

-        # throw error to force user ddp or ddp2 choice
-        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
-        if (self.num_nodes > 1 and self._distrib_type not in _ddp):
+        using_valid_distributed = self.use_ddp or self.use_ddp2
+        if self.num_nodes > 1 and not using_valid_distributed:
+            # throw error to force user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
-                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
+                'Your chosen distributed type does not support num_nodes > 1. '
+                'Please set accelerator=ddp or accelerator=ddp2.'
             )

         rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
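
For context, a minimal sketch of the behavior this hunk enforces. The Trainer arguments below are illustrative only and assume a machine where two CUDA devices are visible; they are not taken from this commit.

```python
from pytorch_lightning import Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException

try:
    # 'dp' (DataParallel) cannot span machines, so combining it with
    # num_nodes > 1 is rejected instead of silently misbehaving.
    Trainer(accelerator="dp", gpus=2, num_nodes=2)
except MisconfigurationException as err:
    print(err)  # message suggests accelerator=ddp or accelerator=ddp2

# A supported multi-node configuration still constructs normally:
Trainer(accelerator="ddp", gpus=2, num_nodes=2)
```

Note that the old message claimed the backend was switched automatically while the code actually raised; the new wording matches what happens.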

tests/accelerators/test_accelerator_connector.py

Lines changed: 26 additions & 0 deletions
@@ -28,10 +28,13 @@
     DDPPlugin,
     DDPShardedPlugin,
     DDPSpawnPlugin,
+    DDPSpawnShardedPlugin,
+    DeepSpeedPlugin,
     PrecisionPlugin,
     SingleDevicePlugin,
 )
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -415,3 +418,26 @@ def test_plugin_accelerator_choice(accelerator, plugin):

     trainer = Trainer(plugins=plugin, num_processes=2)
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+
+@pytest.mark.parametrize(["accelerator", "plugin"], [
+    ('ddp', DDPPlugin),
+    ('ddp_spawn', DDPSpawnPlugin),
+    ('ddp_sharded', DDPShardedPlugin),
+    ('ddp_sharded_spawn', DDPSpawnShardedPlugin),
+    pytest.param(
+        'deepspeed',
+        DeepSpeedPlugin,
+        marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+    ),
+])
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_is_available, mock_device_count, accelerator, plugin, tmpdir):
+    trainer = Trainer(
+        accelerator=accelerator,
+        default_root_dir=tmpdir,
+        num_nodes=2,
+        gpus=2,
+    )
+    assert isinstance(trainer.training_type_plugin, plugin)

tests/core/test_memory.py

Lines changed: 1 addition & 3 deletions
@@ -17,7 +17,6 @@

 from pytorch_lightning import LightningModule, Trainer
 from pytorch_lightning.core.memory import ModelSummary, UNKNOWN_SIZE
-from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.advanced_models import ParityModuleRNN
@@ -292,8 +291,7 @@ def test_empty_model_size(mode):
     assert 0.0 == summary.model_size


-@RunIf(min_gpus=1)
-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires native AMP.")
+@RunIf(min_gpus=1, amp_native=True)
 @pytest.mark.parametrize(
     'precision', [
         pytest.param(16, marks=pytest.mark.skip(reason="no longer valid, because 16 can mean mixed precision")),

tests/helpers/runif.py

Lines changed: 13 additions & 1 deletion
@@ -19,7 +19,7 @@
 import torch
 from pkg_resources import get_distribution

-from pytorch_lightning.utilities import _TORCH_QUANTIZE_AVAILABLE
+from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, _TORCH_QUANTIZE_AVAILABLE


 class RunIf:
@@ -38,6 +38,8 @@ def __new__(
         min_gpus: int = 0,
         min_torch: Optional[str] = None,
         quantization: bool = False,
+        amp_apex: bool = False,
+        amp_native: bool = False,
         skip_windows: bool = False,
         **kwargs
     ):
@@ -47,6 +49,8 @@ def __new__(
             min_gpus: min number of gpus required to run test
             min_torch: minimum pytorch version to run test
             quantization: if `torch.quantization` package is required to run test
+            amp_apex: if NVIDIA Apex is required to run test
+            amp_native: if native PyTorch AMP is required to run test
             skip_windows: skip test for Windows platform (typically for some limited torch functionality)
             kwargs: native pytest.mark.skipif keyword arguments
         """
@@ -67,6 +71,14 @@ def __new__(
             conditions.append(not _TORCH_QUANTIZE_AVAILABLE or _miss_default)
             reasons.append("missing PyTorch quantization")

+        if amp_native:
+            conditions.append(not _NATIVE_AMP_AVAILABLE)
+            reasons.append("missing native AMP")
+
+        if amp_apex:
+            conditions.append(not _APEX_AVAILABLE)
+            reasons.append("missing NVIDIA Apex")
+
         if skip_windows:
             conditions.append(sys.platform == "win32")
             reasons.append("unimplemented on Windows")

tests/models/test_amp.py

Lines changed: 1 addition & 3 deletions
@@ -22,7 +22,6 @@
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins.environments import SLURMEnvironment
 from pytorch_lightning.trainer.states import TrainerState
-from pytorch_lightning.utilities import _APEX_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
@@ -193,8 +192,7 @@ def test_amp_without_apex(tmpdir):


 @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
-@RunIf(min_gpus=1)
-@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
+@RunIf(min_gpus=1, amp_apex=True)
 def test_amp_with_apex(tmpdir):
     """Check calling apex scaling in training."""

tests/models/test_horovod.py

Lines changed: 3 additions & 5 deletions
@@ -28,7 +28,7 @@
 from pytorch_lightning.accelerators import CPUAccelerator
 from pytorch_lightning.metrics.classification.accuracy import Accuracy
 from pytorch_lightning.trainer.states import TrainerState
-from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE
+from pytorch_lightning.utilities import _HOROVOD_AVAILABLE
 from tests.helpers import BoringModel
 from tests.helpers.advanced_models import BasicGAN
 from tests.helpers.runif import RunIf
@@ -120,8 +120,7 @@ def test_horovod_multi_gpu(tmpdir):

 @pytest.mark.skip(reason="Horovod has a problem with broadcast when using apex?")
 @pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
-@RunIf(min_gpus=2, skip_windows=True)
-@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
+@RunIf(min_gpus=2, skip_windows=True, amp_apex=True)
 def test_horovod_apex(tmpdir):
     """Test Horovod with multi-GPU support using apex amp."""
     trainer_options = dict(
@@ -143,8 +142,7 @@ def test_horovod_apex(tmpdir):

 @pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp")
 @pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
-@RunIf(min_gpus=2, skip_windows=True)
-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires torch.cuda.amp")
+@RunIf(min_gpus=2, skip_windows=True, amp_native=True)
 def test_horovod_amp(tmpdir):
     """Test Horovod with multi-GPU support using native amp."""
     trainer_options = dict(

tests/plugins/test_amp_plugin.py

Lines changed: 3 additions & 6 deletions
@@ -6,12 +6,11 @@

 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import NativeMixedPrecisionPlugin
-from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
+@RunIf(amp_native=True)
 @mock.patch.dict(
     os.environ, {
         "CUDA_VISIBLE_DEVICES": "0,1",
@@ -49,8 +48,7 @@ def on_after_backward(self):
         assert norm.item() < 15.


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
-@RunIf(min_gpus=2)
+@RunIf(min_gpus=2, amp_native=True)
 def test_amp_gradient_unscale(tmpdir):
     model = GradientUnscaleBoringModel()
@@ -78,8 +76,7 @@ def on_after_backward(self):
         assert norm.item() < 15.


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
-@RunIf(min_gpus=2)
+@RunIf(min_gpus=2, amp_native=True)
 def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir):
     model = UnscaleAccumulateGradBatchesBoringModel()

tests/plugins/test_apex_plugin.py

Lines changed: 3 additions & 3 deletions
@@ -5,10 +5,10 @@

 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import ApexMixedPrecisionPlugin
-from pytorch_lightning.utilities import _APEX_AVAILABLE
+from tests.helpers.runif import RunIf


-@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
+@RunIf(amp_apex=True)
 @mock.patch.dict(
     os.environ, {
         "CUDA_VISIBLE_DEVICES": "0,1",
@@ -36,7 +36,7 @@ def test_amp_choice_default_ddp(mocked_device_count, ddp_backend, gpus):
     assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin)


-@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
+@RunIf(amp_apex=True)
 @mock.patch.dict(
     os.environ, {
         "CUDA_VISIBLE_DEVICES": "0,1",

tests/plugins/test_deepspeed_plugin.py

Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,7 @@
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
-from pytorch_lightning.utilities import _APEX_AVAILABLE, _DEEPSPEED_AVAILABLE, _NATIVE_AMP_AVAILABLE
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -122,12 +122,12 @@ def test_deepspeed_plugin_env(tmpdir, monkeypatch, deepspeed_config):

 @pytest.mark.parametrize(
     "amp_backend", [
-        pytest.param("native", marks=pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP")),
-        pytest.param("apex", marks=pytest.mark.skipif(not _APEX_AVAILABLE, reason="Requires Apex")),
+        pytest.param("native", marks=RunIf(amp_native=True)),
+        pytest.param("apex", marks=RunIf(amp_apex=True)),
     ]
 )
 @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP")
+@RunIf(amp_native=True)
 def test_deepspeed_precision_choice(amp_backend, tmpdir):
     """
     Test to ensure precision plugin is also correctly chosen.
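
The hunk above works because `RunIf(...)` evaluates to an ordinary pytest mark, so it can also be passed as `marks=` inside `pytest.param` to skip a single parametrization. A small hypothetical sketch of that pattern, separate from the real DeepSpeed test:

```python
import pytest

from tests.helpers.runif import RunIf


@pytest.mark.parametrize(
    "amp_backend", [
        # Each backend is skipped on its own when its requirement is missing.
        pytest.param("native", marks=RunIf(amp_native=True)),
        pytest.param("apex", marks=RunIf(amp_apex=True)),
    ]
)
def test_backend_param(amp_backend):
    assert amp_backend in ("native", "apex")
```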
