
Commit 15d5fd8

Merge branch 'master' into bugfix_5181
2 parents fae08de + 8001987 commit 15d5fd8

14 files changed: 69 additions & 43 deletions


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -89,6 +89,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `SingleTPU` calling `all_gather` ([#6296](https://github.com/PyTorchLightning/pytorch-lightning/pull/6296))


+- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))
+
+
 ## [1.2.1] - 2021-02-23

 ### Fixed

pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 5 additions & 5 deletions
@@ -536,12 +536,12 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()

-        # throw error to force user ddp or ddp2 choice
-        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
-        if (self.num_nodes > 1 and self._distrib_type not in _ddp):
+        using_valid_distributed = self.use_ddp or self.use_ddp2
+        if self.num_nodes > 1 and not using_valid_distributed:
+            # throw error to force user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
-                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
+                'Your chosen distributed type does not support num_nodes > 1. '
+                'Please set accelerator=ddp or accelerator=ddp2.'
             )

         rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
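
For context, a minimal sketch of the behavior this hunk enforces. The Trainer arguments below are illustrative only and assume a machine where two CUDA devices are visible; they are not taken from this commit.

```python
from pytorch_lightning import Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException

try:
    # 'dp' (DataParallel) cannot span machines, so combining it with
    # num_nodes > 1 is rejected instead of silently misbehaving.
    Trainer(accelerator="dp", gpus=2, num_nodes=2)
except MisconfigurationException as err:
    print(err)  # message suggests accelerator=ddp or accelerator=ddp2

# A supported multi-node configuration still constructs normally:
Trainer(accelerator="ddp", gpus=2, num_nodes=2)
```

Note that the old message claimed the backend was switched automatically while the code actually raised; the new wording matches what happens.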

tests/accelerators/test_accelerator_connector.py

Lines changed: 26 additions & 0 deletions
@@ -28,10 +28,13 @@
     DDPPlugin,
     DDPShardedPlugin,
     DDPSpawnPlugin,
+    DDPSpawnShardedPlugin,
+    DeepSpeedPlugin,
     PrecisionPlugin,
     SingleDevicePlugin,
 )
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -415,3 +418,26 @@ def test_plugin_accelerator_choice(accelerator, plugin):

     trainer = Trainer(plugins=plugin, num_processes=2)
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+
+@pytest.mark.parametrize(["accelerator", "plugin"], [
+    ('ddp', DDPPlugin),
+    ('ddp_spawn', DDPSpawnPlugin),
+    ('ddp_sharded', DDPShardedPlugin),
+    ('ddp_sharded_spawn', DDPSpawnShardedPlugin),
+    pytest.param(
+        'deepspeed',
+        DeepSpeedPlugin,
+        marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+    ),
+])
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_is_available, mock_device_count, accelerator, plugin, tmpdir):
+    trainer = Trainer(
+        accelerator=accelerator,
+        default_root_dir=tmpdir,
+        num_nodes=2,
+        gpus=2,
+    )
+    assert isinstance(trainer.training_type_plugin, plugin)

tests/core/test_memory.py

Lines changed: 1 addition & 3 deletions
@@ -17,7 +17,6 @@

 from pytorch_lightning import LightningModule, Trainer
 from pytorch_lightning.core.memory import ModelSummary, UNKNOWN_SIZE
-from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.advanced_models import ParityModuleRNN
@@ -292,8 +291,7 @@ def test_empty_model_size(mode):
     assert 0.0 == summary.model_size


-@RunIf(min_gpus=1)
-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires native AMP.")
+@RunIf(min_gpus=1, amp_native=True)
 @pytest.mark.parametrize(
     'precision', [
         pytest.param(16, marks=pytest.mark.skip(reason="no longer valid, because 16 can mean mixed precision")),

tests/helpers/runif.py

Lines changed: 13 additions & 1 deletion
@@ -19,7 +19,7 @@
 import torch
 from pkg_resources import get_distribution

-from pytorch_lightning.utilities import _TORCH_QUANTIZE_AVAILABLE
+from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, _TORCH_QUANTIZE_AVAILABLE


 class RunIf:
@@ -38,6 +38,8 @@ def __new__(
         min_gpus: int = 0,
         min_torch: Optional[str] = None,
         quantization: bool = False,
+        amp_apex: bool = False,
+        amp_native: bool = False,
         skip_windows: bool = False,
         **kwargs
     ):
@@ -47,6 +49,8 @@ def __new__(
             min_gpus: min number of gpus required to run test
             min_torch: minimum pytorch version to run test
             quantization: if `torch.quantization` package is required to run test
+            amp_apex: if NVIDIA Apex is required to run test
+            amp_native: if native PyTorch AMP is required to run test
             skip_windows: skip test for Windows platform (typically for some limited torch functionality)
             kwargs: native pytest.mark.skipif keyword arguments
         """
@@ -67,6 +71,14 @@ def __new__(
             conditions.append(not _TORCH_QUANTIZE_AVAILABLE or _miss_default)
             reasons.append("missing PyTorch quantization")

+        if amp_native:
+            conditions.append(not _NATIVE_AMP_AVAILABLE)
+            reasons.append("missing native AMP")
+
+        if amp_apex:
+            conditions.append(not _APEX_AVAILABLE)
+            reasons.append("missing NVIDIA Apex")
+
         if skip_windows:
             conditions.append(sys.platform == "win32")
             reasons.append("unimplemented on Windows")

tests/models/test_amp.py

Lines changed: 1 addition & 3 deletions
@@ -22,7 +22,6 @@
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins.environments import SLURMEnvironment
 from pytorch_lightning.trainer.states import TrainerState
-from pytorch_lightning.utilities import _APEX_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
@@ -193,8 +192,7 @@ def test_amp_without_apex(tmpdir):


 @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
-@RunIf(min_gpus=1)
-@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
+@RunIf(min_gpus=1, amp_apex=True)
 def test_amp_with_apex(tmpdir):
     """Check calling apex scaling in training."""

tests/models/test_horovod.py

Lines changed: 3 additions & 5 deletions
@@ -28,7 +28,7 @@
 from pytorch_lightning.accelerators import CPUAccelerator
 from pytorch_lightning.metrics.classification.accuracy import Accuracy
 from pytorch_lightning.trainer.states import TrainerState
-from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE
+from pytorch_lightning.utilities import _HOROVOD_AVAILABLE
 from tests.helpers import BoringModel
 from tests.helpers.advanced_models import BasicGAN
 from tests.helpers.runif import RunIf
@@ -120,8 +120,7 @@ def test_horovod_multi_gpu(tmpdir):

 @pytest.mark.skip(reason="Horovod has a problem with broadcast when using apex?")
 @pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
-@RunIf(min_gpus=2, skip_windows=True)
-@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
+@RunIf(min_gpus=2, skip_windows=True, amp_apex=True)
 def test_horovod_apex(tmpdir):
     """Test Horovod with multi-GPU support using apex amp."""
     trainer_options = dict(
@@ -143,8 +142,7 @@ def test_horovod_apex(tmpdir):

 @pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp")
 @pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
-@RunIf(min_gpus=2, skip_windows=True)
-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires torch.cuda.amp")
+@RunIf(min_gpus=2, skip_windows=True, amp_native=True)
 def test_horovod_amp(tmpdir):
     """Test Horovod with multi-GPU support using native amp."""
     trainer_options = dict(

tests/plugins/test_amp_plugin.py

Lines changed: 3 additions & 6 deletions
@@ -6,12 +6,11 @@

 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import NativeMixedPrecisionPlugin
-from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
+@RunIf(amp_native=True)
 @mock.patch.dict(
     os.environ, {
         "CUDA_VISIBLE_DEVICES": "0,1",
@@ -49,8 +48,7 @@ def on_after_backward(self):
         assert norm.item() < 15.


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
-@RunIf(min_gpus=2)
+@RunIf(min_gpus=2, amp_native=True)
 def test_amp_gradient_unscale(tmpdir):
     model = GradientUnscaleBoringModel()
@@ -78,8 +76,7 @@ def on_after_backward(self):
         assert norm.item() < 15.


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
-@RunIf(min_gpus=2)
+@RunIf(min_gpus=2, amp_native=True)
 def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir):
     model = UnscaleAccumulateGradBatchesBoringModel()

tests/plugins/test_apex_plugin.py

Lines changed: 3 additions & 3 deletions
@@ -5,10 +5,10 @@

 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import ApexMixedPrecisionPlugin
-from pytorch_lightning.utilities import _APEX_AVAILABLE
+from tests.helpers.runif import RunIf


-@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
+@RunIf(amp_apex=True)
 @mock.patch.dict(
     os.environ, {
         "CUDA_VISIBLE_DEVICES": "0,1",
@@ -36,7 +36,7 @@ def test_amp_choice_default_ddp(mocked_device_count, ddp_backend, gpus):
     assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin)


-@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
+@RunIf(amp_apex=True)
 @mock.patch.dict(
     os.environ, {
         "CUDA_VISIBLE_DEVICES": "0,1",

tests/plugins/test_deepspeed_plugin.py

Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,7 @@
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
-from pytorch_lightning.utilities import _APEX_AVAILABLE, _DEEPSPEED_AVAILABLE, _NATIVE_AMP_AVAILABLE
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -122,12 +122,12 @@ def test_deepspeed_plugin_env(tmpdir, monkeypatch, deepspeed_config):

 @pytest.mark.parametrize(
     "amp_backend", [
-        pytest.param("native", marks=pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP")),
-        pytest.param("apex", marks=pytest.mark.skipif(not _APEX_AVAILABLE, reason="Requires Apex")),
+        pytest.param("native", marks=RunIf(amp_native=True)),
+        pytest.param("apex", marks=RunIf(amp_apex=True)),
     ]
 )
 @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP")
+@RunIf(amp_native=True)
 def test_deepspeed_precision_choice(amp_backend, tmpdir):
     """
     Test to ensure precision plugin is also correctly chosen.
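
The hunk above works because `RunIf(...)` evaluates to an ordinary pytest mark, so it can also be passed as `marks=` inside `pytest.param` to skip a single parametrization. A small hypothetical sketch of that pattern, separate from the real DeepSpeed test:

```python
import pytest

from tests.helpers.runif import RunIf


@pytest.mark.parametrize(
    "amp_backend", [
        # Each backend is skipped on its own when its requirement is missing.
        pytest.param("native", marks=RunIf(amp_native=True)),
        pytest.param("apex", marks=RunIf(amp_apex=True)),
    ]
)
def test_backend_param(amp_backend):
    assert amp_backend in ("native", "apex")
```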
