
Commit a4d676d

leezu authored and Borda committed
Fix DeepSpeedPlugin with IterableDataset (#7362)
* deepspeed add train_micro_batch_size_per_gpu argument
* Update naming and doc
* Modify to use auto naming convention, add test
* Add iterable tests
* Fix tests, attempt by mocking
* Import correct package
* Fix comparison
* Set as special test
* Remove import
* Add Changelog

Co-authored-by: SeanNaren <[email protected]>
(cherry picked from commit 98b94b8)
1 parent fbc8b20 commit a4d676d

File tree

3 files changed: +84 −4 lines

CHANGELOG.md

Lines changed: 20 additions & 0 deletions
@@ -5,6 +5,26 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


+## [1.3.x] - 2021-MM-DD
+
+### Added
+
+
+### Changed
+
+
+### Deprecated
+
+
+### Removed
+
+
+### Fixed
+
+
+- Fixed DeepSpeed with IterableDatasets ([#7362](https://github.com/PyTorchLightning/pytorch-lightning/pull/7362))
+
+
 ## [1.3.0] - 2021-05-06

 ### Added

pytorch_lightning/plugins/training_type/deepspeed.py

Lines changed: 24 additions & 3 deletions
@@ -88,6 +88,7 @@ def __init__(
         allgather_bucket_size: int = 2e8,
         reduce_bucket_size: int = 2e8,
         zero_allow_untested_optimizer: bool = True,
+        logging_batch_size_per_gpu: Union[str, int] = "auto",
         config: Optional[Union[Path, str, dict]] = None,
         logging_level: int = logging.WARN,
         num_nodes: int = 1,
@@ -148,6 +149,13 @@ def __init__(
             zero_allow_untested_optimizer: Allow untested optimizers to be used with ZeRO. Currently only Adam is a
                 DeepSpeed supported optimizer when using ZeRO (default: True)

+            logging_batch_size_per_gpu: Config used in DeepSpeed to calculate verbose timing for logging
+                on a per sample per second basis (only displayed if logging=logging.INFO).
+                If set to "auto", the plugin tries to infer this from
+                the train DataLoader's BatchSampler, else defaults to 1.
+                To obtain accurate logs when using datasets that do not support batch samplers,
+                set this to the actual per gpu batch size (trainer.batch_size).
+
             config: Pass in a deepspeed formatted config dict,
                 or path to a deepspeed config: https://www.deepspeed.ai/docs/config-json.
                 All defaults will be ignored if a config is passed in. (Default: ``None``)
@@ -182,6 +190,7 @@ def __init__(
                 when using ZeRO Stage 3. This allows a single weight file to contain the entire model,
                 rather than individual sharded weight files.
                 Disable to save sharded states individually. (Default: True)
+
         """
         if not _DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(
@@ -197,6 +206,7 @@ def __init__(
         self.config = self._create_default_config(
             zero_optimization,
             zero_allow_untested_optimizer,
+            logging_batch_size_per_gpu,
             partition_activations=partition_activations,
             cpu_checkpointing=cpu_checkpointing,
             contiguous_memory_optimization=contiguous_memory_optimization,
@@ -409,14 +419,22 @@ def _format_batch_size_and_grad_accum_config(self):
                 " as this will be set via accumulate_grad_batches=x argument passed via the Lightning Trainer."
             )
         if "train_micro_batch_size_per_gpu" not in self.config:
-            # train_micro_batch_size_per_gpu is used for throughput logging purposes
-            # by default we use the batch size of the loader which may be incorrect if a batch sampler is passed
-            batch_size = self.lightning_module.train_dataloader().batch_sampler.batch_size
+            batch_size = self._auto_select_batch_size()
             self.config["train_micro_batch_size_per_gpu"] = batch_size
         self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches
         if "gradient_clipping" not in self.config:
             self.config["gradient_clipping"] = self.lightning_module.trainer.gradient_clip_val

+    def _auto_select_batch_size(self):
+        # train_micro_batch_size_per_gpu is used for throughput logging purposes
+        # by default we try to use the batch size of the loader
+        batch_size = 1
+        if hasattr(self.lightning_module, 'train_dataloader'):
+            train_dataloader = self.lightning_module.train_dataloader()
+            if hasattr(train_dataloader, 'batch_sampler'):
+                batch_size = train_dataloader.batch_sampler.batch_size
+        return batch_size
+
     def _format_precision_config(self):
         amp_type = self.lightning_module.trainer.accelerator_connector.amp_type
         amp_level = self.lightning_module.trainer.accelerator_connector.amp_level
@@ -446,6 +464,7 @@ def _create_default_config(
         self,
         zero_optimization: bool,
         zero_allow_untested_optimizer: bool,
+        logging_batch_size_per_gpu: Union[str, int],
         partition_activations: bool,
         cpu_checkpointing: bool,
         contiguous_memory_optimization: bool,
@@ -466,6 +485,8 @@ def _create_default_config(
                 "zero_optimization": zero_kwargs,
                 **cfg
             }
+        if logging_batch_size_per_gpu != 'auto':
+            cfg = {"train_micro_batch_size_per_gpu": logging_batch_size_per_gpu, **cfg}
         return cfg

     def _filepath_to_dir(self, filepath: str) -> str:
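
Note (not part of this commit): below is a minimal usage sketch of the new `logging_batch_size_per_gpu` argument when the training data comes from an `IterableDataset`, where the per-GPU batch size cannot always be inferred from a `BatchSampler`. The dataset and model names are hypothetical stand-ins, and the snippet assumes the 1.3-era Trainer API with DeepSpeed installed and a single GPU available.

```python
# Illustrative only: set logging_batch_size_per_gpu explicitly so DeepSpeed's
# samples-per-second logging matches the batch size used by the DataLoader,
# even when it cannot be inferred from a BatchSampler.
import torch
from torch.utils.data import DataLoader, IterableDataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin


class StreamingDataset(IterableDataset):
    """Stand-in for a streaming dataset without a usable BatchSampler."""

    def __iter__(self):
        for _ in range(640):
            yield torch.randn(32)


class LitModel(LightningModule):

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        return DataLoader(StreamingDataset(), batch_size=8)


# Pass the actual per-GPU batch size so the DeepSpeed throughput logs are accurate.
trainer = Trainer(
    gpus=1,
    precision=16,
    plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=8),
)
trainer.fit(LitModel())
```

Passing the value explicitly keeps DeepSpeed's timing output consistent with the batch size actually set on the DataLoader.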

tests/plugins/test_deepspeed_plugin.py

Lines changed: 40 additions & 1 deletion
@@ -7,14 +7,15 @@
 import torch.nn.functional as F
 from torch import nn, Tensor
 from torch.optim import Optimizer
+from torch.utils.data import DataLoader

 from pytorch_lightning import LightningModule, seed_everything, Trainer
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.metrics import Accuracy
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests.helpers.boring_model import BoringModel
+from tests.helpers.boring_model import BoringModel, RandomDataset, RandomIterableDataset
 from tests.helpers.datamodules import ClassifDataModule
 from tests.helpers.runif import RunIf

@@ -234,6 +235,44 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
     trainer.fit(model)


+@RunIf(min_gpus=1, deepspeed=True, special=True)
+@pytest.mark.parametrize(['dataset_cls', 'value'], [(RandomDataset, "auto"), (RandomDataset, 10),
+                                                    (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)])
+def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value):
+    """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes."""
+
+    class TestModel(BoringModel):
+
+        def train_dataloader(self):
+            return DataLoader(dataset_cls(32, 64))
+
+    class AssertCallback(Callback):
+
+        def on_train_start(self, trainer, pl_module) -> None:
+            assert isinstance(trainer.accelerator.training_type_plugin, DeepSpeedPlugin)
+            config = trainer.accelerator.training_type_plugin.config
+
+            # int value overrides auto mode
+            expected_value = value if isinstance(value, int) else 1
+            if dataset_cls == RandomDataset:
+                expected_value = pl_module.train_dataloader().batch_size if value == "auto" else value
+
+            assert config['train_micro_batch_size_per_gpu'] == expected_value
+            raise SystemExit
+
+    ck = AssertCallback()
+    model = TestModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        callbacks=ck,
+        gpus=1,
+        plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=value, zero_optimization=False),
+    )
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
 @RunIf(min_gpus=1, deepspeed=True, special=True)
 def test_deepspeed_run_configure_optimizers(tmpdir):
     """

0 commit comments