DeepSpeed Stage 2 Tensors on Different Devices #9521

@kelvins64

Description

🐛 Bug

Attempting to run Trainer.fit on any GPU other than cuda:0 with the DeepSpeed ZeRO Stage 2 plugin results in RuntimeError: Expected all tensors to be on the same device, but found at least two devices.
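
For reference, this is PyTorch's generic error whenever a layer's weights and its input sit on different devices. The standalone two-GPU snippet below (independent of Lightning and DeepSpeed, purely to illustrate the message) produces the same error from nn.Linear's underlying addmm call:

import torch
import torch.nn as nn

layer = nn.Linear(32, 2).to("cuda:0")    # weights live on cuda:0
x = torch.randn(4, 32, device="cuda:1")  # input lives on cuda:1
layer(x)  # RuntimeError: Expected all tensors to be on the same device ... (argument mat1, wrapper_addmm)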

To Reproduce

import os
from typing import Union

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
import argparse

class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)

# Start new code
def run(str_args: Union[str, None] = None):
    parser = argparse.ArgumentParser()
    parser = Trainer.add_argparse_args(parser)

    args = parser.parse_args() if str_args is None else parser.parse_args(str_args.split())
# End new code

    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
# Start new code
    trainer = Trainer.from_argparse_args(
        args,
        plugins='deepspeed_stage_2',
# End new code
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)

if __name__ == "__main__":
    run('--gpus 1,')  # New code: '1,' selects GPU index 1, i.e. a device other than cuda:0

The error:

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_addmm)
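
For comparison (consistent with the description above that only indices other than cuda:0 are affected), running the same script with GPU index 0 is expected to train without the mismatch:

run('--gpus 0,')  # same script and plugin, but on cuda:0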

Environment

  • PyTorch Lightning Version: 1.4.6
  • PyTorch Version: 1.9.0
  • Python version: 3.9.6
  • OS: Linux
  • CUDA/cuDNN version: 11.3
  • GPU models and configuration: NVIDIA Tesla V100
  • How you installed PyTorch: pip

Additional context
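
A possible workaround, offered only as an untested sketch rather than a confirmed fix: hide every physical GPU except the desired one with CUDA_VISIBLE_DEVICES before CUDA is initialized, so the remaining device is exposed as cuda:0 and the plugin never sees a non-zero index.

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # physical GPU 1 is now exposed as cuda:0
# (must be set before the first CUDA call, ideally before importing torch)

run('--gpus 0,')  # then select the only visible device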

Labels

bug (Something isn't working), help wanted (Open to be worked on)
