DeepSpeed Stage 2 Tensors on Different Devices #9521

@kelvins64

Description

🐛 Bug

Attempting to run Trainer.fit on any GPU other than cuda:0 with the DeepSpeed ZeRO Stage 2 plugin results in RuntimeError: Expected all tensors to be on the same device, but found at least two devices.
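
For reference, this is PyTorch's generic error whenever a layer's weights and its input sit on different devices. The standalone two-GPU snippet below (independent of Lightning and DeepSpeed, purely to illustrate the message) produces the same error from nn.Linear's underlying addmm call:

import torch
import torch.nn as nn

layer = nn.Linear(32, 2).to("cuda:0")    # weights live on cuda:0
x = torch.randn(4, 32, device="cuda:1")  # input lives on cuda:1
layer(x)  # RuntimeError: Expected all tensors to be on the same device ... (argument mat1, wrapper_addmm)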

To Reproduce

import os
from typing import Union

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
import argparse

class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)

# Start new code
def run(str_args: Union[str, None] = None):
    parser = argparse.ArgumentParser()
    parser = Trainer.add_argparse_args(parser)

    args = parser.parse_args() if str_args is None else parser.parse_args(str_args.split())
# End new code

    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
# Start new code
    trainer = Trainer.from_argparse_args(
        args,
        plugins='deepspeed_stage_2',
# End new code
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)

if __name__ == "__main__":
    run('--gpus 1,')  # New code: '1,' selects GPU index 1, i.e. a device other than cuda:0

The error:

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_addmm)
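
For comparison (consistent with the description above that only indices other than cuda:0 are affected), running the same script with GPU index 0 is expected to train without the mismatch:

run('--gpus 0,')  # same script and plugin, but on cuda:0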

Environment

  • PyTorch Lightning Version: 1.4.6
  • PyTorch Version: 1.9.0
  • Python version: 3.9.6
  • OS: Linux
  • CUDA/cuDNN version: 11.3
  • GPU models and configuration: NVIDIA Tesla V100
  • How you installed PyTorch: pip

Additional context
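
A possible workaround, offered only as an untested sketch rather than a confirmed fix: hide every physical GPU except the desired one with CUDA_VISIBLE_DEVICES before CUDA is initialized, so the remaining device is exposed as cuda:0 and the plugin never sees a non-zero index.

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # physical GPU 1 is now exposed as cuda:0
# (must be set before the first CUDA call, ideally before importing torch)

run('--gpus 0,')  # then select the only visible device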

Labels

bug (Something isn't working), help wanted (Open to be worked on)
