
ModelCheckpointing behaviour changed from previous versions (self.best_model_path holds rank 1 values)  #14302

@nithinraok

Description

🐛 Bug

self.best_k_models stores checkpoint entries from both devices with different metric values, and self.best_model_path points to a path built from cuda:1's value, while the checkpoints actually saved to disk came from cuda:0.

This behavior changed between 1.6.5 and 1.7.x.
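
For illustration, a minimal inspection sketch (not part of the repro below; the class name is arbitrary, it only prints standard Lightning attributes, and the per-rank mismatch it would surface is the one described above):

import pytorch_lightning as pl

class InspectBest(pl.Callback):
    # Print what each DDP rank believes the best checkpoint is once fit() ends.
    def on_fit_end(self, trainer, pl_module):
        ckpt_cb = trainer.checkpoint_callback  # first ModelCheckpoint attached to the trainer
        print(f"rank={trainer.global_rank} best_model_path={ckpt_cb.best_model_path}")
        print(f"rank={trainer.global_rank} best_k_models={ckpt_cb.best_k_models}")

With 1.7.x, best_model_path reflects cuda:1's metric value even though the files on disk were written from cuda:0; with 1.6.5 this mismatch did not occur.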

To Reproduce

import os

import pytorch_lightning as pl
from omegaconf import OmegaConf
from torch import nn, optim, utils
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

from nemo.utils.exp_manager import exp_manager

# define any number of nn.Modules (or use your current ones)
encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))

# define the LightningModule
class LitAutoEncoder(pl.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        # it is independent of forward
        x, y = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = nn.functional.mse_loss(x_hat, x)
        # Logging to TensorBoard by default
        self.log("loss", loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        # validation_step defines the validation loop.
        # it is independent of forward
        x, y = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = nn.functional.mse_loss(x_hat, x)
        # Logging to TensorBoard by default
        self.log("val_loss", loss)
        return loss 

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
    
    def save_to(self, save_path):
        # no-op stub: NeMo's exp_manager checkpoint callback calls save_to(),
        # which a plain LightningModule does not provide
        pass


dataset = MNIST(os.getcwd(), download=True, transform=ToTensor())
train_loader = utils.data.DataLoader(dataset, batch_size=256)
dev_loader = utils.data.DataLoader(dataset, batch_size=256)

# init the autoencoder
autoencoder = LitAutoEncoder(encoder, decoder)

trainer = pl.Trainer(
    limit_train_batches=100,
    max_epochs=2,
    devices=2,
    strategy='ddp',
    accelerator='gpu',
    enable_checkpointing=False,  # checkpointing is set up by exp_manager below
    logger=False,  # logging is set up by exp_manager below
)

# kwargs = {
#     'dirpath': '/data/local/checkpoints', 
#     'filename': 'test_ptl--{val_loss:.4f}-{epoch}', 
#     'monitor': 'val_loss', 'verbose': True, 
#     'save_last': True, 
#     'save_top_k': 4, 
#     'save_weights_only': False, 
#     'mode': 'min', 
#     'every_n_epochs': 1
#     }
# from pytorch_lightning.callbacks import ModelCheckpoint
# checkpoint_callback = ModelCheckpoint(**kwargs)
# trainer.callbacks.append(checkpoint_callback)

manager = {
    'exp_dir': '/data/recognition/tarred',
    'name': 'test_ptl',
    'create_tensorboard_logger': True,
    'create_checkpoint_callback': True,
    'use_datetime_version': False,
    'create_wandb_logger': False,
    'resume_if_exists': True,
    'resume_ignore_no_checkpoint': True,
    'checkpoint_callback_params': {'save_top_k': 4, 'save_best_model': True},
    'version': 'local'
}

cfg = OmegaConf.create(manager)
log_dir = exp_manager(trainer, cfg)

trainer.fit(model=autoencoder, train_dataloaders=train_loader, val_dataloaders=dev_loader)
# print(trainer.callbacks[-1].best_model_path)
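
As a possible workaround sketch (assuming the trainer.strategy.broadcast API available in Lightning 1.7; not verified against this exact setup), rank 0's best_model_path can be broadcast after fit() so that every rank uses a path that actually exists on disk:

import os

ckpt_cb = trainer.checkpoint_callback
# Treat rank 0's view as the source of truth and share it with the other ranks
# before the path is used for loading or reporting.
best_path = trainer.strategy.broadcast(ckpt_cb.best_model_path, src=0)
if trainer.is_global_zero:
    assert os.path.exists(best_path)
print(f"rank={trainer.global_rank} best checkpoint: {best_path}")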

Expected behavior

The code runs without issues, and best_model_path / best_k_models point to the checkpoints that were actually saved to disk, as in 1.6.5.

Environment

PyTorch Lightning 1.7.x; full environment details attached (a.txt)

cc @awaelchli @ananthsub @ninginthecloud @rohitgr7 @otaj


Labels: bug (Something isn't working), checkpointing (Related to checkpointing), waiting on author (Waiting on user action, correction, or update)
