Labels: bug, checkpointing, waiting on author
Description
🐛 Bug
self.best_k_models stores checkpoint entries from both devices with different metric values, and self.best_model_path points to a checkpoint from cuda:1 even though the checkpoints actually saved to disk came from cuda:0.
This behavior changed between 1.6.5 and 1.7.x.
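For context, the diverged state can be inspected on each rank after training. A minimal sketch: trainer.checkpoint_callback, best_model_path, best_k_models, and trainer.global_rank are standard pytorch_lightning accessors, but the inspection loop itself is ours and not part of the repro below.
# Sketch: inspect ModelCheckpoint state on every DDP rank after trainer.fit.
# best_k_models maps checkpoint paths to their monitored metric values; under
# the 1.7.x behavior described above it ends up holding entries from both ranks.
ckpt_cb = trainer.checkpoint_callback
print(f"rank {trainer.global_rank}: best_model_path = {ckpt_cb.best_model_path}")
for path, score in ckpt_cb.best_k_models.items():
    print(f"rank {trainer.global_rank}: {path} -> {score}")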
To Reproduce
import os
import pytorch_lightning as pl
from omegaconf import OmegaConf
from torch import Tensor, nn, optim, utils
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from nemo.utils.exp_manager import exp_manager
# define any number of nn.Modules (or use your current ones)
encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))
# define the LightningModule
class LitAutoEncoder(pl.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop;
        # it is independent of forward
        x, y = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = nn.functional.mse_loss(x_hat, x)
        # logging to TensorBoard by default
        self.log("loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        # validation_step defines the validation loop;
        # it is independent of forward
        x, y = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = nn.functional.mse_loss(x_hat, x)
        # logging to TensorBoard by default
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    def save_to(self, save_path):
        pass
dataset = MNIST(os.getcwd(), download=True, transform=ToTensor())
train_loader = utils.data.DataLoader(dataset, batch_size=256)
dev_loader = utils.data.DataLoader(dataset, batch_size=256)
# init the autoencoder
autoencoder = LitAutoEncoder(encoder, decoder)
trainer = pl.Trainer(
    limit_train_batches=100,
    max_epochs=2,
    devices=2,
    strategy='ddp',
    accelerator='gpu',
    enable_checkpointing=False,
    logger=False,
)
# kwargs = {
# 'dirpath': '/data/local/checkpoints',
# 'filename': 'test_ptl--{val_loss:.4f}-{epoch}',
# 'monitor': 'val_loss', 'verbose': True,
# 'save_last': True,
# 'save_top_k': 4,
# 'save_weights_only': False,
# 'mode': 'min',
# 'every_n_epochs': 1
# }
# from pytorch_lightning.callbacks import ModelCheckpoint
# checkpoint_callback = ModelCheckpoint(**kwargs)
# trainer.callbacks.append(checkpoint_callback)
manager = {
    'exp_dir': '/data/recognition/tarred',
    'name': 'test_ptl',
    'create_tensorboard_logger': True,
    'create_checkpoint_callback': True,
    'use_datetime_version': False,
    'create_wandb_logger': False,
    'resume_if_exists': True,
    'resume_ignore_no_checkpoint': True,
    'checkpoint_callback_params': {'save_top_k': 4, 'save_best_model': True},
    'version': 'local',
}
cfg = OmegaConf.create(manager)
log_dir = exp_manager(trainer, cfg)
trainer.fit(model=autoencoder, train_dataloaders=train_loader, val_dataloaders=dev_loader)
# print(trainer.callbacks[-1].best_model_path)
Expected behavior
The script runs without issues, best_model_path points to an existing checkpoint saved by rank 0 (cuda:0), and best_k_models is consistent across ranks.
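Until this is fixed, a possible mitigation is to trust the checkpoint state only on the global-zero rank. A minimal sketch: trainer.is_global_zero and trainer.checkpoint_callback are standard pytorch_lightning attributes, but the guard itself is our workaround, not a confirmed fix.
# Hypothetical workaround: read ModelCheckpoint state only on rank 0, where
# the checkpoints in this report were actually written.
if trainer.is_global_zero:
    best_path = trainer.checkpoint_callback.best_model_path
    print(f"best checkpoint (rank 0): {best_path}")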
Environment
PyTorch Lightning 1.7.x (full environment details attached as a.txt)