-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Closed
Labels
Description
When using `opt_level='O2'`, wrapping the model in `nn.DataParallel` does not work and fails with:
RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)
However, with `opt_level='O1'`, everything works just fine.
# --- Model and optimizer -----------------------------------------------------
# The model is moved to the GPU before the optimizer is created from its
# parameters.
model = GeneralVae(encoder, decoder, rep_size=500).cuda()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Mixed-precision setup at opt level O2; both objects are replaced by the
# versions returned from amp.initialize.
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

# --- Optional multi-GPU wrapping ---------------------------------------------
# DataParallel is applied only when requested (data_para) and more than one
# CUDA device is visible. Note that it wraps the model returned by
# amp.initialize above.
if data_para and torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

# NOTE(review): the pasted snippet lost its indentation, so it is unclear
# whether this call originally sat inside the `if` above -- confirm against
# the original script. The model was already moved with .cuda() when built.
model = model.cuda()

# --- Loss and bookkeeping ----------------------------------------------------
loss_picture = customLoss()
val_losses = []
train_losses = []
def train(epoch):
train_loader_food = generate_data_loader(train_root, get_batch_size(epoch), int(rampDataSize * data_size))
print("Epoch {}: batch_size {}".format(epoch, get_batch_size(epoch)))
model.train()
train_loss = 0
loss = None
for batch_idx, (data, _, aff) in enumerate(train_loader_food):
data = data[0].cuda(0)
shoaibahmed, seilna, edouardelasalles, bermanmaxim, chenyilun95 and 12 more
donglixp and zhpmatrix