Commit c187c2b

vinhngx authored and fmassa committed
Fix apex distributed training (#1124)
* adding mixed precision training with Apex
* fix APEX default optimization level
* adding python version check for apex
* fix LINT errors and raise exceptions if apex not available
* fixing apex distributed training
* fix throughput calculation: include forward pass
* remove torch.cuda.set_device(args.gpu) as it's already called in init_distributed_mode
* fix linter: new line
* move Apex initialization code back to the beginning of main
* move apex initialization to before lr_scheduler - for peace of mind. Though, doing apex initialization after lr_scheduler seems to work fine as well
1 parent 5d1372c commit c187c2b
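
The ordering the commit message describes - run amp.initialize on the raw model and optimizer first, build the LR scheduler afterwards, and only then wrap the model in DistributedDataParallel - is sketched below. This is a minimal illustration, not a verbatim excerpt of train.py: it assumes NVIDIA Apex is installed, that args carries the same fields the script uses (apex, apex_opt_level, lr, momentum, weight_decay, lr_step_size, lr_gamma, distributed, gpu), and the helper name build_training_objects is invented for the example.

# Minimal sketch (not the exact train.py code): Apex first, scheduler next, DDP last.
import torch

try:
    from apex import amp
except ImportError:  # Apex is optional; only needed when args.apex is set
    amp = None


def build_training_objects(model, args):
    optimizer = torch.optim.SGD(
        model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # 1) Let Apex patch model/optimizer for mixed precision before anything else uses them.
    if args.apex:
        if amp is None:
            raise RuntimeError("args.apex is set but NVIDIA Apex is not installed")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.apex_opt_level)

    # 2) Build the LR scheduler from the (possibly patched) optimizer.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)

    # 3) Wrap with DistributedDataParallel last, keeping a handle to the bare
    #    module for checkpoint save/load.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    return model, model_without_ddp, optimizer, lr_scheduler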

File tree

1 file changed (+8, -8 lines changed)


references/classification/train.py

Lines changed: 8 additions & 8 deletions
@@ -26,11 +26,11 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, print_freq, apex=False):

     header = 'Epoch: [{}]'.format(epoch)
     for image, target in metric_logger.log_every(data_loader, print_freq, header):
+        start_time = time.time()
         image, target = image.to(device), target.to(device)
         output = model(image)
         loss = criterion(output, target)

-        start_time = time.time()
         optimizer.zero_grad()
         if apex:
             with amp.scale_loss(loss, optimizer) as scaled_loss:
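
The hunk above moves start_time to the top of the loop body so the per-batch timing covers the forward pass as well as the backward pass and optimizer step. A rough sketch of how an images/sec figure can be derived from that timestamp follows; it is illustrative only - the actual meter bookkeeping lives elsewhere in train.py and is not part of this diff, timed_step is an invented helper, and the plain loss.backward() path stands in for the Apex branch.

import time


def timed_step(model, criterion, optimizer, image, target):
    # Timestamp taken before the forward pass, as in this commit.
    start_time = time.time()
    output = model(image)                    # forward
    loss = criterion(output, target)
    optimizer.zero_grad()
    loss.backward()                          # backward
    optimizer.step()
    # Throughput now includes forward + backward + step.
    images_per_sec = image.shape[0] / (time.time() - start_time)
    return loss.item(), images_per_sec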
@@ -170,23 +170,23 @@ def main(args):
     if args.distributed and args.sync_bn:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

-    model_without_ddp = model
-    if args.distributed:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
-        model_without_ddp = model.module
-
     criterion = nn.CrossEntropyLoss()

     optimizer = torch.optim.SGD(
         model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

-    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
-
     if args.apex:
         model, optimizer = amp.initialize(model, optimizer,
                                           opt_level=args.apex_opt_level
                                           )

+    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
+
+    model_without_ddp = model
+    if args.distributed:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
+        model_without_ddp = model.module
+
     if args.resume:
         checkpoint = torch.load(args.resume, map_location='cpu')
         model_without_ddp.load_state_dict(checkpoint['model'])
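
For context, the per-iteration pattern selected by the apex flag in train_one_epoch looks roughly like the sketch below. It is not a verbatim excerpt: training_step is an invented helper, and amp.scale_loss is the standard Apex idiom for scaling the loss before backpropagation under mixed precision.

def training_step(model, criterion, optimizer, image, target, apex=False):
    output = model(image)
    loss = criterion(output, target)

    optimizer.zero_grad()
    if apex:
        from apex import amp
        # Scale the loss so FP16 gradients don't underflow; Apex restores
        # unscaled gradients before the (patched) optimizer step runs.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()
    return loss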
