
Commit 8a5b379

Author: Jessica Lin
Merge pull request #705 from rohan-varma/add_param_server
Simple example to demonstrate parameter server training pattern
2 parents 4902431 + 6d29c3b commit 8a5b379

2 files changed: +320 -0 lines changed
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
### RPC-based distributed training

This is a basic example of RPC-based training that uses several trainers to remotely train a model hosted on a parameter server.

To run the example locally, run the following command for the server and for each worker you wish to spawn, in separate terminal windows:

`python rpc_parameter_server.py [world_size] [rank] [num_gpus]`. For example, for a master node with a world size of 2, the command would be `python rpc_parameter_server.py 2 0 0`. The trainer can then be launched with the command `python rpc_parameter_server.py 2 1 0` in a separate window, which will begin training with one server and a single trainer.
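As a further illustration (not covered in the text above), a hypothetical run with one parameter server and two trainers (world size 3, CPU only) would be launched with one command per terminal window:

```sh
# Terminal 1: the parameter server is always rank 0
python rpc_parameter_server.py 3 0 0
# Terminal 2: first trainer (rank 1)
python rpc_parameter_server.py 3 1 0
# Terminal 3: second trainer (rank 2)
python rpc_parameter_server.py 3 2 0
```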
Note that for demonstration purposes, this example supports between 0 and 2 GPUs, although the pattern can be extended to make use of additional GPUs.

You can pass in the command line arguments `--master_addr=<address>` and `--master_port=<port>` to indicate the address and port that the master worker is listening on. All workers will contact the master for rendezvous during worker discovery. By default, `master_addr` will be `localhost` and `master_port` will be 29500.
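As a minimal sketch of how these flags take effect (this mirrors the launcher in `rpc_parameter_server.py` below; the rank, world size, and worker name values here are illustrative only), every process exports the rendezvous address and port as environment variables before initializing RPC:

```python
import os
import torch.distributed.rpc as rpc

# All processes (server and trainers) must agree on the rendezvous endpoint;
# torch.distributed reads it from these environment variables.
os.environ["MASTER_ADDR"] = "localhost"  # value of --master_addr
os.environ["MASTER_PORT"] = "29500"      # value of --master_port

# Rank 0 hosts the parameter server; the remaining ranks join as trainers.
rpc.init_rpc(name="parameter_server", rank=0, world_size=2)

# ... training happens over RPC ...

rpc.shutdown()  # blocks until all workers have finished
```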
Lines changed: 310 additions & 0 deletions
@@ -0,0 +1,310 @@
import argparse
import os
import time
from threading import Lock

import torch
import torch.distributed.autograd as dist_autograd
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.distributed.optim import DistributedOptimizer
from torchvision import datasets, transforms

# --------- MNIST Network to train, from pytorch/examples -----

class Net(nn.Module):
    def __init__(self, num_gpus=0):
        super(Net, self).__init__()
        print(f"Using {num_gpus} GPUs to train")
        self.num_gpus = num_gpus
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() and self.num_gpus > 0 else "cpu")
        print(f"Putting first 2 convs on {str(device)}")
        # Put conv layers on the first cuda device
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        # Put rest of the network on the 2nd cuda device, if there is one
        if "cuda" in str(device) and num_gpus > 1:
            device = torch.device("cuda:1")

        print(f"Putting rest of layers on {str(device)}")
        self.dropout1 = nn.Dropout2d(0.25).to(device)
        self.dropout2 = nn.Dropout2d(0.5).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.max_pool2d(x, 2)

        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        # Move tensor to next device if necessary
        next_device = next(self.fc1.parameters()).device
        x = x.to(next_device)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

# --------- Helper Methods --------------------

# On the local node, call a method with first arg as the value held by the
# RRef. Other args are passed in as arguments to the function called.
# Useful for calling instance methods.
def call_method(method, rref, *args, **kwargs):
    return method(rref.local_value(), *args, **kwargs)

# Given an RRef, return the result of calling the passed in method on the value
# held by the RRef. This call is done on the remote node that owns
# the RRef. args and kwargs are passed into the method.
# Example: If the value held by the RRef is of type Foo, then
# remote_method(Foo.bar, rref, arg1, arg2) is equivalent to calling
# <foo_instance>.bar(arg1, arg2) on the remote node and getting the result
# back.


def remote_method(method, rref, *args, **kwargs):
    args = [method, rref] + list(args)
    return rpc.rpc_sync(rref.owner(), call_method, args=args, kwargs=kwargs)

# --------- Parameter Server --------------------
class ParameterServer(nn.Module):
    def __init__(self, num_gpus=0):
        super().__init__()
        model = Net(num_gpus=num_gpus)
        self.model = model
        self.input_device = torch.device(
            "cuda:0" if torch.cuda.is_available() and num_gpus > 0 else "cpu")

    def forward(self, inp):
        inp = inp.to(self.input_device)
        out = self.model(inp)
        # This output is forwarded over RPC, which as of 1.5.0 only accepts CPU tensors.
        # Tensors must be moved in and out of GPU memory due to this.
        out = out.to("cpu")
        return out

    # Use dist autograd to retrieve gradients accumulated for this model.
    # Primarily used for verification.
    def get_dist_gradients(self, cid):
        grads = dist_autograd.get_gradients(cid)
        # This output is forwarded over RPC, which as of 1.5.0 only accepts CPU tensors.
        # Tensors must be moved in and out of GPU memory due to this.
        cpu_grads = {}
        for k, v in grads.items():
            k_cpu, v_cpu = k.to("cpu"), v.to("cpu")
            cpu_grads[k_cpu] = v_cpu
        return cpu_grads

    # Wrap local parameters in a RRef. Needed for building the
    # DistributedOptimizer which optimizes parameters remotely.
    def get_param_rrefs(self):
        param_rrefs = [rpc.RRef(param) for param in self.model.parameters()]
        return param_rrefs

param_server = None
global_lock = Lock()


def get_parameter_server(num_gpus=0):
    global param_server
    # Ensure that we get only one handle to the ParameterServer.
    with global_lock:
        if not param_server:
            # construct it once
            param_server = ParameterServer(num_gpus=num_gpus)
        return param_server


def run_parameter_server(rank, world_size):
    # The parameter server just acts as a host for the model and responds to
    # requests from trainers, hence it does not need to run a loop.
    # rpc.shutdown() will wait for all workers to complete by default, which
    # in this case means that the parameter server will wait for all trainers
    # to complete, and then exit.
    print("PS master initializing RPC")
    rpc.init_rpc(name="parameter_server", rank=rank, world_size=world_size)
    print("RPC initialized! Running parameter server...")
    rpc.shutdown()
    print("RPC shutdown on parameter server.")

# --------- Trainers --------------------

# nn.Module corresponding to the network trained by this trainer. The
# forward() method simply invokes the network on the given parameter
# server.
class TrainerNet(nn.Module):
    def __init__(self, num_gpus=0):
        super().__init__()
        self.num_gpus = num_gpus
        self.param_server_rref = rpc.remote(
            "parameter_server", get_parameter_server, args=(num_gpus,))

    def get_global_param_rrefs(self):
        remote_params = remote_method(
            ParameterServer.get_param_rrefs,
            self.param_server_rref)
        return remote_params

    def forward(self, x, cid):
        model_output = remote_method(
            ParameterServer.forward, self.param_server_rref, x)
        return model_output

def run_training_loop(rank, num_gpus, train_loader, test_loader):
    # Runs the typical neural network forward + backward + optimizer step, but
    # in a distributed fashion.
    net = TrainerNet(num_gpus=num_gpus)
    # Build DistributedOptimizer.
    param_rrefs = net.get_global_param_rrefs()
    opt = DistributedOptimizer(optim.SGD, param_rrefs, lr=0.03)
    for i, (data, target) in enumerate(train_loader):
        with dist_autograd.context() as cid:
            model_output = net(data, cid)
            target = target.to(model_output.device)
            loss = F.nll_loss(model_output, target)
            if i % 5 == 0:
                print(f"Rank {rank} training batch {i} loss {loss.item()}")
            dist_autograd.backward(cid, [loss])
            # Ensure that dist autograd ran successfully and gradients were
            # returned.
            assert remote_method(
                ParameterServer.get_dist_gradients,
                net.param_server_rref,
                cid) != {}
            opt.step(cid)

    print("Training complete!")
    print("Getting accuracy....")
    get_accuracy(test_loader, net)

def get_accuracy(test_loader, model):
    model.eval()
    correct_sum = 0
    # Use GPU to evaluate if possible
    device = torch.device("cuda:0" if model.num_gpus > 0
                          and torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        for i, (data, target) in enumerate(test_loader):
            out = model(data, -1)
            pred = out.argmax(dim=1, keepdim=True)
            pred, target = pred.to(device), target.to(device)
            correct = pred.eq(target.view_as(pred)).sum().item()
            correct_sum += correct

    print(f"Accuracy {correct_sum / len(test_loader.dataset)}")

# Main loop for trainers.
def run_worker(rank, world_size, num_gpus, train_loader, test_loader):
    print(f"Worker rank {rank} initializing RPC")
    rpc.init_rpc(
        name=f"trainer_{rank}",
        rank=rank,
        world_size=world_size)

    print(f"Worker {rank} done initializing RPC")

    run_training_loop(rank, num_gpus, train_loader, test_loader)
    rpc.shutdown()

# --------- Launcher --------------------

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Parameter-Server RPC based training")
    parser.add_argument(
        "world_size",
        type=int,
        default=4,
        help="""Total number of participating processes. Should be the sum of
        the master node and all training nodes.""")
    parser.add_argument(
        "rank",
        type=int,
        default=None,
        help="Global rank of this process. Pass in 0 for master.")
    parser.add_argument(
        "num_gpus",
        type=int,
        default=0,
        help="""Number of GPUs to use for training. Currently supports between 0
        and 2 GPUs. Note that this argument will be passed to the parameter server.""")
    parser.add_argument(
        "--master_addr",
        type=str,
        default="localhost",
        help="""Address of master, will default to localhost if not provided.
        Master must be able to accept network traffic on the address + port.""")
    parser.add_argument(
        "--master_port",
        type=str,
        default="29500",
        help="""Port that master is listening on, will default to 29500 if not
        provided. Master must be able to accept network traffic on the host and port.""")

    args = parser.parse_args()
    assert args.rank is not None, "must provide rank argument."
    assert args.num_gpus <= 2, f"Only 0-2 GPUs currently supported (got {args.num_gpus})."
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ["MASTER_PORT"] = args.master_port
    processes = []
    world_size = args.world_size
    if args.rank == 0:
        p = mp.Process(target=run_parameter_server, args=(0, world_size))
        p.start()
        processes.append(p)
    else:
        # Get data to train on
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST('../data', train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=32, shuffle=True)
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                '../data',
                train=False,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,))
                ])),
            batch_size=32,
            shuffle=True)
        # start training worker on this node
        p = mp.Process(
            target=run_worker,
            args=(
                args.rank,
                world_size, args.num_gpus,
                train_loader,
                test_loader))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
