From af9a001720a6bde600a1b816263ff407468bc895 Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Wed, 16 Nov 2022 16:16:37 +0100
Subject: [PATCH 1/7] Symlink / update multi-node examples

---
 .../basic/hello_components/pl_multinode.py |  3 +-
 .../basic/hello_components/pt_multinode.py |  3 +-
 .../levels/basic/hero_components.rst       |  2 +-
 examples/app_multi_node/train_lt.py        | 24 +------
 examples/app_multi_node/train_pytorch.py   | 70 +------------------
 5 files changed, 5 insertions(+), 97 deletions(-)
 mode change 100644 => 120000 examples/app_multi_node/train_lt.py
 mode change 100644 => 120000 examples/app_multi_node/train_pytorch.py

diff --git a/docs/source-app/levels/basic/hello_components/pl_multinode.py b/docs/source-app/levels/basic/hello_components/pl_multinode.py
index 44db267160069..a9131f41168fb 100644
--- a/docs/source-app/levels/basic/hello_components/pl_multinode.py
+++ b/docs/source-app/levels/basic/hello_components/pl_multinode.py
@@ -5,8 +5,7 @@
 
 
 class LightningTrainerDistributed(L.LightningWork):
-    @staticmethod
-    def run():
+    def run(self):
         model = BoringModel()
         trainer = L.Trainer(max_epochs=10, strategy="ddp")
         trainer.fit(model)
diff --git a/docs/source-app/levels/basic/hello_components/pt_multinode.py b/docs/source-app/levels/basic/hello_components/pt_multinode.py
index 585b85540bf61..86bd7da10c6ff 100644
--- a/docs/source-app/levels/basic/hello_components/pt_multinode.py
+++ b/docs/source-app/levels/basic/hello_components/pt_multinode.py
@@ -22,8 +22,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no
     # 2. PREPARE DISTRIBUTED MODEL
     model = torch.nn.Linear(32, 2)
     device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-    device_ids = device if torch.cuda.is_available() else None
-    model = DistributedDataParallel(model, device_ids=device_ids).to(device)
+    model = DistributedDataParallel(model, device_ids=[local_rank]).to(device)
 
     # 3. SETUP LOSS AND OPTIMIZER
     criterion = torch.nn.MSELoss()
diff --git a/docs/source-app/levels/basic/hero_components.rst b/docs/source-app/levels/basic/hero_components.rst
index 6bb8947a1a9cd..f67ed8610710e 100644
--- a/docs/source-app/levels/basic/hero_components.rst
+++ b/docs/source-app/levels/basic/hero_components.rst
@@ -1,7 +1,7 @@
 .. lit_tabs::
    :titles: Hello world; Hello GPU world; PyTorch & ⚡⚡⚡ Trainer (1+ cloud GPUs); Train PyTorch (cloud GPU); Train PyTorch (32 cloud GPUs); Deploy a model on cloud GPUs; Run a model script; XGBoost; Streamlit demo
    :code_files: /levels/basic/hello_components/hello_world.py; /levels/basic/hello_components/hello_world_gpu.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/train_pytorch.py; /levels/basic/hello_components/pt_multinode.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/run_ptl_script.py; /levels/basic/hello_components/xgboost.py; /levels/basic/hello_components/streamlit_demo.py
-   :highlights: 7; 10, 11; 10-12, 17, 18; 4, 8, 12, 18-19, 26; 5, 10, 22, 28, 32, 42, 58-60; 3, 11-13, 25, 30; 7, 10; 15, 21; 9, 15, 24
+   :highlights: 7; 10, 11; 9-11, 16, 17; 4, 8, 12, 18-19, 26; 5, 10, 22, 27, 31, 41, 57-59; 3, 11-13, 25, 30; 7, 10; 15, 21; 9, 15, 24
    :app_id: abc123
    :tab_rows: 3
    :height: 620px
diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
deleted file mode 100644
index c9e2f62392a56..0000000000000
--- a/examples/app_multi_node/train_lt.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import lightning as L
-from lightning.app.components import LightningTrainerMultiNode
-from lightning.pytorch.demos.boring_classes import BoringModel
-
-
-class LightningTrainerDistributed(L.LightningWork):
-    def run(self):
-        model = BoringModel()
-        trainer = L.Trainer(
-            max_steps=1000,
-            strategy="ddp",
-        )
-        trainer.fit(model)
-
-
-# Run over 2 nodes of 4 x V100
-app = L.LightningApp(
-    LightningTrainerMultiNode(
-        LightningTrainerDistributed,
-        num_nodes=2,
-        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
-    )
-)
diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
new file mode 120000
index 0000000000000..22828b03303f5
--- /dev/null
+++ b/examples/app_multi_node/train_lt.py
@@ -0,0 +1 @@
+pl_multinode.py
\ No newline at end of file
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
deleted file mode 100644
index 9ce662fa40009..0000000000000
--- a/examples/app_multi_node/train_pytorch.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import torch
-from torch.nn.parallel.distributed import DistributedDataParallel
-
-import lightning as L
-from lightning.app.components import MultiNode
-
-
-def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int):
-    # 1. Setting distributed environment
-    global_rank = local_rank + node_rank * nprocs
-    world_size = num_nodes * nprocs
-
-    if torch.distributed.is_available() and not torch.distributed.is_initialized():
-        torch.distributed.init_process_group(
-            "nccl" if torch.cuda.is_available() else "gloo",
-            rank=global_rank,
-            world_size=world_size,
-            init_method=f"tcp://{main_address}:{main_port}",
-        )
-
-    # 2. Prepare the model
-    model = torch.nn.Sequential(
-        torch.nn.Linear(1, 1),
-        torch.nn.ReLU(),
-        torch.nn.Linear(1, 1),
-    )
-
-    # 3. Setup distributed training
-    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-    model = DistributedDataParallel(model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None)
-
-    # 4. Prepare loss and optimizer
-    criterion = torch.nn.MSELoss()
-    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-
-    # 5. Train the model for 1000 steps.
-    for step in range(1000):
-        model.zero_grad()
-        x = torch.tensor([0.8]).to(device)
-        target = torch.tensor([1.0]).to(device)
-        output = model(x)
-        loss = criterion(output, target)
-        print(f"global_rank: {global_rank} step: {step} loss: {loss}")
-        loss.backward()
-        optimizer.step()
-
-
-class PyTorchDistributed(L.LightningWork):
-    def run(
-        self,
-        main_address: str,
-        main_port: int,
-        num_nodes: int,
-        node_rank: int,
-    ):
-        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
-        torch.multiprocessing.spawn(
-            distributed_train, args=(main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs
-        )
-
-
-# Run over 2 nodes of 4 x V100
-app = L.LightningApp(
-    MultiNode(
-        PyTorchDistributed,
-        num_nodes=2,
-        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
-    )
-)
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
new file mode 120000
index 0000000000000..8c99aaa164339
--- /dev/null
+++ b/examples/app_multi_node/train_pytorch.py
@@ -0,0 +1 @@
+pt_multinode.py
\ No newline at end of file

From 04835afec34c32d7890661273b517e4c8ab91d7c Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Thu, 17 Nov 2022 11:20:32 +0100
Subject: [PATCH 2/7] Ignore dangling symlinks in example copy

---
 docs/source-app/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py
index b636b00c6116f..e659b1b6a00bf 100644
--- a/docs/source-app/conf.py
+++ b/docs/source-app/conf.py
@@ -293,7 +293,7 @@ def setup(app):
     for path_app_example in glob.glob(os.path.join(_PATH_ROOT, "examples", "app_*")):
         path_app_example2 = os.path.join(path_examples, os.path.basename(path_app_example))
         if not os.path.isdir(path_app_example2):
-            shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True)
+            shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True, ignore_dangling_symlinks=True)
 
 
 # Ignoring Third-party packages

From c6399991078142de8b84155808ad2c7868597aa2 Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Fri, 18 Nov 2022 01:05:06 +0100
Subject: [PATCH 3/7] Fix

---
 .../levels/basic/real_lightning_component_implementations.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-app/levels/basic/real_lightning_component_implementations.rst b/docs/source-app/levels/basic/real_lightning_component_implementations.rst
index da413f459234a..268517463c612 100644
--- a/docs/source-app/levels/basic/real_lightning_component_implementations.rst
+++ b/docs/source-app/levels/basic/real_lightning_component_implementations.rst
@@ -26,7 +26,7 @@ or cloud GPUs without code changes.
 .. lit_tabs::
    :descriptions: import Lightning; We're using a demo LightningModule; Move your training code here (usually your main.py); Pass your component to the multi-node executor (it works on CPU or single GPUs also); Select the number of machines (nodes). Here we choose 2.; Choose from over 15+ machine types. This one has 4 v100 GPUs.; Initialize the App object that executes the component logic.
    :code_files: /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py;
-   :highlights: 2; 4; 10-12; 15-18; 17; 18; 20
+   :highlights: 2; 4; 9-11; 14-17; 16; 17; 19
    :enable_run: true
    :tab_rows: 5
    :height: 420px

From 389b75f166eb932d5c3e820be10a1d540e176c81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Fri, 18 Nov 2022 19:58:10 +0100
Subject: [PATCH 4/7] fix job path

---
 .github/workflows/ci-app-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml
index 826dfc70b552f..8ddbf2a5ddb8d 100644
--- a/.github/workflows/ci-app-tests.yml
+++ b/.github/workflows/ci-app-tests.yml
@@ -11,7 +11,7 @@
       - ".github/workflows/ci-app-tests.yml"
       - "src/lightning_app/**"
       - "tests/tests_app/**"
-      - "examples/app_*"  # some tests_app tests call examples files
+      - "examples/app_*/**"  # some tests_app tests call examples files
       - "requirements/app/**"
       - "setup.py"
       - ".actions/**"

From e621d5dee19797a82bfc419d3288fed13a7e77dc Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Mon, 21 Nov 2022 10:18:35 +0000
Subject: [PATCH 5/7] Remove symlink

---
 examples/app_multi_node/train_lt.py      | 21 +++++++-
 examples/app_multi_node/train_pytorch.py | 62 +++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 2 deletions(-)
 mode change 120000 => 100644 examples/app_multi_node/train_lt.py
 mode change 120000 => 100644 examples/app_multi_node/train_pytorch.py

diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
deleted file mode 120000
index 22828b03303f5..0000000000000
--- a/examples/app_multi_node/train_lt.py
+++ /dev/null
@@ -1 +0,0 @@
-pl_multinode.py
\ No newline at end of file
diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
new file mode 100644
index 0000000000000..4abe375c89b9b
--- /dev/null
+++ b/examples/app_multi_node/train_lt.py
@@ -0,0 +1,20 @@
+# app.py
+import lightning as L
+from lightning.app.components import LightningTrainerMultiNode
+from lightning.pytorch.demos.boring_classes import BoringModel
+
+
+class LightningTrainerDistributed(L.LightningWork):
+    def run(self):
+        model = BoringModel()
+        trainer = L.Trainer(max_epochs=10, strategy="ddp")
+        trainer.fit(model)
+
+
+# 8 GPU: (2 nodes of 4 x v100)
+component = LightningTrainerMultiNode(
+    LightningTrainerDistributed,
+    num_nodes=4,
+    cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x v100
+)
+app = L.LightningApp(component)
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
deleted file mode 120000
index 8c99aaa164339..0000000000000
--- a/examples/app_multi_node/train_pytorch.py
+++ /dev/null
@@ -1 +0,0 @@
-pt_multinode.py
\ No newline at end of file
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
new file mode 100644
index 0000000000000..2bcb33dbbc079
--- /dev/null
+++ b/examples/app_multi_node/train_pytorch.py
@@ -0,0 +1,61 @@
+# app.py
+# ! pip install torch
+import torch
+from torch.nn.parallel.distributed import DistributedDataParallel
+
+import lightning as L
+from lightning.app.components import MultiNode
+
+
+def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int):
+    # 1. SET UP DISTRIBUTED ENVIRONMENT
+    global_rank = local_rank + node_rank * nprocs
+    world_size = num_nodes * nprocs
+
+    if torch.distributed.is_available() and not torch.distributed.is_initialized():
+        torch.distributed.init_process_group(
+            "nccl" if torch.cuda.is_available() else "gloo",
+            rank=global_rank,
+            world_size=world_size,
+            init_method=f"tcp://{main_address}:{main_port}",
+        )
+
+    # 2. PREPARE DISTRIBUTED MODEL
+    model = torch.nn.Linear(32, 2)
+    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
+    model = DistributedDataParallel(model, device_ids=[local_rank]).to(device)
+
+    # 3. SETUP LOSS AND OPTIMIZER
+    criterion = torch.nn.MSELoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+
+    # 4. TRAIN THE MODEL FOR 50 STEPS
+    for step in range(50):
+        model.zero_grad()
+        x = torch.randn(64, 32).to(device)
+        output = model(x)
+        loss = criterion(output, torch.ones_like(output))
+        print(f"global_rank: {global_rank} step: {step} loss: {loss}")
+        loss.backward()
+        optimizer.step()
+
+    # 5. VERIFY ALL COPIES OF THE MODEL HAVE THE SAME WEIGHTS AT END OF TRAINING
+    weight = model.module.weight.clone()
+    torch.distributed.all_reduce(weight)
+    assert torch.equal(model.module.weight, weight / world_size)
+
+    print("Multi Node Distributed Training Done!")
+
+
+class PyTorchDistributed(L.LightningWork):
+    def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int):
+        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
+        torch.multiprocessing.spawn(
+            distributed_train, args=(main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs
+        )
+
+
+# 32 GPUs: (8 nodes of 4 x v100)
+compute = L.CloudCompute("gpu-fast-multi")  # 4 x v100
+component = MultiNode(PyTorchDistributed, num_nodes=8, cloud_compute=compute)
+app = L.LightningApp(component)

From b1acd6950130e5216d3bff57c025d35ad66cf5e5 Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Mon, 21 Nov 2022 10:20:48 +0000
Subject: [PATCH 6/7] 2 nodes

---
 examples/app_multi_node/train_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
index 2bcb33dbbc079..9599bce5bbd85 100644
--- a/examples/app_multi_node/train_pytorch.py
+++ b/examples/app_multi_node/train_pytorch.py
@@ -57,5 +57,5 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int)
 
 # 32 GPUs: (8 nodes of 4 x v100)
 compute = L.CloudCompute("gpu-fast-multi")  # 4 x v100
-component = MultiNode(PyTorchDistributed, num_nodes=8, cloud_compute=compute)
+component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute)
 app = L.LightningApp(component)

From f649a7e5e2e7999ad9e96a1d40be89ecd98bb179 Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Mon, 21 Nov 2022 10:22:37 +0000
Subject: [PATCH 7/7] Revert not needed

---
 docs/source-app/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py
index e659b1b6a00bf..b636b00c6116f 100644
--- a/docs/source-app/conf.py
+++ b/docs/source-app/conf.py
@@ -293,7 +293,7 @@ def setup(app):
     for path_app_example in glob.glob(os.path.join(_PATH_ROOT, "examples", "app_*")):
"app_*")): path_app_example2 = os.path.join(path_examples, os.path.basename(path_app_example)) if not os.path.isdir(path_app_example2): - shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True, ignore_dangling_symlinks=True) + shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True) # Ignoring Third-party packages