From cbe5092de0ae279507817215644aa764ec9368ee Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Mon, 21 Nov 2022 14:24:56 +0000 Subject: [PATCH 1/4] [App] Fix multi-node pytorch example CI --- examples/app_multi_node/train_pytorch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py index 9599bce5bbd85..6beeac0f04b2b 100644 --- a/examples/app_multi_node/train_pytorch.py +++ b/examples/app_multi_node/train_pytorch.py @@ -23,7 +23,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no # 2. PREPARE DISTRIBUTED MODEL model = torch.nn.Linear(32, 2) device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu") - model = DistributedDataParallel(model, device_ids=[local_rank]).to(device) + model = DistributedDataParallel(model, device_ids=[local_rank] if torch.cuda.is_available() else None).to(device) # 3. SETUP LOSS AND OPTIMIZER criterion = torch.nn.MSELoss() @@ -55,7 +55,7 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int) ) -# 32 GPUs: (8 nodes x 4 v 100) +# 32 GPUs: (2 nodes x 4 v 100) compute = L.CloudCompute("gpu-fast-multi") # 4xV100 component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute) app = L.LightningApp(component) From 7a8a94de91d70dd7eb51475e270dc40f1f89f314 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Mon, 21 Nov 2022 14:31:53 +0000 Subject: [PATCH 2/4] Update train_pytorch.py --- examples/app_multi_node/train_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py index 6beeac0f04b2b..cc9e84297c151 100644 --- a/examples/app_multi_node/train_pytorch.py +++ b/examples/app_multi_node/train_pytorch.py @@ -55,7 +55,7 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int) ) -# 32 GPUs: (2 nodes x 4 v 100) +# 8 GPUs: (2 nodes x 4 v 100) compute = L.CloudCompute("gpu-fast-multi") # 4xV100 component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute) app = L.LightningApp(component) From ddd83c96f4bcdf41cf8309348b367177dad9cf55 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Mon, 21 Nov 2022 14:59:45 +0000 Subject: [PATCH 3/4] Fix docs --- docs/source-app/levels/basic/hello_components/pt_multinode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-app/levels/basic/hello_components/pt_multinode.py b/docs/source-app/levels/basic/hello_components/pt_multinode.py index 86bd7da10c6ff..8b39c74cdcc2e 100644 --- a/docs/source-app/levels/basic/hello_components/pt_multinode.py +++ b/docs/source-app/levels/basic/hello_components/pt_multinode.py @@ -22,7 +22,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no # 2. PREPARE DISTRIBUTED MODEL model = torch.nn.Linear(32, 2) device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu") - model = DistributedDataParallel(model, device_ids=[local_rank]).to(device) + model = DistributedDataParallel(model, device_ids=[local_rank] if torch.cuda.is_available() else None).to(device) # 3. SETUP LOSS AND OPTIMIZER criterion = torch.nn.MSELoss() From a14346bf74687ee6e7310359437adf8bf596de25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 21 Nov 2022 16:43:36 +0100 Subject: [PATCH 4/4] Trigger CI --- .github/checkgroup.yml | 2 +- .github/workflows/ci-app-examples.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 923694c31687d..71c1d875abb60 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -246,7 +246,7 @@ subprojects: - ".github/workflows/ci-app-examples.yml" - "src/lightning_app/**" - "tests/tests_app_examples/**" - - "examples/app_*" + - "examples/app_*/**" - "requirements/app/**" - "setup.py" - ".actions/**" diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 88eadcfd920f8..9646efa27db66 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -11,7 +11,7 @@ on: - ".github/workflows/ci-app-examples.yml" - "src/lightning_app/**" - "tests/tests_app_examples/**" - - "examples/app_*" + - "examples/app_*/**" - "requirements/app/**" - "setup.py" - ".actions/**"