From af9a001720a6bde600a1b816263ff407468bc895 Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Wed, 16 Nov 2022 16:16:37 +0100
Subject: [PATCH 1/7] Symlink / update multi-node examples

---
 .../basic/hello_components/pl_multinode.py |  3 +-
 .../basic/hello_components/pt_multinode.py |  3 +-
 .../levels/basic/hero_components.rst       |  2 +-
 examples/app_multi_node/train_lt.py        | 24 +------
 examples/app_multi_node/train_pytorch.py   | 70 +------------------
 5 files changed, 5 insertions(+), 97 deletions(-)
 mode change 100644 => 120000 examples/app_multi_node/train_lt.py
 mode change 100644 => 120000 examples/app_multi_node/train_pytorch.py

diff --git a/docs/source-app/levels/basic/hello_components/pl_multinode.py b/docs/source-app/levels/basic/hello_components/pl_multinode.py
index 44db267160069..a9131f41168fb 100644
--- a/docs/source-app/levels/basic/hello_components/pl_multinode.py
+++ b/docs/source-app/levels/basic/hello_components/pl_multinode.py
@@ -5,8 +5,7 @@
 
 
 class LightningTrainerDistributed(L.LightningWork):
-    @staticmethod
-    def run():
+    def run(self):
         model = BoringModel()
         trainer = L.Trainer(max_epochs=10, strategy="ddp")
         trainer.fit(model)
diff --git a/docs/source-app/levels/basic/hello_components/pt_multinode.py b/docs/source-app/levels/basic/hello_components/pt_multinode.py
index 585b85540bf61..86bd7da10c6ff 100644
--- a/docs/source-app/levels/basic/hello_components/pt_multinode.py
+++ b/docs/source-app/levels/basic/hello_components/pt_multinode.py
@@ -22,8 +22,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no
     # 2. PREPARE DISTRIBUTED MODEL
     model = torch.nn.Linear(32, 2)
     device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-    device_ids = device if torch.cuda.is_available() else None
-    model = DistributedDataParallel(model, device_ids=device_ids).to(device)
+    model = DistributedDataParallel(model, device_ids=[local_rank]).to(device)
 
     # 3. SETUP LOSS AND OPTIMIZER
     criterion = torch.nn.MSELoss()
diff --git a/docs/source-app/levels/basic/hero_components.rst b/docs/source-app/levels/basic/hero_components.rst
index 6bb8947a1a9cd..f67ed8610710e 100644
--- a/docs/source-app/levels/basic/hero_components.rst
+++ b/docs/source-app/levels/basic/hero_components.rst
@@ -1,7 +1,7 @@
 .. lit_tabs::
    :titles: Hello world; Hello GPU world; PyTorch & ⚡⚡⚡ Trainer (1+ cloud GPUs); Train PyTorch (cloud GPU); Train PyTorch (32 cloud GPUs); Deploy a model on cloud GPUs; Run a model script; XGBoost; Streamlit demo
    :code_files: /levels/basic/hello_components/hello_world.py; /levels/basic/hello_components/hello_world_gpu.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/train_pytorch.py; /levels/basic/hello_components/pt_multinode.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/run_ptl_script.py; /levels/basic/hello_components/xgboost.py; /levels/basic/hello_components/streamlit_demo.py
-   :highlights: 7; 10, 11; 10-12, 17, 18; 4, 8, 12, 18-19, 26; 5, 10, 22, 28, 32, 42, 58-60; 3, 11-13, 25, 30; 7, 10; 15, 21; 9, 15, 24
+   :highlights: 7; 10, 11; 9-11, 16, 17; 4, 8, 12, 18-19, 26; 5, 10, 22, 27, 31, 41, 57-59; 3, 11-13, 25, 30; 7, 10; 15, 21; 9, 15, 24
    :app_id: abc123
    :tab_rows: 3
    :height: 620px
diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
deleted file mode 100644
index c9e2f62392a56..0000000000000
--- a/examples/app_multi_node/train_lt.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import lightning as L
-from lightning.app.components import LightningTrainerMultiNode
-from lightning.pytorch.demos.boring_classes import BoringModel
-
-
-class LightningTrainerDistributed(L.LightningWork):
-    def run(self):
-        model = BoringModel()
-        trainer = L.Trainer(
-            max_steps=1000,
-            strategy="ddp",
-        )
-        trainer.fit(model)
-
-
-# Run over 2 nodes of 4 x V100
-app = L.LightningApp(
-    LightningTrainerMultiNode(
-        LightningTrainerDistributed,
-        num_nodes=2,
-        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
-    )
-)
diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
new file mode 120000
index 0000000000000..22828b03303f5
--- /dev/null
+++ b/examples/app_multi_node/train_lt.py
@@ -0,0 +1 @@
+pl_multinode.py
\ No newline at end of file
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
deleted file mode 100644
index 9ce662fa40009..0000000000000
--- a/examples/app_multi_node/train_pytorch.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import torch
-from torch.nn.parallel.distributed import DistributedDataParallel
-
-import lightning as L
-from lightning.app.components import MultiNode
-
-
-def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int):
-    # 1. Setting distributed environment
-    global_rank = local_rank + node_rank * nprocs
-    world_size = num_nodes * nprocs
-
-    if torch.distributed.is_available() and not torch.distributed.is_initialized():
-        torch.distributed.init_process_group(
-            "nccl" if torch.cuda.is_available() else "gloo",
-            rank=global_rank,
-            world_size=world_size,
-            init_method=f"tcp://{main_address}:{main_port}",
-        )
-
-    # 2. Prepare the model
-    model = torch.nn.Sequential(
-        torch.nn.Linear(1, 1),
-        torch.nn.ReLU(),
-        torch.nn.Linear(1, 1),
-    )
-
-    # 3. Setup distributed training
-    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-    model = DistributedDataParallel(model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None)
-
-    # 4. Prepare loss and optimizer
-    criterion = torch.nn.MSELoss()
-    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-
-    # 5. Train the model for 1000 steps.
-    for step in range(1000):
-        model.zero_grad()
-        x = torch.tensor([0.8]).to(device)
-        target = torch.tensor([1.0]).to(device)
-        output = model(x)
-        loss = criterion(output, target)
-        print(f"global_rank: {global_rank} step: {step} loss: {loss}")
-        loss.backward()
-        optimizer.step()
-
-
-class PyTorchDistributed(L.LightningWork):
-    def run(
-        self,
-        main_address: str,
-        main_port: int,
-        num_nodes: int,
-        node_rank: int,
-    ):
-        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
-        torch.multiprocessing.spawn(
-            distributed_train, args=(main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs
-        )
-
-
-# Run over 2 nodes of 4 x V100
-app = L.LightningApp(
-    MultiNode(
-        PyTorchDistributed,
-        num_nodes=2,
-        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
-    )
-)
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
new file mode 120000
index 0000000000000..8c99aaa164339
--- /dev/null
+++ b/examples/app_multi_node/train_pytorch.py
@@ -0,0 +1 @@
+pt_multinode.py
\ No newline at end of file

From 04835afec34c32d7890661273b517e4c8ab91d7c Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Thu, 17 Nov 2022 11:20:32 +0100
Subject: [PATCH 2/7] Ignore dangling symlinks in example copy

---
 docs/source-app/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py
index b636b00c6116f..e659b1b6a00bf 100644
--- a/docs/source-app/conf.py
+++ b/docs/source-app/conf.py
@@ -293,7 +293,7 @@ def setup(app):
     for path_app_example in glob.glob(os.path.join(_PATH_ROOT, "examples", "app_*")):
         path_app_example2 = os.path.join(path_examples, os.path.basename(path_app_example))
         if not os.path.isdir(path_app_example2):
-            shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True)
+            shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True, ignore_dangling_symlinks=True)
 
 
 # Ignoring Third-party packages

From c6399991078142de8b84155808ad2c7868597aa2 Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Fri, 18 Nov 2022 01:05:06 +0100
Subject: [PATCH 3/7] Fix

---
 .../levels/basic/real_lightning_component_implementations.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-app/levels/basic/real_lightning_component_implementations.rst b/docs/source-app/levels/basic/real_lightning_component_implementations.rst
index da413f459234a..268517463c612 100644
--- a/docs/source-app/levels/basic/real_lightning_component_implementations.rst
+++ b/docs/source-app/levels/basic/real_lightning_component_implementations.rst
@@ -26,7 +26,7 @@ or cloud GPUs without code changes.
 .. lit_tabs::
    :descriptions: import Lightning; We're using a demo LightningModule; Move your training code here (usually your main.py); Pass your component to the multi-node executor (it works on CPU or single GPUs also); Select the number of machines (nodes). Here we choose 2.; Choose from over 15+ machine types. This one has 4 v100 GPUs.; Initialize the App object that executes the component logic.
    :code_files: /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py;
-   :highlights: 2; 4; 10-12; 15-18; 17; 18; 20
+   :highlights: 2; 4; 9-11; 14-17; 16; 17; 19
    :enable_run: true
    :tab_rows: 5
    :height: 420px

From 389b75f166eb932d5c3e820be10a1d540e176c81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Fri, 18 Nov 2022 19:58:10 +0100
Subject: [PATCH 4/7] fix job path

---
 .github/workflows/ci-app-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml
index 826dfc70b552f..8ddbf2a5ddb8d 100644
--- a/.github/workflows/ci-app-tests.yml
+++ b/.github/workflows/ci-app-tests.yml
@@ -11,7 +11,7 @@
       - ".github/workflows/ci-app-tests.yml"
       - "src/lightning_app/**"
       - "tests/tests_app/**"
-      - "examples/app_*"  # some tests_app tests call examples files
+      - "examples/app_*/**"  # some tests_app tests call examples files
       - "requirements/app/**"
       - "setup.py"
       - ".actions/**"

From e621d5dee19797a82bfc419d3288fed13a7e77dc Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Mon, 21 Nov 2022 10:18:35 +0000
Subject: [PATCH 5/7] Remove symlink

---
 examples/app_multi_node/train_lt.py      | 21 +++++++-
 examples/app_multi_node/train_pytorch.py | 62 +++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 2 deletions(-)
 mode change 120000 => 100644 examples/app_multi_node/train_lt.py
 mode change 120000 => 100644 examples/app_multi_node/train_pytorch.py

diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
deleted file mode 120000
index 22828b03303f5..0000000000000
--- a/examples/app_multi_node/train_lt.py
+++ /dev/null
@@ -1 +0,0 @@
-pl_multinode.py
\ No newline at end of file
diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
new file mode 100644
index 0000000000000..4abe375c89b9b
--- /dev/null
+++ b/examples/app_multi_node/train_lt.py
@@ -0,0 +1,20 @@
+# app.py
+import lightning as L
+from lightning.app.components import LightningTrainerMultiNode
+from lightning.pytorch.demos.boring_classes import BoringModel
+
+
+class LightningTrainerDistributed(L.LightningWork):
+    def run(self):
+        model = BoringModel()
+        trainer = L.Trainer(max_epochs=10, strategy="ddp")
+        trainer.fit(model)
+
+
+# 8 GPU: (2 nodes of 4 x v100)
+component = LightningTrainerMultiNode(
+    LightningTrainerDistributed,
+    num_nodes=4,
+    cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x v100
+)
+app = L.LightningApp(component)
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
deleted file mode 120000
index 8c99aaa164339..0000000000000
--- a/examples/app_multi_node/train_pytorch.py
+++ /dev/null
@@ -1 +0,0 @@
-pt_multinode.py
\ No newline at end of file
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
new file mode 100644
index 0000000000000..2bcb33dbbc079
--- /dev/null
+++ b/examples/app_multi_node/train_pytorch.py
@@ -0,0 +1,61 @@
+# app.py
+# ! pip install torch
+import torch
+from torch.nn.parallel.distributed import DistributedDataParallel
+
+import lightning as L
+from lightning.app.components import MultiNode
+
+
+def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int):
+    # 1. SET UP DISTRIBUTED ENVIRONMENT
+    global_rank = local_rank + node_rank * nprocs
+    world_size = num_nodes * nprocs
+
+    if torch.distributed.is_available() and not torch.distributed.is_initialized():
+        torch.distributed.init_process_group(
+            "nccl" if torch.cuda.is_available() else "gloo",
+            rank=global_rank,
+            world_size=world_size,
+            init_method=f"tcp://{main_address}:{main_port}",
+        )
+
+    # 2. PREPARE DISTRIBUTED MODEL
+    model = torch.nn.Linear(32, 2)
+    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
+    model = DistributedDataParallel(model, device_ids=[local_rank]).to(device)
+
+    # 3. SETUP LOSS AND OPTIMIZER
+    criterion = torch.nn.MSELoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+
+    # 4. TRAIN THE MODEL FOR 50 STEPS
+    for step in range(50):
+        model.zero_grad()
+        x = torch.randn(64, 32).to(device)
+        output = model(x)
+        loss = criterion(output, torch.ones_like(output))
+        print(f"global_rank: {global_rank} step: {step} loss: {loss}")
+        loss.backward()
+        optimizer.step()
+
+    # 5. VERIFY ALL COPIES OF THE MODEL HAVE THE SAME WEIGHTS AT END OF TRAINING
+    weight = model.module.weight.clone()
+    torch.distributed.all_reduce(weight)
+    assert torch.equal(model.module.weight, weight / world_size)
+
+    print("Multi Node Distributed Training Done!")
+
+
+class PyTorchDistributed(L.LightningWork):
+    def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int):
+        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
+        torch.multiprocessing.spawn(
+            distributed_train, args=(main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs
+        )
+
+
+# 32 GPUs: (8 nodes of 4 x v100)
+compute = L.CloudCompute("gpu-fast-multi")  # 4 x v100
+component = MultiNode(PyTorchDistributed, num_nodes=8, cloud_compute=compute)
+app = L.LightningApp(component)

From b1acd6950130e5216d3bff57c025d35ad66cf5e5 Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Mon, 21 Nov 2022 10:20:48 +0000
Subject: [PATCH 6/7] 2 nodes

---
 examples/app_multi_node/train_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
index 2bcb33dbbc079..9599bce5bbd85 100644
--- a/examples/app_multi_node/train_pytorch.py
+++ b/examples/app_multi_node/train_pytorch.py
@@ -57,5 +57,5 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int)
 
 # 32 GPUs: (8 nodes of 4 x v100)
 compute = L.CloudCompute("gpu-fast-multi")  # 4 x v100
-component = MultiNode(PyTorchDistributed, num_nodes=8, cloud_compute=compute)
+component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute)
 app = L.LightningApp(component)

From f649a7e5e2e7999ad9e96a1d40be89ecd98bb179 Mon Sep 17 00:00:00 2001
From: Ethan Harris
Date: Mon, 21 Nov 2022 10:22:37 +0000
Subject: [PATCH 7/7] Revert not needed

---
 docs/source-app/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py
index e659b1b6a00bf..b636b00c6116f 100644
--- a/docs/source-app/conf.py
+++ b/docs/source-app/conf.py
@@ -293,7 +293,7 @@ def setup(app):
     for path_app_example in glob.glob(os.path.join(_PATH_ROOT, "examples", "app_*")):
"app_*")): path_app_example2 = os.path.join(path_examples, os.path.basename(path_app_example)) if not os.path.isdir(path_app_example2): - shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True, ignore_dangling_symlinks=True) + shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True) # Ignoring Third-party packages