
Commit 6d7c01b

areshytko, oreshytko, tchaton, Borda, and awaelchli committed
[docs] Add docs for non-SLURM cluster setup (#5754)
* Add docs for non-slurm cluster setup

* Apply suggestions from code review

Co-authored-by: Adrian Wälchli <[email protected]>

* Update docs/source/cluster.rst

Co-authored-by: Jirka Borovec <[email protected]>

* Update docs/source/cluster.rst

Co-authored-by: Alexander <[email protected]>
Co-authored-by: chaton <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: Rohit Gupta <[email protected]>
1 parent 834f4bb commit 6d7c01b

File tree

4 files changed: +243 -14 lines changed


docs/source/accelerators.rst

Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
.. _accelerators:

############
Accelerators
############
Accelerators connect a Lightning Trainer to arbitrary hardware (CPUs, GPUs, TPUs, etc.). Accelerators
also manage the distributed training modes used on that hardware (such as DP, DDP, or an HPC cluster).

Accelerators can also be configured to run on arbitrary clusters using Plugins, or to link up to arbitrary
computational strategies such as 16-bit precision via AMP and Apex.

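For most users an accelerator is selected through Trainer flags rather than subclassed directly. A minimal sketch combining the distributed and precision options mentioned above, using the flag values accepted by the Trainer at the time of this commit:

.. code-block:: python

    from pytorch_lightning import Trainer

    # DDP across 2 GPUs with 16-bit (AMP) precision
    trainer = Trainer(gpus=2, accelerator='ddp', precision=16)
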
----------

******************************
Implement a custom accelerator
******************************
To link up arbitrary hardware, implement your own Accelerator subclass:

.. code-block:: python

    from typing import Any, Optional, Union

    import torch
    from torch.distributed import ReduceOp

    from pytorch_lightning.accelerators.accelerator import Accelerator


    class MyAccelerator(Accelerator):

        def __init__(self, trainer, cluster_environment=None):
            super().__init__(trainer, cluster_environment)
            self.nickname = 'my_accelerator'

        def setup(self):
            # find local rank, etc., custom things to implement
            ...

        def train(self):
            # implement what happens during training
            ...

        def training_step(self):
            # implement how to do a training_step on this accelerator
            ...

        def validation_step(self):
            # implement how to do a validation_step on this accelerator
            ...

        def test_step(self):
            # implement how to do a test_step on this accelerator
            ...

        def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
            # implement how to do a backward pass with this accelerator
            ...

        def barrier(self, name: Optional[str] = None):
            # implement this accelerator's barrier
            ...

        def broadcast(self, obj, src=0):
            # implement this accelerator's broadcast function
            ...

        def sync_tensor(self,
                        tensor: Union[torch.Tensor],
                        group: Optional[Any] = None,
                        reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
            # implement how to sync tensors when reducing metrics across accelerators
            ...

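As a concrete illustration, a ``torch.distributed``-backed subclass of the ``MyAccelerator`` skeleton above might fill in ``barrier`` and ``sync_tensor`` roughly as follows. This is only a sketch built on the standard ``torch.distributed`` collectives, not the implementation shipped with Lightning:

.. code-block:: python

    from typing import Any, Optional, Union

    import torch
    import torch.distributed as dist
    from torch.distributed import ReduceOp


    class MyDistributedAccelerator(MyAccelerator):

        def barrier(self, name: Optional[str] = None):
            # wait for every process in the default process group
            if dist.is_available() and dist.is_initialized():
                dist.barrier()

        def sync_tensor(self,
                        tensor: torch.Tensor,
                        group: Optional[Any] = None,
                        reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
            # sum-reduce across all processes unless a ReduceOp is given explicitly
            if dist.is_available() and dist.is_initialized():
                op = reduce_op if isinstance(reduce_op, ReduceOp) else ReduceOp.SUM
                dist.all_reduce(tensor, op=op, group=group)
            return tensor
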
********
Examples
********
The following examples illustrate customizing accelerators.

Example 1: Arbitrary HPC cluster
================================
To link any accelerator with an arbitrary cluster (SLURM, Condor, etc.), pass in a Cluster Plugin, which the Trainer
will hand to the accelerator.

First, implement your own ClusterEnvironment. Here is the Torch Elastic implementation.

.. code-block:: python

    import os

    from pytorch_lightning import _logger as log
    from pytorch_lightning.utilities import rank_zero_warn
    from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment


    class TorchElasticEnvironment(ClusterEnvironment):

        def __init__(self):
            super().__init__()

        def master_address(self):
            if "MASTER_ADDR" not in os.environ:
                rank_zero_warn(
                    "MASTER_ADDR environment variable is not defined. Set as localhost"
                )
                os.environ["MASTER_ADDR"] = "127.0.0.1"
            log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
            master_address = os.environ.get('MASTER_ADDR')
            return master_address

        def master_port(self):
            if "MASTER_PORT" not in os.environ:
                rank_zero_warn(
                    "MASTER_PORT environment variable is not defined. Set as 12910"
                )
                os.environ["MASTER_PORT"] = "12910"
            log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

            port = os.environ.get('MASTER_PORT')
            return port

        def world_size(self):
            return os.environ.get('WORLD_SIZE')

        def local_rank(self):
            return int(os.environ['LOCAL_RANK'])

Now, pass it into the Trainer, which will use Torch Elastic across your accelerator of choice.

.. code-block:: python

    cluster = TorchElasticEnvironment()
    accelerator = MyAccelerator()
    trainer = Trainer(plugins=[cluster], accelerator=accelerator)

In this example, MyAccelerator can target arbitrary hardware (like IPUs or TPUs) and link it to an arbitrary
compute cluster.

------------

**********************
Available Accelerators
**********************

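Each of the accelerators below is normally selected through the Trainer rather than instantiated by hand. A minimal sketch using string values accepted by the ``accelerator`` flag at the time of this commit:

.. code-block:: python

    from pytorch_lightning import Trainer

    # multi-GPU training with DistributedDataParallel, one spawned process per GPU
    trainer = Trainer(gpus=4, accelerator='ddp_spawn')

    # CPU-only distributed training with several processes on a single machine
    trainer = Trainer(num_processes=4, accelerator='ddp_cpu')

    # Horovod-based training (requires Horovod to be installed)
    trainer = Trainer(gpus=1, accelerator='horovod')
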
CPU Accelerator
===============

.. autoclass:: pytorch_lightning.accelerators.cpu_accelerator.CPUAccelerator
    :noindex:

DDP Accelerator
===============

.. autoclass:: pytorch_lightning.accelerators.ddp_accelerator.DDPAccelerator
    :noindex:

DDP2 Accelerator
================

.. autoclass:: pytorch_lightning.accelerators.ddp2_accelerator.DDP2Accelerator
    :noindex:

DDP CPU HPC Accelerator
=======================

.. autoclass:: pytorch_lightning.accelerators.ddp_cpu_hpc_accelerator.DDPCPUHPCAccelerator
    :noindex:

DDP CPU Spawn Accelerator
=========================

.. autoclass:: pytorch_lightning.accelerators.ddp_cpu_spawn_accelerator.DDPCPUSpawnAccelerator
    :noindex:

DDP HPC Accelerator
===================

.. autoclass:: pytorch_lightning.accelerators.ddp_hpc_accelerator.DDPHPCAccelerator
    :noindex:

DDP Spawn Accelerator
=====================

.. autoclass:: pytorch_lightning.accelerators.ddp_spawn_accelerator.DDPSpawnAccelerator
    :noindex:

GPU Accelerator
===============

.. autoclass:: pytorch_lightning.accelerators.gpu_accelerator.GPUAccelerator
    :noindex:

Horovod Accelerator
===================

.. autoclass:: pytorch_lightning.accelerators.horovod_accelerator.HorovodAccelerator
    :noindex:

TPU Accelerator
===============

.. autoclass:: pytorch_lightning.accelerators.tpu_accelerator.TPUAccelerator
    :noindex:

docs/source/advanced/cluster.rst

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
.. _non-slurm:

Computing cluster
=================

With Lightning it is easy to run your training script on a computing cluster with almost no modifications to the script.
This guide shows how to run a training job on a general-purpose cluster.

Also, check :ref:`accelerators` for a newer and more general approach to cluster setup.

--------


Cluster setup
-------------

To set up a multi-node computing cluster you need:

1) Multiple computers with PyTorch Lightning installed
2) Network connectivity between them, with firewall rules that allow traffic flow on a specified *MASTER_PORT*
3) The environment variables required for PyTorch Lightning multi-node distributed training defined on each node

PyTorch Lightning follows the design of the `PyTorch distributed communication package <https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization>`_ and requires the following environment variables to be defined on each node:

- *MASTER_PORT* - required; must be a free port on the machine with NODE_RANK 0
- *MASTER_ADDR* - required (except on the NODE_RANK 0 node); address of the NODE_RANK 0 node
- *WORLD_SIZE* - required; the number of nodes in the cluster
- *NODE_RANK* - required; the ID of the node within the cluster

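Since a typo in any of these variables usually surfaces as a hang at startup, it can help to fail fast. A small illustrative check (not part of Lightning) that could run at the top of the training script:

.. code-block:: python

    import os

    # illustrative sanity check for the variables listed above;
    # MASTER_ADDR may be omitted on the NODE_RANK 0 machine itself
    required = ["MASTER_PORT", "WORLD_SIZE", "NODE_RANK"]
    if os.environ.get("NODE_RANK", "0") != "0":
        required.append("MASTER_ADDR")
    missing = [name for name in required if name not in os.environ]
    if missing:
        raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")
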
Training script design
----------------------

To train a model using multiple nodes, do the following:

1. Design your :ref:`lightning_module` (no need to add anything specific here).

2. Enable DDP in the Trainer:

.. code-block:: python

    # train on 32 GPUs across 4 nodes
    trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp')

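Putting the two steps together, a multi-node training script can stay as small as a single-machine one. A minimal sketch, assuming a hypothetical ``MyLightningModule`` defined elsewhere in your project:

.. code-block:: python

    # train.py - run unchanged on every node of the cluster
    from pytorch_lightning import Trainer

    from my_project import MyLightningModule  # hypothetical LightningModule


    def main():
        model = MyLightningModule()

        # 8 GPUs per node on 4 nodes -> 32 processes in total
        trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp')
        trainer.fit(model)


    if __name__ == '__main__':
        main()
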
Submit a job to the cluster
---------------------------

To submit a training job to the cluster, you need to run the same training script on each node of the cluster.
This means that you need to:

1. Copy all third-party libraries to each node (usually this means distributing a requirements.txt file and installing it).

2. Copy all your import dependencies and the script itself to each node.

3. Run the script on each node.

docs/source/conf.py

Lines changed: 0 additions & 14 deletions
@@ -369,23 +369,11 @@ def package_list_from_file(file):
 # only run doctests marked with a ".. doctest::" directive
 doctest_test_doctest_blocks = ''
 doctest_global_setup = """
-
 import importlib
 import os
 import torch
 from torch import nn
-
 import pytorch_lightning as pl
-<<<<<<< HEAD
-from pytorch_lightning import LightningDataModule, LightningModule, Trainer
-from pytorch_lightning.utilities import (
-    _NATIVE_AMP_AVAILABLE,
-    _APEX_AVAILABLE,
-    _XLA_AVAILABLE,
-    _TPU_AVAILABLE,
-)
-_TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
-=======
 from pytorch_lightning import LightningModule, Trainer
 from pytorch_lightning.utilities import (
     NATIVE_AMP_AVAILABLE,
@@ -395,7 +383,5 @@ def package_list_from_file(file):
     _module_available,
 )
 TORCHVISION_AVAILABLE = _module_available("torchvision")
->>>>>>> d71659b4 (Fix docs typo (#4930))
-
 """
 coverage_skip_undoc_in_source = True

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ PyTorch Lightning Documentation
    advanced/training_tricks
    advanced/transfer_learning
    advanced/tpu
+   advanced/cluster
    common/test_set
    common/production_inference
