
Commit 0a71fe2

CI: black docs (#8566)

* black docs

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

1 parent aadd2a9 commit 0a71fe2

31 files changed: +303 -336 lines

.github/CONTRIBUTING.md

Lines changed: 2 additions & 4 deletions

@@ -137,6 +137,7 @@ See following short example of a sample function taking one position string and
 ```python
 from typing import Optional
 
+
 def my_func(param_a: int, param_b: Optional[float] = None) -> str:
     """Sample function.
@@ -310,10 +311,7 @@ def test_explain_what_is_being_tested(tmpdir):
     # BoringModel is a functional model. You might want to set methods to None to test your behaviour
     # Example: model.training_step_end = None
 
-    trainer = Trainer(
-        default_root_dir=tmpdir,  # will save everything within a tmpdir generated for this test
-        ...
-    )
+    trainer = Trainer(default_root_dir=tmpdir, ...)  # will save everything within a tmpdir generated for this test
     trainer.fit(model)
     trainer.test()  # [OPTIONAL]
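For readers following the contributing guide, the test pattern in the hunk above reads roughly as follows when written out in full. This is a minimal sketch, not part of the commit: the `BoringModel` import path and the `fast_dev_run=True` flag standing in for the guide's `...` are assumptions (the helper has moved between Lightning releases), so adjust them to your checkout.

```python
from pytorch_lightning import Trainer
from tests.helpers import BoringModel  # assumed import path for the Lightning test helper


def test_explain_what_is_being_tested(tmpdir):
    """Sketch of the recommended test layout from CONTRIBUTING.md."""
    model = BoringModel()
    # BoringModel is a functional model. You might want to set methods to None to test your behaviour
    # Example: model.training_step_end = None

    # fast_dev_run=True is an assumed stand-in for the "..." in the guide
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)  # saves everything within the tmpdir generated for this test
    trainer.fit(model)
    trainer.test(model)  # [OPTIONAL]
```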

.pre-commit-config.yaml

Lines changed: 7 additions & 0 deletions

@@ -64,6 +64,13 @@ repos:
       - id: black
         name: Format code
 
+  - repo: https://github.com/asottile/blacken-docs
+    rev: v1.10.0
+    hooks:
+      - id: blacken-docs
+        args: [ --line-length=120 ]
+        additional_dependencies: [ black==21.7b0 ]
+
   - repo: https://github.com/PyCQA/flake8
     rev: 3.9.2
     hooks:
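The hook added above is what produced every documentation change in this commit: blacken-docs runs black (pinned to 21.7b0, with the configured 120-character line length) over the Python snippets embedded in the Markdown and reStructuredText sources. A minimal sketch of the kind of rewrite it makes, modelled on the cluster docs below (the `Trainer` arguments are illustrative only and assume a multi-GPU setup):

```python
from pytorch_lightning import Trainer

# Before: the call is split across several lines and uses single-quoted strings.
trainer = Trainer(
    gpus=8,
    num_nodes=4,
    accelerator='ddp'
)

# After blacken-docs (black, --line-length=120): one line, double-quoted strings.
trainer = Trainer(gpus=8, num_nodes=4, accelerator="ddp")
```

With the hook installed, `pre-commit run blacken-docs --all-files` reproduces the same formatting locally before pushing.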

README.md

Lines changed: 1 addition & 2 deletions

@@ -165,7 +165,6 @@ A LightningModule defines a full *system* (ie: a GAN, autoencoder, BERT or a sim
 
 ```python
 class LitAutoEncoder(pl.LightningModule):
-
     def __init__(self):
         super().__init__()
         self.encoder = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 3))
@@ -183,7 +182,7 @@ class LitAutoEncoder(pl.LightningModule):
         z = self.encoder(x)
         x_hat = self.decoder(z)
         loss = F.mse_loss(x_hat, x)
-        self.log('train_loss', loss)
+        self.log("train_loss", loss)
         return loss
 
     def configure_optimizers(self):

docs/source/advanced/ipu.rst

Lines changed: 8 additions & 8 deletions

@@ -34,7 +34,7 @@ Specify the number of IPUs to train with. Note that when training with IPUs, you
 
 .. code-block:: python
 
-    trainer = pl.Trainer(ipus=8) # Train using data parallel on 8 IPUs
+    trainer = pl.Trainer(ipus=8)  # Train using data parallel on 8 IPUs
 
 IPUs only support specifying a single number to allocate devices, which is handled via the underlying libraries.
 
@@ -102,10 +102,7 @@ Note that by default we return the last device iteration loss. You can override
     training_opts.anchorMode(poptorch.AnchorMode.All)
     training_opts.deviceIterations(32)
 
-    trainer = Trainer(
-        ipus=8,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
+    trainer = Trainer(ipus=8, plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts))
     trainer.fit(model)
 
 You can also override all options by passing the ``poptorch.Options`` to the plugin. See `PopTorch options documentation <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/batching.html>`__ for more information.
@@ -127,7 +124,7 @@ Lightning supports dumping all reports to a directory to open using the tool.
     from pytorch_lightning.plugins import IPUPlugin
 
     model = MyLightningModule()
-    trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(autoreport_dir='report_dir/'))
+    trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(autoreport_dir="report_dir/"))
     trainer.fit(model)
 
 This will dump all reports to ``report_dir/`` which can then be opened using the Graph Analyser Tool, see `Opening Reports <https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html#opening-reports>`__.
@@ -155,8 +152,8 @@ Below is an example using the block annotation in a LightningModule.
     import pytorch_lightning as pl
     import poptorch
 
-    class MyLightningModule(pl.LightningModule):
 
+    class MyLightningModule(pl.LightningModule):
         def __init__(self):
             super().__init__()
             # This will place layer1, layer2+layer3, layer4, softmax on different IPUs at runtime.
@@ -175,6 +172,7 @@ Below is an example using the block annotation in a LightningModule.
 
     ...
 
+
     model = MyLightningModule()
     trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(device_iterations=20))
     trainer.fit(model)
@@ -187,8 +185,8 @@ You can also use the block context manager within the forward function, or any o
     import pytorch_lightning as pl
     import poptorch
 
-    class MyLightningModule(pl.LightningModule):
 
+    class MyLightningModule(pl.LightningModule):
         def __init__(self):
             super().__init__()
             self.layer1 = torch.nn.Linear(5, 10)
@@ -214,8 +212,10 @@ You can also use the block context manager within the forward function, or any o
             with poptorch.Block(ipu_id=3):
                 x = self.softmax(x)
             return x
+
     ...
 
+
     model = MyLightningModule()
     trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(device_iterations=20))
     trainer.fit(model)

docs/source/advanced/lr_finder.rst

Lines changed: 2 additions & 2 deletions

@@ -46,13 +46,13 @@ which can be accessed via ``self.learning_rate`` or ``self.lr``.
 .. code-block:: python
 
     class LitModel(LightningModule):
-
         def __init__(self, learning_rate):
             self.learning_rate = learning_rate
 
         def configure_optimizers(self):
             return Adam(self.parameters(), lr=(self.lr or self.learning_rate))
 
+
     model = LitModel()
 
     # finds learning rate automatically
@@ -68,7 +68,7 @@ If your model is using an arbitrary value instead of ``self.lr`` or ``self.learn
     model = LitModel()
 
     # to set to your own hparams.my_value
-    trainer = Trainer(auto_lr_find='my_value')
+    trainer = Trainer(auto_lr_find="my_value")
 
     trainer.tune(model)
 

docs/source/advanced/sequences.rst

Lines changed: 2 additions & 5 deletions

@@ -13,8 +13,8 @@ Lightning can handle TBTT automatically via this flag.
 
     from pytorch_lightning import LightningModule
 
-    class MyModel(LightningModule):
 
+    class MyModel(LightningModule):
         def __init__(self):
             super().__init__()
             # Important: This property activates truncated backpropagation through time
@@ -26,10 +26,7 @@ Lightning can handle TBTT automatically via this flag.
             # the training step must be updated to accept a ``hiddens`` argument
             # hiddens are the hiddens from the previous truncated backprop step
             out, hiddens = self.lstm(data, hiddens)
-            return {
-                "loss": ...,
-                "hiddens": hiddens
-            }
+            return {"loss": ..., "hiddens": hiddens}
 
 .. note:: If you need to modify how the batch is split,
     override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`.

docs/source/advanced/tpu.rst

Lines changed: 4 additions & 15 deletions

@@ -92,29 +92,18 @@ for TPU use
 
     import torch_xla.core.xla_model as xm
 
+
     def train_dataloader(self):
-        dataset = MNIST(
-            os.getcwd(),
-            train=True,
-            download=True,
-            transform=transforms.ToTensor()
-        )
+        dataset = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())
 
         # required for TPU support
         sampler = None
         if use_tpu:
             sampler = torch.utils.data.distributed.DistributedSampler(
-                dataset,
-                num_replicas=xm.xrt_world_size(),
-                rank=xm.get_ordinal(),
-                shuffle=True
+                dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True
             )
 
-        loader = DataLoader(
-            dataset,
-            sampler=sampler,
-            batch_size=32
-        )
+        loader = DataLoader(dataset, sampler=sampler, batch_size=32)
 
         return loader
 

docs/source/advanced/training_tricks.rst

Lines changed: 3 additions & 3 deletions

@@ -46,7 +46,7 @@ If the Trainer's ``gradient_clip_algorithm`` is set to ``'value'`` (``'norm'`` b
     trainer = Trainer(gradient_clip_val=0.5)  # gradient_clip_algorithm='norm' by default
 
     # clip gradients' maximum magnitude to <=0.5
-    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value')
+    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm="value")
 
 ----------
 
@@ -82,7 +82,7 @@ longer training time. Inspired by https://github.com/BlackHC/toma.
     trainer = Trainer(auto_scale_batch_size=None)
 
     # Autoscale batch size
-    trainer = Trainer(auto_scale_batch_size=None|'power'|'binsearch')
+    trainer = Trainer(auto_scale_batch_size=None | "power" | "binsearch")
 
     # find the batch size
     trainer.tune(model)
@@ -107,7 +107,7 @@ search for batch sizes larger than the size of the training dataset.
 .. code-block:: python
 
     def train_dataloader(self):
-        return DataLoader(train_dataset, batch_size=self.batch_size|self.hparams.batch_size)
+        return DataLoader(train_dataset, batch_size=self.batch_size | self.hparams.batch_size)
 
 .. warning::
 

docs/source/advanced/transfer_learning.rst

Lines changed: 5 additions & 6 deletions

@@ -21,11 +21,13 @@ Let's use the `AutoEncoder` as a feature extractor in a separate model.
     class Encoder(torch.nn.Module):
         ...
 
+
     class AutoEncoder(LightningModule):
         def __init__(self):
             self.encoder = Encoder()
             self.decoder = Decoder()
 
+
     class CIFAR10Classifier(LightningModule):
         def __init__(self):
             # init the pretrained LightningModule
@@ -50,6 +52,7 @@ Example: Imagenet (computer Vision)
 
     import torchvision.models as models
 
+
     class ImagenetTransferLearning(LightningModule):
         def __init__(self):
             super().__init__()
@@ -102,20 +105,16 @@ Here's a model that uses `Huggingface transformers <https://github.com/huggingfa
 .. testcode::
 
     class BertMNLIFinetuner(LightningModule):
-
         def __init__(self):
             super().__init__()
 
-            self.bert = BertModel.from_pretrained('bert-base-cased', output_attentions=True)
+            self.bert = BertModel.from_pretrained("bert-base-cased", output_attentions=True)
             self.W = nn.Linear(bert.config.hidden_size, 3)
             self.num_classes = 3
 
-
         def forward(self, input_ids, attention_mask, token_type_ids):
 
-            h, _, attn = self.bert(input_ids=input_ids,
-                                   attention_mask=attention_mask,
-                                   token_type_ids=token_type_ids)
+            h, _, attn = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
 
             h_cls = h[:, 0]
             logits = self.W(h_cls)

docs/source/clouds/cluster.rst

Lines changed: 15 additions & 22 deletions

@@ -52,7 +52,7 @@ To train a model using multiple nodes, do the following:
 .. code-block:: python
 
     # train on 32 GPUs across 4 nodes
-    trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp')
+    trainer = Trainer(gpus=8, num_nodes=4, accelerator="ddp")
 
 
 Submit a job to the cluster
@@ -91,7 +91,7 @@ To train a model using multiple nodes, do the following:
 .. code-block:: python
 
     # train on 32 GPUs across 4 nodes
-    trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp')
+    trainer = Trainer(gpus=8, num_nodes=4, accelerator="ddp")
 
 3. It's a good idea to structure your training script like this:
 
@@ -101,16 +101,12 @@ To train a model using multiple nodes, do the following:
     def main(hparams):
         model = LightningTemplateModel(hparams)
 
-        trainer = Trainer(
-            gpus=8,
-            num_nodes=4,
-            accelerator='ddp'
-        )
+        trainer = Trainer(gpus=8, num_nodes=4, accelerator="ddp")
 
         trainer.fit(model)
 
 
-    if __name__ == '__main__':
+    if __name__ == "__main__":
         root_dir = os.path.dirname(os.path.realpath(__file__))
         parent_parser = ArgumentParser(add_help=False)
         hyperparams = parser.parse_args()
@@ -197,45 +193,42 @@ See also the multi-node examples
     # grid search 3 values of learning rate and 3 values of number of layers for your net
     # this generates 9 experiments (lr=1e-3, layers=16), (lr=1e-3, layers=32),
    # (lr=1e-3, layers=64), ... (lr=1e-1, layers=64)
-    parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
-    parser.opt_list('--learning_rate', default=0.001, type=float,
-                    options=[1e-3, 1e-2, 1e-1], tunable=True)
-    parser.opt_list('--layers', default=1, type=float, options=[16, 32, 64], tunable=True)
+    parser = HyperOptArgumentParser(strategy="grid_search", add_help=False)
+    parser.opt_list("--learning_rate", default=0.001, type=float, options=[1e-3, 1e-2, 1e-1], tunable=True)
+    parser.opt_list("--layers", default=1, type=float, options=[16, 32, 64], tunable=True)
     hyperparams = parser.parse_args()
 
     # Slurm cluster submits 9 jobs, each with a set of hyperparams
     cluster = SlurmCluster(
         hyperparam_optimizer=hyperparams,
-        log_path='/some/path/to/save',
+        log_path="/some/path/to/save",
     )
 
     # OPTIONAL FLAGS WHICH MAY BE CLUSTER DEPENDENT
     # which interface your nodes use for communication
-    cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
+    cluster.add_command("export NCCL_SOCKET_IFNAME=^docker0,lo")
 
     # see the output of the NCCL connection process
     # NCCL is how the nodes talk to each other
-    cluster.add_command('export NCCL_DEBUG=INFO')
+    cluster.add_command("export NCCL_DEBUG=INFO")
 
     # setting a master port here is a good idea.
-    cluster.add_command('export MASTER_PORT=%r' % PORT)
+    cluster.add_command("export MASTER_PORT=%r" % PORT)
 
     # ************** DON'T FORGET THIS ***************
     # MUST load the latest NCCL version
-    cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
+    cluster.load_modules(["NCCL/2.4.7-1-cuda.10.0"])
 
     # configure cluster
     cluster.per_experiment_nb_nodes = 12
     cluster.per_experiment_nb_gpus = 8
 
-    cluster.add_slurm_cmd(cmd='ntasks-per-node', value=8, comment='1 task per gpu')
+    cluster.add_slurm_cmd(cmd="ntasks-per-node", value=8, comment="1 task per gpu")
 
     # submit a script with 9 combinations of hyper params
     # (lr=1e-3, layers=16), (lr=1e-3, layers=32), (lr=1e-3, layers=64), ... (lr=1e-1, layers=64)
     cluster.optimize_parallel_cluster_gpu(
-        main,
-        nb_trials=9,  # how many permutations of the grid search to run
-        job_name='name_for_squeue'
+        main, nb_trials=9, job_name="name_for_squeue"  # how many permutations of the grid search to run
     )
 
@@ -259,8 +252,8 @@ and node rank (node id). Here is an example of a custom
     import os
     from pytorch_lightning.plugins.environments import ClusterEnvironment
 
-    class MyClusterEnvironment(ClusterEnvironment):
 
+    class MyClusterEnvironment(ClusterEnvironment):
         def creates_children(self) -> bool:
             # return True if the cluster is managed (you don't launch processes yourself)
             return True