6 changes: 2 additions & 4 deletions .github/CONTRIBUTING.md
@@ -137,6 +137,7 @@ See following short example of a sample function taking one position string and
```python
from typing import Optional


def my_func(param_a: int, param_b: Optional[float] = None) -> str:
"""Sample function.

@@ -310,10 +311,7 @@ def test_explain_what_is_being_tested(tmpdir):
# BoringModel is a functional model. You might want to set methods to None to test your behaviour
# Example: model.training_step_end = None

trainer = Trainer(
default_root_dir=tmpdir, # will save everything within a tmpdir generated for this test
...
)
trainer = Trainer(default_root_dir=tmpdir, ...) # will save everything within a tmpdir generated for this test
trainer.fit(model)
trainer.test() # [OPTIONAL]

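For reference, the test pattern touched in this hunk can be assembled into a self-contained sketch along the lines below. The inline ``BoringModel`` is only a minimal stand-in for the helper shipped in the repository's test suite, and the final assertion is a placeholder for whatever behaviour a real test would verify:

```python
import torch
from torch.utils.data import DataLoader

from pytorch_lightning import LightningModule, Trainer


class BoringModel(LightningModule):
    # minimal stand-in for the BoringModel helper from the test suite
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def train_dataloader(self):
        # a plain tensor works as a map-style dataset for this toy case
        return DataLoader(torch.randn(64, 32), batch_size=2)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


def test_explain_what_is_being_tested(tmpdir):
    """Hypothetical behaviour under test: a single fast_dev_run completes."""
    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)  # everything is written to the tmpdir fixture
    trainer.fit(model)
    assert trainer.global_step > 0  # placeholder assertion; check whatever the test actually targets
```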
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
@@ -64,6 +64,13 @@ repos:
- id: black
name: Format code

- repo: https://github.com/asottile/blacken-docs
rev: v1.10.0
hooks:
- id: blacken-docs
args: [ --line-length=120 ]
additional_dependencies: [ black==21.7b0 ]

- repo: https://github.com/PyCQA/flake8
rev: 3.9.2
hooks:
3 changes: 1 addition & 2 deletions README.md
@@ -165,7 +165,6 @@ A LightningModule defines a full *system* (ie: a GAN, autoencoder, BERT or a sim

```python
class LitAutoEncoder(pl.LightningModule):

def __init__(self):
super().__init__()
self.encoder = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 3))
@@ -183,7 +182,7 @@ class LitAutoEncoder(pl.LightningModule):
z = self.encoder(x)
x_hat = self.decoder(z)
loss = F.mse_loss(x_hat, x)
self.log('train_loss', loss)
self.log("train_loss", loss)
return loss

def configure_optimizers(self):
16 changes: 8 additions & 8 deletions docs/source/advanced/ipu.rst
@@ -34,7 +34,7 @@ Specify the number of IPUs to train with. Note that when training with IPUs, you

.. code-block:: python

trainer = pl.Trainer(ipus=8) # Train using data parallel on 8 IPUs
trainer = pl.Trainer(ipus=8) # Train using data parallel on 8 IPUs

IPUs only support specifying a single number to allocate devices, which is handled via the underlying libraries.

@@ -102,10 +102,7 @@ Note that by default we return the last device iteration loss. You can override
training_opts.anchorMode(poptorch.AnchorMode.All)
training_opts.deviceIterations(32)

trainer = Trainer(
ipus=8,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
trainer = Trainer(ipus=8, plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts))
trainer.fit(model)

You can also override all options by passing the ``poptorch.Options`` to the plugin. See `PopTorch options documentation <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/batching.html>`__ for more information.
@@ -127,7 +124,7 @@ Lightning supports dumping all reports to a directory to open using the tool.
from pytorch_lightning.plugins import IPUPlugin

model = MyLightningModule()
trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(autoreport_dir='report_dir/'))
trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(autoreport_dir="report_dir/"))
trainer.fit(model)

This will dump all reports to ``report_dir/`` which can then be opened using the Graph Analyser Tool, see `Opening Reports <https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html#opening-reports>`__.
@@ -155,8 +152,8 @@ Below is an example using the block annotation in a LightningModule.
import pytorch_lightning as pl
import poptorch

class MyLightningModule(pl.LightningModule):

class MyLightningModule(pl.LightningModule):
def __init__(self):
super().__init__()
# This will place layer1, layer2+layer3, layer4, softmax on different IPUs at runtime.
@@ -175,6 +172,7 @@ Below is an example using the block annotation in a LightningModule.

...


model = MyLightningModule()
trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(device_iterations=20))
trainer.fit(model)
@@ -187,8 +185,8 @@ You can also use the block context manager within the forward function, or any o
import pytorch_lightning as pl
import poptorch

class MyLightningModule(pl.LightningModule):

class MyLightningModule(pl.LightningModule):
def __init__(self):
super().__init__()
self.layer1 = torch.nn.Linear(5, 10)
@@ -214,8 +212,10 @@ You can also use the block context manager within the forward function, or any o
with poptorch.Block(ipu_id=3):
x = self.softmax(x)
return x

...


model = MyLightningModule()
trainer = pl.Trainer(ipus=8, plugins=IPUPlugin(device_iterations=20))
trainer.fit(model)
4 changes: 2 additions & 2 deletions docs/source/advanced/lr_finder.rst
@@ -46,13 +46,13 @@ which can be accessed via ``self.learning_rate`` or ``self.lr``.
.. code-block:: python

class LitModel(LightningModule):

def __init__(self, learning_rate):
self.learning_rate = learning_rate

def configure_optimizers(self):
return Adam(self.parameters(), lr=(self.lr or self.learning_rate))


model = LitModel()

# finds learning rate automatically
@@ -68,7 +68,7 @@ If your model is using an arbitrary value instead of ``self.lr`` or ``self.learn
model = LitModel()

# to set to your own hparams.my_value
trainer = Trainer(auto_lr_find='my_value')
trainer = Trainer(auto_lr_find="my_value")

trainer.tune(model)

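As a complementary sketch (not part of this diff), the ``auto_lr_find="my_value"`` flow from the hunk above can be assembled into a runnable module. The attribute name ``my_value``, the toy data, and the single linear layer are illustrative assumptions:

```python
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

from pytorch_lightning import LightningModule, Trainer


class LitModel(LightningModule):
    def __init__(self, my_value):
        super().__init__()
        self.my_value = my_value  # arbitrary attribute name instead of self.lr / self.learning_rate
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self.layer(x), y)

    def train_dataloader(self):
        x, y = torch.randn(256, 32), torch.randint(0, 2, (256,))
        return DataLoader(TensorDataset(x, y), batch_size=32)

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.my_value)


model = LitModel(my_value=1e-3)
trainer = Trainer(auto_lr_find="my_value", max_epochs=1)
trainer.tune(model)  # runs the LR finder and writes the suggestion back to model.my_value
print(model.my_value)
```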
7 changes: 2 additions & 5 deletions docs/source/advanced/sequences.rst
@@ -13,8 +13,8 @@ Lightning can handle TBTT automatically via this flag.

from pytorch_lightning import LightningModule

class MyModel(LightningModule):

class MyModel(LightningModule):
def __init__(self):
super().__init__()
# Important: This property activates truncated backpropagation through time
@@ -26,10 +26,7 @@ Lightning can handle TBTT automatically via this flag.
# the training step must be updated to accept a ``hiddens`` argument
# hiddens are the hiddens from the previous truncated backprop step
out, hiddens = self.lstm(data, hiddens)
return {
"loss": ...,
"hiddens": hiddens
}
return {"loss": ..., "hiddens": hiddens}

.. note:: If you need to modify how the batch is split,
override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`.
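For completeness (outside this diff), a sketch of such an override, assuming the ``tbptt_split_batch(self, batch, split_size)`` hook signature and a ``(sequence, target)`` batch with time on dimension 1; the tensor sizes are arbitrary:

```python
import torch
from pytorch_lightning import LightningModule


class MyModel(LightningModule):
    def __init__(self):
        super().__init__()
        # activates truncated backpropagation through time, 2 steps per split
        self.truncated_bptt_steps = 2
        self.lstm = torch.nn.LSTM(input_size=10, hidden_size=10, batch_first=True)

    def training_step(self, batch, batch_idx, hiddens):
        # ``hiddens`` carries the LSTM state from the previous truncated chunk (None for the first chunk)
        x, y = batch  # assumed shapes: (batch, time, 10) for both tensors
        out, hiddens = self.lstm(x, hiddens)
        loss = torch.nn.functional.mse_loss(out, y)
        return {"loss": loss, "hiddens": hiddens}

    def tbptt_split_batch(self, batch, split_size):
        # split both tensors along the time dimension (dim 1); each chunk becomes one TBTT step
        x, y = batch
        return [
            (x[:, t : t + split_size], y[:, t : t + split_size])
            for t in range(0, x.size(1), split_size)
        ]

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)
```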
19 changes: 4 additions & 15 deletions docs/source/advanced/tpu.rst
@@ -92,29 +92,18 @@ for TPU use

import torch_xla.core.xla_model as xm


def train_dataloader(self):
dataset = MNIST(
os.getcwd(),
train=True,
download=True,
transform=transforms.ToTensor()
)
dataset = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())

# required for TPU support
sampler = None
if use_tpu:
sampler = torch.utils.data.distributed.DistributedSampler(
dataset,
num_replicas=xm.xrt_world_size(),
rank=xm.get_ordinal(),
shuffle=True
dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True
)

loader = DataLoader(
dataset,
sampler=sampler,
batch_size=32
)
loader = DataLoader(dataset, sampler=sampler, batch_size=32)

return loader

6 changes: 3 additions & 3 deletions docs/source/advanced/training_tricks.rst
@@ -46,7 +46,7 @@ If the Trainer's ``gradient_clip_algorithm`` is set to ``'value'`` (``'norm'`` b
trainer = Trainer(gradient_clip_val=0.5) # gradient_clip_algorithm='norm' by default

# clip gradients' maximum magnitude to <=0.5
trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value')
trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm="value")

----------

@@ -82,7 +82,7 @@ longer training time. Inspired by https://github.com/BlackHC/toma.
trainer = Trainer(auto_scale_batch_size=None)

# Autoscale batch size
trainer = Trainer(auto_scale_batch_size=None|'power'|'binsearch')
trainer = Trainer(auto_scale_batch_size=None | "power" | "binsearch")

# find the batch size
trainer.tune(model)
@@ -107,7 +107,7 @@ search for batch sizes larger than the size of the training dataset.
.. code-block:: python

def train_dataloader(self):
return DataLoader(train_dataset, batch_size=self.batch_size|self.hparams.batch_size)
return DataLoader(train_dataset, batch_size=self.batch_size | self.hparams.batch_size)

.. warning::

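For reference (not part of the diff), the pieces above — a module exposing ``self.batch_size``, a dataloader that reads it, and ``auto_scale_batch_size`` — fit together roughly as follows; the toy dataset and layer are illustrative assumptions:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pytorch_lightning import LightningModule, Trainer


class LitModel(LightningModule):
    def __init__(self, batch_size=32):
        super().__init__()
        self.batch_size = batch_size  # the attribute the batch-size finder overwrites
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self.layer(x), y)

    def train_dataloader(self):
        x, y = torch.randn(1024, 32), torch.randint(0, 2, (1024,))
        # the dataloader reads the (possibly tuned) batch size from the module
        return DataLoader(TensorDataset(x, y), batch_size=self.batch_size)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


model = LitModel()
trainer = Trainer(auto_scale_batch_size="binsearch", max_epochs=1)
trainer.tune(model)  # runs the scaler and writes the result back to model.batch_size
trainer.fit(model)
```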
11 changes: 5 additions & 6 deletions docs/source/advanced/transfer_learning.rst
@@ -21,11 +21,13 @@ Let's use the `AutoEncoder` as a feature extractor in a separate model.
class Encoder(torch.nn.Module):
...


class AutoEncoder(LightningModule):
def __init__(self):
self.encoder = Encoder()
self.decoder = Decoder()


class CIFAR10Classifier(LightningModule):
def __init__(self):
# init the pretrained LightningModule
@@ -50,6 +52,7 @@ Example: Imagenet (computer Vision)

import torchvision.models as models


class ImagenetTransferLearning(LightningModule):
def __init__(self):
super().__init__()
@@ -102,20 +105,16 @@ Here's a model that uses `Huggingface transformers <https://github.com/huggingfa
.. testcode::

class BertMNLIFinetuner(LightningModule):

def __init__(self):
super().__init__()

self.bert = BertModel.from_pretrained('bert-base-cased', output_attentions=True)
self.bert = BertModel.from_pretrained("bert-base-cased", output_attentions=True)
self.W = nn.Linear(bert.config.hidden_size, 3)
self.num_classes = 3


def forward(self, input_ids, attention_mask, token_type_ids):

h, _, attn = self.bert(input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids)
h, _, attn = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

h_cls = h[:, 0]
logits = self.W(h_cls)
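As a usage note (not shown in this diff), a module like ``ImagenetTransferLearning`` above can be fine-tuned and later restored for feature extraction; ``load_from_checkpoint`` and ``freeze`` are standard LightningModule methods, the checkpoint path is a placeholder, and the dataloaders are assumed to be defined on the module:

```python
from pytorch_lightning import Trainer

# fine-tune the pretrained backbone on the new task
model = ImagenetTransferLearning()
trainer = Trainer(max_epochs=5)
trainer.fit(model)

# later: restore the fine-tuned weights and reuse the module with gradients disabled
model = ImagenetTransferLearning.load_from_checkpoint("/path/to/finetuned.ckpt")  # placeholder path
model.freeze()  # puts the module in eval mode and sets requires_grad=False on all parameters
```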
37 changes: 15 additions & 22 deletions docs/source/clouds/cluster.rst
@@ -52,7 +52,7 @@ To train a model using multiple nodes, do the following:
.. code-block:: python

# train on 32 GPUs across 4 nodes
trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp')
trainer = Trainer(gpus=8, num_nodes=4, accelerator="ddp")


Submit a job to the cluster
@@ -91,7 +91,7 @@ To train a model using multiple nodes, do the following:
.. code-block:: python

# train on 32 GPUs across 4 nodes
trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp')
trainer = Trainer(gpus=8, num_nodes=4, accelerator="ddp")

3. It's a good idea to structure your training script like this:

@@ -101,16 +101,12 @@ To train a model using multiple nodes, do the following:
def main(hparams):
model = LightningTemplateModel(hparams)

trainer = Trainer(
gpus=8,
num_nodes=4,
accelerator='ddp'
)
trainer = Trainer(gpus=8, num_nodes=4, accelerator="ddp")

trainer.fit(model)


if __name__ == '__main__':
if __name__ == "__main__":
root_dir = os.path.dirname(os.path.realpath(__file__))
parent_parser = ArgumentParser(add_help=False)
hyperparams = parser.parse_args()
@@ -197,45 +193,42 @@ See also the multi-node examples
# grid search 3 values of learning rate and 3 values of number of layers for your net
# this generates 9 experiments (lr=1e-3, layers=16), (lr=1e-3, layers=32),
# (lr=1e-3, layers=64), ... (lr=1e-1, layers=64)
parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
parser.opt_list('--learning_rate', default=0.001, type=float,
options=[1e-3, 1e-2, 1e-1], tunable=True)
parser.opt_list('--layers', default=1, type=float, options=[16, 32, 64], tunable=True)
parser = HyperOptArgumentParser(strategy="grid_search", add_help=False)
parser.opt_list("--learning_rate", default=0.001, type=float, options=[1e-3, 1e-2, 1e-1], tunable=True)
parser.opt_list("--layers", default=1, type=float, options=[16, 32, 64], tunable=True)
hyperparams = parser.parse_args()

# Slurm cluster submits 9 jobs, each with a set of hyperparams
cluster = SlurmCluster(
hyperparam_optimizer=hyperparams,
log_path='/some/path/to/save',
log_path="/some/path/to/save",
)

# OPTIONAL FLAGS WHICH MAY BE CLUSTER DEPENDENT
# which interface your nodes use for communication
cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')
cluster.add_command("export NCCL_SOCKET_IFNAME=^docker0,lo")

# see the output of the NCCL connection process
# NCCL is how the nodes talk to each other
cluster.add_command('export NCCL_DEBUG=INFO')
cluster.add_command("export NCCL_DEBUG=INFO")

# setting a master port here is a good idea.
cluster.add_command('export MASTER_PORT=%r' % PORT)
cluster.add_command("export MASTER_PORT=%r" % PORT)

# ************** DON'T FORGET THIS ***************
# MUST load the latest NCCL version
cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])
cluster.load_modules(["NCCL/2.4.7-1-cuda.10.0"])

# configure cluster
cluster.per_experiment_nb_nodes = 12
cluster.per_experiment_nb_gpus = 8

cluster.add_slurm_cmd(cmd='ntasks-per-node', value=8, comment='1 task per gpu')
cluster.add_slurm_cmd(cmd="ntasks-per-node", value=8, comment="1 task per gpu")

# submit a script with 9 combinations of hyper params
# (lr=1e-3, layers=16), (lr=1e-3, layers=32), (lr=1e-3, layers=64), ... (lr=1e-1, layers=64)
cluster.optimize_parallel_cluster_gpu(
main,
nb_trials=9, # how many permutations of the grid search to run
job_name='name_for_squeue'
main, nb_trials=9, job_name="name_for_squeue" # how many permutations of the grid search to run
)


@@ -259,8 +252,8 @@ and node rank (node id). Here is an example of a custom
import os
from pytorch_lightning.plugins.environments import ClusterEnvironment

class MyClusterEnvironment(ClusterEnvironment):

class MyClusterEnvironment(ClusterEnvironment):
def creates_children(self) -> bool:
# return True if the cluster is managed (you don't launch processes yourself)
return True
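To complete the picture beyond what this hunk shows, a sketch of a full custom ``ClusterEnvironment``. The environment variable names are assumptions about what the cluster manager exports, and depending on the Lightning release additional setters (e.g. ``set_world_size`` / ``set_global_rank``) may also have to be implemented:

```python
import os

from pytorch_lightning import Trainer
from pytorch_lightning.plugins.environments import ClusterEnvironment


class MyClusterEnvironment(ClusterEnvironment):
    def creates_children(self) -> bool:
        # return True if the cluster is managed (you don't launch processes yourself)
        return True

    def world_size(self) -> int:
        return int(os.environ["WORLD_SIZE"])

    def global_rank(self) -> int:
        return int(os.environ["RANK"])

    def local_rank(self) -> int:
        return int(os.environ["LOCAL_RANK"])

    def node_rank(self) -> int:
        return int(os.environ["NODE_RANK"])

    def master_address(self) -> str:
        return os.environ["MASTER_ADDR"]  # whichever variable your cluster exports

    def master_port(self) -> int:
        return int(os.environ["MASTER_PORT"])


trainer = Trainer(plugins=[MyClusterEnvironment()])
```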