Skip to content

Commit c8d30a0

Browse files
authored
Merge branch 'master' into docs/chlog_post_173
2 parents abd8d7b + f357417 commit c8d30a0

File tree

77 files changed

+683
-855
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+683
-855
lines changed

.azure/app-cloud-e2e.yml

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ variables:
2424

2525
jobs:
2626
- job: App_cloud_e2e_testing
27-
pool:
28-
vmImage: 'ubuntu-latest'
27+
pool: azure-cpus
28+
container:
29+
image: mcr.microsoft.com/playwright/python:v1.25.2-focal
30+
options: "--shm-size=2g"
2931
timeoutInMinutes: "30"
3032
cancelTimeoutInMinutes: "2"
3133
strategy:
@@ -56,6 +58,7 @@ jobs:
5658
clean: all
5759
steps:
5860
- bash: |
61+
whoami
5962
python --version
6063
pip --version
6164
displayName: 'Info'
@@ -80,10 +83,10 @@ jobs:
8083

8184
- bash: |
8285
python -m pip install playwright
83-
python -m playwright install --with-deps
86+
python -m playwright install # --with-deps
8487
displayName: 'Install Playwright system dependencies'
8588
86-
- bash: pip install -e .
89+
- bash: pip install -e . --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
8790
displayName: 'Install lightning'
8891

8992
- bash: |
@@ -110,12 +113,12 @@ jobs:
110113
TEST_APP_NAME: $(name)
111114
HAR_LOCATION: './artifacts/hars'
112115
SLOW_MO: '50'
113-
LAI_USER: $(LAI_USER)
114-
LAI_PASS: $(LAI_PASS)
115-
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
116-
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
116+
# LAI_USER: $(LAI_USER)
117+
# LAI_PASS: $(LAI_PASS)
118+
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
119+
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
117120
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
118-
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
121+
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
119122
displayName: 'Run the tests'
120123
121124
- publish: '$(Build.ArtifactStagingDirectory)/videos'
@@ -125,16 +128,16 @@ jobs:
125128
- bash: |
126129
time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()"
127130
env:
128-
LAI_USER: $(LAI_USER)
129-
LAI_PASS: $(LAI_PASS)
130-
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
131-
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
131+
# LAI_USER: $(LAI_USER)
132+
# LAI_PASS: $(LAI_PASS)
133+
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
134+
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
132135
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
133-
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
136+
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
134137
PR_NUMBER: $(local_id)
135138
TEST_APP_NAME: $(name)
136-
GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
137-
GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
138-
GRID_URL: $(LIGHTNING_CLOUD_URL)
139-
_GRID_USERNAME: $(LIGHTNING_USERNAME)
139+
# GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
140+
# GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
141+
# GRID_URL: $(LIGHTNING_CLOUD_URL)
142+
# _GRID_USERNAME: $(LIGHTNING_USERNAME)
140143
displayName: 'Clean Previous Apps'

.azure/gpu-tests.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,15 @@ jobs:
123123
timeoutInMinutes: "35"
124124
condition: eq(variables['continue'], '1')
125125

126+
- bash: bash run_standalone_tasks.sh
127+
workingDirectory: tests/tests_pytorch
128+
env:
129+
PL_USE_MOCKED_MNIST: "1"
130+
PL_RUN_CUDA_TESTS: "1"
131+
displayName: 'Testing: PyTorch standalone tasks'
132+
timeoutInMinutes: "10"
133+
condition: eq(variables['continue'], '1')
134+
126135
- bash: |
127136
python -m coverage report
128137
python -m coverage xml

.azure/hpu-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
pip --version
4646
sudo pip uninstall -y lightning pytorch-lightning
4747
pip install fire
48-
python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext
48+
python .actions/assistant.py requirements-prune-pkgs torch,torchvision
4949
pip install ".[extra,test]"
5050
pip list
5151
env:

.github/workflows/ci-pytorch-dockers.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ jobs:
7575
push: ${{ env.PUSH_TO_HUB }}
7676
tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}
7777
timeout-minutes: 60
78-
- uses: ravsamhq/notify-slack-action@v1
78+
- uses: ravsamhq/notify-slack-action@v2
7979
if: failure() && env.PUSH_TO_HUB == 'true'
8080
with:
8181
status: ${{ job.status }}
@@ -117,7 +117,7 @@ jobs:
117117
push: ${{ env.PUSH_TO_HUB }}
118118
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
119119
timeout-minutes: 95
120-
- uses: ravsamhq/notify-slack-action@v1
120+
- uses: ravsamhq/notify-slack-action@v2
121121
if: failure() && env.PUSH_TO_HUB == 'true'
122122
with:
123123
status: ${{ job.status }}
@@ -155,7 +155,7 @@ jobs:
155155
push: ${{ env.PUSH_TO_HUB }}
156156
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
157157
timeout-minutes: 95
158-
- uses: ravsamhq/notify-slack-action@v1
158+
- uses: ravsamhq/notify-slack-action@v2
159159
if: failure() && env.PUSH_TO_HUB == 'true'
160160
with:
161161
status: ${{ job.status }}
@@ -199,7 +199,7 @@ jobs:
199199
push: ${{ env.PUSH_TO_HUB }}
200200
tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }}
201201
timeout-minutes: 10
202-
- uses: ravsamhq/notify-slack-action@v1
202+
- uses: ravsamhq/notify-slack-action@v2
203203
if: failure() && env.PUSH_TO_HUB == 'true'
204204
with:
205205
status: ${{ job.status }}
@@ -235,7 +235,7 @@ jobs:
235235
push: ${{ env.PUSH_TO_HUB }}
236236
tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}
237237
timeout-minutes: 10
238-
- uses: ravsamhq/notify-slack-action@v1
238+
- uses: ravsamhq/notify-slack-action@v2
239239
if: failure() && env.PUSH_TO_HUB == 'true'
240240
with:
241241
status: ${{ job.status }}

.github/workflows/events-nightly.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
# report failure to Slack
4949
- name: Slack notification
5050
if: failure() && github.event_name == 'schedule'
51-
uses: ravsamhq/notify-slack-action@v1
51+
uses: ravsamhq/notify-slack-action@v2
5252
with:
5353
status: ${{ job.status }}
5454
token: ${{ secrets.GITHUB_TOKEN }}

dockers/base-conda/Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,11 @@ RUN \
7878
conda update -n base -c defaults conda && \
7979
CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \
8080
conda create -y --name $CONDA_ENV \
81-
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \
81+
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \
8282
-c nvidia -c pytorch -c pytorch-test && \
8383
conda init bash && \
8484
# NOTE: this requires that the channel is presented in the yaml before packages \
85-
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
85+
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
8686
python prune.py && \
8787
rm prune.py && \
8888
cat environment.yml && \
@@ -102,7 +102,7 @@ RUN \
102102
pip list | grep torch && \
103103
python -c "import torch; print(torch.__version__)" && \
104104
pip install -q fire && \
105-
python assistant.py requirements_prune_pkgs torch,torchvision,torchtext && \
105+
python assistant.py requirements_prune_pkgs torch,torchvision && \
106106
# Install remaining requirements
107107
pip install --no-cache-dir -r requirements/pytorch/base.txt \
108108
-r requirements/pytorch/extra.txt \

docs/source-pytorch/advanced/model_parallel.rst

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -212,14 +212,31 @@ PyTorch Fully Sharded Training
212212
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
213213

214214
PyTorch has its own version of `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_ which is upstreamed from their `fairscale <https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html>`__ project.
215-
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_. The API is pretty similar to that of FairScale.
215+
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_ but it is recommended to use it with PyTorch v1.12 or later, and that's what
216+
Lightning supports. The API is pretty similar to that of FairScale.
216217

217-
.. note::
218-
Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``.
219-
This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``.
220-
This is a limitation of Fully Sharded Training that will be resolved in the future.
221218

222-
To activate parameter sharding, you must wrap your model using the``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
219+
Auto Wrapping
220+
"""""""""""""
221+
Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The
222+
simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't
223+
have to ``wrap`` layers manually as in the case of manual wrapping.
224+
225+
.. code-block:: python
226+
227+
model = BoringModel()
228+
trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16)
229+
trainer.fit(model)
230+
231+
232+
Read more `here <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/#auto-wrapping>`__.
233+
234+
235+
Manual Wrapping
236+
"""""""""""""""
237+
238+
Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate
239+
parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
223240

224241
When not using Fully Sharded these wrap functions are a no-op. This means once the changes have been made, there is no need to remove the changes for other strategies.
225242

docs/source-pytorch/common/checkpointing_intermediate.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ What
120120
Where
121121
=====
122122

123-
- It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`.
123+
- By default, the ``ModelCheckpoint`` will save files into the ``Trainer.log_dir``. It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`.
124124

125125
|
126126

docs/source-pytorch/common/lightning_module.rst

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1345,18 +1345,6 @@ load_from_checkpoint
13451345
.. automethod:: pytorch_lightning.core.module.LightningModule.load_from_checkpoint
13461346
:noindex:
13471347

1348-
on_hpc_save
1349-
~~~~~~~~~~~
1350-
1351-
.. automethod:: pytorch_lightning.core.module.LightningModule.on_hpc_save
1352-
:noindex:
1353-
1354-
on_hpc_load
1355-
~~~~~~~~~~~
1356-
1357-
.. automethod:: pytorch_lightning.core.module.LightningModule.on_hpc_load
1358-
:noindex:
1359-
13601348
on_train_start
13611349
~~~~~~~~~~~~~~
13621350

docs/source-pytorch/common/trainer.rst

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,3 +1745,63 @@ execution within that function, and the status of the Trainer.
17451745
trainer.state.status
17461746
# stage in ("train", "sanity_check", "validate", "test", "predict", "tune")
17471747
trainer.state.stage
1748+
1749+
should_stop
1750+
***********
1751+
1752+
If you want to terminate the training during ``.fit``, you can set ``trainer.should_stop=True`` to terminate the training
1753+
as soon as possible. Note that it will respect the arguments ``min_steps`` and ``min_epochs`` to check whether to stop. If these
1754+
arguments are set and the ``current_epoch`` or ``global_step`` don't meet these minimum conditions, training will continue until
1755+
both conditions are met. If any of these arguments is not set, it won't be considered for the final decision.
1756+
1757+
1758+
.. code-block:: python
1759+
1760+
# setting `trainer.should_stop` at any point of training will terminate it
1761+
class LitModel(LightningModule):
1762+
def training_step(self, *args, **kwargs):
1763+
self.trainer.should_stop = True
1764+
1765+
1766+
trainer = Trainer()
1767+
model = LitModel()
1768+
trainer.fit(model)
1769+
1770+
.. code-block:: python
1771+
1772+
# setting `trainer.should_stop` will stop training only after at least 5 epochs have run
1773+
class LitModel(LightningModule):
1774+
def training_step(self, *args, **kwargs):
1775+
if self.current_epoch == 2:
1776+
self.trainer.should_stop = True
1777+
1778+
1779+
trainer = Trainer(min_epochs=5, max_epochs=100)
1780+
model = LitModel()
1781+
trainer.fit(model)
1782+
1783+
.. code-block:: python
1784+
1785+
# setting `trainer.should_stop` will stop training only after at least 5 steps have run
1786+
class LitModel(LightningModule):
1787+
def training_step(self, *args, **kwargs):
1788+
if self.global_step == 2:
1789+
self.trainer.should_stop = True
1790+
1791+
1792+
trainer = Trainer(min_steps=5, max_epochs=100)
1793+
model = LitModel()
1794+
trainer.fit(model)
1795+
1796+
.. code-block:: python
1797+
1798+
# setting `trainer.should_stop` at any point will stop training only after both min_steps and min_epochs are satisfied
1799+
class LitModel(LightningModule):
1800+
def training_step(self, *args, **kwargs):
1801+
if self.global_step == 7:
1802+
self.trainer.should_stop = True
1803+
1804+
1805+
trainer = Trainer(min_steps=5, min_epochs=5, max_epochs=100)
1806+
model = LitModel()
1807+
trainer.fit(model)

0 commit comments

Comments
 (0)