Skip to content

Commit c8d30a0

Browse files
authored
Merge branch 'master' into docs/chlog_post_173
2 parents abd8d7b + f357417 commit c8d30a0

File tree

77 files changed

+683
-855
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+683
-855
lines changed

.azure/app-cloud-e2e.yml

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ variables:
2424

2525
jobs:
2626
- job: App_cloud_e2e_testing
27-
pool:
28-
vmImage: 'ubuntu-latest'
27+
pool: azure-cpus
28+
container:
29+
image: mcr.microsoft.com/playwright/python:v1.25.2-focal
30+
options: "--shm-size=2g"
2931
timeoutInMinutes: "30"
3032
cancelTimeoutInMinutes: "2"
3133
strategy:
@@ -56,6 +58,7 @@ jobs:
5658
clean: all
5759
steps:
5860
- bash: |
61+
whoami
5962
python --version
6063
pip --version
6164
displayName: 'Info'
@@ -80,10 +83,10 @@ jobs:
8083

8184
- bash: |
8285
python -m pip install playwright
83-
python -m playwright install --with-deps
86+
python -m playwright install # --with-deps
8487
displayName: 'Install Playwright system dependencies'
8588
86-
- bash: pip install -e .
89+
- bash: pip install -e . --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
8790
displayName: 'Install lightning'
8891

8992
- bash: |
@@ -110,12 +113,12 @@ jobs:
110113
TEST_APP_NAME: $(name)
111114
HAR_LOCATION: './artifacts/hars'
112115
SLOW_MO: '50'
113-
LAI_USER: $(LAI_USER)
114-
LAI_PASS: $(LAI_PASS)
115-
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
116-
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
116+
# LAI_USER: $(LAI_USER)
117+
# LAI_PASS: $(LAI_PASS)
118+
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
119+
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
117120
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
118-
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
121+
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
119122
displayName: 'Run the tests'
120123
121124
- publish: '$(Build.ArtifactStagingDirectory)/videos'
@@ -125,16 +128,16 @@ jobs:
125128
- bash: |
126129
time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()"
127130
env:
128-
LAI_USER: $(LAI_USER)
129-
LAI_PASS: $(LAI_PASS)
130-
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID)
131-
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY)
131+
# LAI_USER: $(LAI_USER)
132+
# LAI_PASS: $(LAI_PASS)
133+
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
134+
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
132135
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
133-
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL)
136+
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
134137
PR_NUMBER: $(local_id)
135138
TEST_APP_NAME: $(name)
136-
GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
137-
GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
138-
GRID_URL: $(LIGHTNING_CLOUD_URL)
139-
_GRID_USERNAME: $(LIGHTNING_USERNAME)
139+
# GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning
140+
# GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning
141+
# GRID_URL: $(LIGHTNING_CLOUD_URL)
142+
# _GRID_USERNAME: $(LIGHTNING_USERNAME)
140143
displayName: 'Clean Previous Apps'

.azure/gpu-tests.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,15 @@ jobs:
123123
timeoutInMinutes: "35"
124124
condition: eq(variables['continue'], '1')
125125

126+
- bash: bash run_standalone_tasks.sh
127+
workingDirectory: tests/tests_pytorch
128+
env:
129+
PL_USE_MOCKED_MNIST: "1"
130+
PL_RUN_CUDA_TESTS: "1"
131+
displayName: 'Testing: PyTorch standalone tasks'
132+
timeoutInMinutes: "10"
133+
condition: eq(variables['continue'], '1')
134+
126135
- bash: |
127136
python -m coverage report
128137
python -m coverage xml

.azure/hpu-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
pip --version
4646
sudo pip uninstall -y lightning pytorch-lightning
4747
pip install fire
48-
python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext
48+
python .actions/assistant.py requirements-prune-pkgs torch,torchvision
4949
pip install ".[extra,test]"
5050
pip list
5151
env:

.github/workflows/ci-pytorch-dockers.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ jobs:
7575
push: ${{ env.PUSH_TO_HUB }}
7676
tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}
7777
timeout-minutes: 60
78-
- uses: ravsamhq/notify-slack-action@v1
78+
- uses: ravsamhq/notify-slack-action@v2
7979
if: failure() && env.PUSH_TO_HUB == 'true'
8080
with:
8181
status: ${{ job.status }}
@@ -117,7 +117,7 @@ jobs:
117117
push: ${{ env.PUSH_TO_HUB }}
118118
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
119119
timeout-minutes: 95
120-
- uses: ravsamhq/notify-slack-action@v1
120+
- uses: ravsamhq/notify-slack-action@v2
121121
if: failure() && env.PUSH_TO_HUB == 'true'
122122
with:
123123
status: ${{ job.status }}
@@ -155,7 +155,7 @@ jobs:
155155
push: ${{ env.PUSH_TO_HUB }}
156156
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
157157
timeout-minutes: 95
158-
- uses: ravsamhq/notify-slack-action@v1
158+
- uses: ravsamhq/notify-slack-action@v2
159159
if: failure() && env.PUSH_TO_HUB == 'true'
160160
with:
161161
status: ${{ job.status }}
@@ -199,7 +199,7 @@ jobs:
199199
push: ${{ env.PUSH_TO_HUB }}
200200
tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }}
201201
timeout-minutes: 10
202-
- uses: ravsamhq/notify-slack-action@v1
202+
- uses: ravsamhq/notify-slack-action@v2
203203
if: failure() && env.PUSH_TO_HUB == 'true'
204204
with:
205205
status: ${{ job.status }}
@@ -235,7 +235,7 @@ jobs:
235235
push: ${{ env.PUSH_TO_HUB }}
236236
tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}
237237
timeout-minutes: 10
238-
- uses: ravsamhq/notify-slack-action@v1
238+
- uses: ravsamhq/notify-slack-action@v2
239239
if: failure() && env.PUSH_TO_HUB == 'true'
240240
with:
241241
status: ${{ job.status }}

.github/workflows/events-nightly.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
# report failure to Slack
4949
- name: Slack notification
5050
if: failure() && github.event_name == 'schedule'
51-
uses: ravsamhq/notify-slack-action@v1
51+
uses: ravsamhq/notify-slack-action@v2
5252
with:
5353
status: ${{ job.status }}
5454
token: ${{ secrets.GITHUB_TOKEN }}

dockers/base-conda/Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,11 @@ RUN \
7878
conda update -n base -c defaults conda && \
7979
CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \
8080
conda create -y --name $CONDA_ENV \
81-
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \
81+
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \
8282
-c nvidia -c pytorch -c pytorch-test && \
8383
conda init bash && \
8484
# NOTE: this requires that the channel is presented in the yaml before packages \
85-
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
85+
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
8686
python prune.py && \
8787
rm prune.py && \
8888
cat environment.yml && \
@@ -102,7 +102,7 @@ RUN \
102102
pip list | grep torch && \
103103
python -c "import torch; print(torch.__version__)" && \
104104
pip install -q fire && \
105-
python assistant.py requirements_prune_pkgs torch,torchvision,torchtext && \
105+
python assistant.py requirements_prune_pkgs torch,torchvision && \
106106
# Install remaining requirements
107107
pip install --no-cache-dir -r requirements/pytorch/base.txt \
108108
-r requirements/pytorch/extra.txt \

docs/source-pytorch/advanced/model_parallel.rst

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -212,14 +212,31 @@ PyTorch Fully Sharded Training
212212
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
213213

214214
PyTorch has its own version of `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_ which is upstreamed from their `fairscale <https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html>`__ project.
215-
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_. The API is pretty similar to that of FairScale.
215+
It was introduced in their `v1.11.0 release <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`_ but it is recommended to use it with PyTorch v1.12 or later, and that's what
216+
Lightning supports. The API is pretty similar to that of FairScale.
216217

217-
.. note::
218-
Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``.
219-
This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``.
220-
This is a limitation of Fully Sharded Training that will be resolved in the future.
221218

222-
To activate parameter sharding, you must wrap your model using the``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
219+
Auto Wrapping
220+
"""""""""""""
221+
Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The
222+
simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't
223+
have to ``wrap`` layers manually as in the case of manual wrapping.
224+
225+
.. code-block:: python
226+
227+
model = BoringModel()
228+
trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16)
229+
trainer.fit(model)
230+
231+
232+
Read more `here <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/#auto-wrapping>`__.
233+
234+
235+
Manual Wrapping
236+
"""""""""""""""
237+
238+
Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate
239+
parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly.
223240

224241
When not using Fully Sharded these wrap functions are a no-op. This means once the changes have been made, there is no need to remove the changes for other strategies.
225242

docs/source-pytorch/common/checkpointing_intermediate.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ What
120120
Where
121121
=====
122122

123-
- It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`.
123+
- By default, the ``ModelCheckpoint`` will save files into the ``Trainer.log_dir``. It gives you the ability to specify the ``dirpath`` and ``filename`` for your checkpoints. Filename can also be dynamic so you can inject the metrics that are being logged using :meth:`~pytorch_lightning.core.module.LightningModule.log`.
124124

125125
|
126126

docs/source-pytorch/common/lightning_module.rst

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1345,18 +1345,6 @@ load_from_checkpoint
13451345
.. automethod:: pytorch_lightning.core.module.LightningModule.load_from_checkpoint
13461346
:noindex:
13471347

1348-
on_hpc_save
1349-
~~~~~~~~~~~
1350-
1351-
.. automethod:: pytorch_lightning.core.module.LightningModule.on_hpc_save
1352-
:noindex:
1353-
1354-
on_hpc_load
1355-
~~~~~~~~~~~
1356-
1357-
.. automethod:: pytorch_lightning.core.module.LightningModule.on_hpc_load
1358-
:noindex:
1359-
13601348
on_train_start
13611349
~~~~~~~~~~~~~~
13621350

docs/source-pytorch/common/trainer.rst

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,3 +1745,63 @@ execution within that function, and the status of the Trainer.
17451745
trainer.state.status
17461746
# stage in ("train", "sanity_check", "validate", "test", "predict", "tune")
17471747
trainer.state.stage
1748+
1749+
should_stop
1750+
***********
1751+
1752+
If you want to terminate the training during ``.fit``, you can set ``trainer.should_stop=True`` to terminate the training
1753+
as soon as possible. Note that it will respect the arguments ``min_steps`` and ``min_epochs`` to check whether to stop. If these
1754+
arguments are set and the ``current_epoch`` or ``global_step`` don't meet these minimum conditions, training will continue until
1755+
both conditions are met. If any of these arguments is not set, it won't be considered for the final decision.
1756+
1757+
1758+
.. code-block:: python
1759+
1760+
# setting `trainer.should_stop` at any point of training will terminate it
1761+
class LitModel(LightningModule):
1762+
def training_step(self, *args, **kwargs):
1763+
self.trainer.should_stop = True
1764+
1765+
1766+
trainer = Trainer()
1767+
model = LitModel()
1768+
trainer.fit(model)
1769+
1770+
.. code-block:: python
1771+
1772+
# setting `trainer.should_stop` will stop training only after at least 5 epochs have run
1773+
class LitModel(LightningModule):
1774+
def training_step(self, *args, **kwargs):
1775+
if self.current_epoch == 2:
1776+
self.trainer.should_stop = True
1777+
1778+
1779+
trainer = Trainer(min_epochs=5, max_epochs=100)
1780+
model = LitModel()
1781+
trainer.fit(model)
1782+
1783+
.. code-block:: python
1784+
1785+
# setting `trainer.should_stop` will stop training only after at least 5 steps have run
1786+
class LitModel(LightningModule):
1787+
def training_step(self, *args, **kwargs):
1788+
if self.global_step == 2:
1789+
self.trainer.should_stop = True
1790+
1791+
1792+
trainer = Trainer(min_steps=5, max_epochs=100)
1793+
model = LitModel()
1794+
trainer.fit(model)
1795+
1796+
.. code-block:: python
1797+
1798+
# setting `trainer.should_stop` at any point will stop training only after both min_steps and min_epochs are satisfied
1799+
class LitModel(LightningModule):
1800+
def training_step(self, *args, **kwargs):
1801+
if self.global_step == 7:
1802+
self.trainer.should_stop = True
1803+
1804+
1805+
trainer = Trainer(min_steps=5, min_epochs=5, max_epochs=100)
1806+
model = LitModel()
1807+
trainer.fit(model)

0 commit comments

Comments
 (0)