Lightning-AI
diff --git a/‎.azure/gpu-tests.yml‎
Lines changed: 9 additions & 0 deletions b/‎.azure/gpu-tests.yml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎.azure/hpu-tests.yml‎
Lines changed: 1 addition & 1 deletion b/‎.azure/hpu-tests.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.circleci/config.yml‎
Lines changed: 3 additions & 7 deletions b/‎.circleci/config.yml‎
Lines changed: 3 additions & 7 deletions
diff --git a/‎.github/ISSUE_TEMPLATE/bug_report.md‎
Lines changed: 3 additions & 0 deletions b/‎.github/ISSUE_TEMPLATE/bug_report.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎dockers/tpu-tests/tpu_test_cases.jsonnet‎
Lines changed: 5 additions & 10 deletions b/‎dockers/tpu-tests/tpu_test_cases.jsonnet‎
Lines changed: 5 additions & 10 deletions
diff --git a/‎docs/source-pytorch/accelerators/ipu_basic.rst‎
Lines changed: 5 additions & 4 deletions b/‎docs/source-pytorch/accelerators/ipu_basic.rst‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎requirements/pytorch/base.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/pytorch/base.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements/pytorch/extra.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/pytorch/extra.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/pytorch_lightning/CHANGELOG.md‎
Lines changed: 16 additions & 4 deletions b/‎src/pytorch_lightning/CHANGELOG.md‎
Lines changed: 16 additions & 4 deletions
@@ -117,6 +117,15 @@ jobs:
       timeoutInMinutes: "35"
       condition: eq(variables['continue'], '1')
 
+    - bash: bash run_standalone_tasks.sh
+      workingDirectory: tests/tests_pytorch
+      env:
+        PL_USE_MOCKED_MNIST: "1"
+        PL_RUN_CUDA_TESTS: "1"
+      displayName: 'Testing: PyTorch standalone tasks'
+      timeoutInMinutes: "10"
+      condition: eq(variables['continue'], '1')
+
     - bash: |
         python -m coverage report
         python -m coverage xml
 
@@ -84,7 +84,7 @@ jobs:
 
     - task: PublishTestResults@2
       inputs:
-        testResultsFiles: 'hpu*_test-results.xml'
+        testResultsFiles: 'tests/tests_pytorch/hpu*_test-results.xml'
         testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
       condition: succeededOrFailed()
       displayName: 'Publish test results'
@@ -81,6 +81,8 @@ references:
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
+       pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
+       echo "GKE pod name: $pod_name"
        echo "Waiting on kubernetes job: $job_name"
        i=0 && \
        # N checks spaced 30s apart = 900s total.
@@ -92,8 +94,6 @@ references:
        printf "Waiting for job to finish: " && \
        while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
        echo "Done waiting. Job status code: $status_code" && \
-       pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
-       echo "GKE pod name: $pod_name" && \
        kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
        if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
        # First portion is the test logs. Print these to Github Action stdout.
@@ -106,10 +106,6 @@ references:
      name: Statistics
      command: |
        mv ./xx01 coverage.xml
-       # TODO: add human readable report
-       cat coverage.xml
-       sudo pip install pycobertura
-       pycobertura show coverage.xml
 
 jobs:
 
@@ -119,7 +115,7 @@ jobs:
     environment:
       - XLA_VER: 1.9
       - PYTHON_VER: 3.7
-      - MAX_CHECKS: 240
+      - MAX_CHECKS: 1000
       - CHECK_SPEEP: 5
     steps:
       - checkout
 
@@ -46,14 +46,17 @@ python collect_env_details.py
 You can also fill out the list below manually.
 -->
 
+- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow):
 - PyTorch Lightning Version (e.g., 1.5.0):
+- Lightning App Version (e.g., 0.5.2):
 - PyTorch Version (e.g., 1.10):
 - Python version (e.g., 3.9):
 - OS (e.g., Linux):
 - CUDA/cuDNN version:
 - GPU models and configuration:
 - How you installed PyTorch (`conda`, `pip`, source):
 - If compiling from source, the output of `torch.__config__.show()`:
+- Running environment of LightningApp (e.g. local, cloud):
 - Any other relevant information:
 
 ### Additional context
 
@@ -8,7 +8,7 @@ local tputests = base.BaseTest {
   mode: 'postsubmit',
   configMaps: [],
 
-  timeout: 1200, # 20 minutes, in seconds.
+  timeout: 6000, # 100 minutes, in seconds.
 
   image: 'pytorchlightning/pytorch_lightning',
   imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}',
@@ -34,16 +34,11 @@ local tputests = base.BaseTest {
       pip install -e .[test]
       echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
       export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
+      export PL_RUN_TPU_TESTS=1
       cd tests/tests_pytorch
-      echo $PWD
-      # TODO (@kaushikb11): Add device stats tests here
-      coverage run --source pytorch_lightning -m pytest -v --capture=no \
-          strategies/test_tpu_spawn.py \
-          profilers/test_xla_profiler.py \
-          accelerators/test_tpu.py \
-          models/test_tpu.py \
-          plugins/environments/test_xla_environment.py \
-          utilities/test_xla_device_utils.py
+      coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
+      echo "\n||| Running standalone tests |||\n"
+      bash run_standalone_tests.sh -b 1
       test_exit_code=$?
       echo "\n||| END PYTEST LOGS |||\n"
       coverage xml
 
@@ -62,7 +62,8 @@ Currently there are some known limitations that are being addressed in the near
 
 Please see the `MNIST example <https://github.com/Lightning-AI/lightning/blob/master/examples/pl_ipu/mnist_sample.py>`__ which displays most of the limitations and how to overcome them till they are resolved.
 
-* ``self.log`` is not supported in the ``training_step``, ``validation_step``, ``test_step`` or ``predict_step``. This is due to the step function being traced and sent to the IPU devices. We're actively working on fixing this
-* Multiple optimizers are not supported. ``training_step`` only supports returning one loss from the ``training_step`` function as a result
-* Since the step functions are traced, branching logic or any form of primitive values are traced into constants. Be mindful as this could lead to errors in your custom code
-* Clipping gradients is not supported
+* ``self.log`` is not supported in the ``training_step``, ``validation_step``, ``test_step`` or ``predict_step``. This is due to the step function being traced and sent to the IPU devices. We're actively working on fixing this.
+* Multiple optimizers are not supported. ``training_step`` only supports returning one loss from the ``training_step`` function as a result.
+* Since the step functions are traced, branching logic or any form of primitive values are traced into constants. Be mindful as this could lead to errors in your custom code.
+* Clipping gradients is not supported.
+* It is not possible to use :class:`torch.utils.data.BatchSampler` in your dataloaders if you are using multiple IPUs.
@@ -15,10 +15,12 @@ profile = "black"
 line_length = 120
 force_sort_within_sections = "False"
 order_by_type = "False"
+skip = ["_notebooks"]
 
 
 [tool.black]
 line-length = 120
+exclude = '(_notebooks/.*)'
 
 
 [tool.mypy]
@@ -61,7 +63,6 @@ module = [
     "pytorch_lightning.profilers.simple",
     "pytorch_lightning.strategies.ddp",
     "pytorch_lightning.strategies.ddp_spawn",
-    "pytorch_lightning.strategies.deepspeed",
     "pytorch_lightning.strategies.fully_sharded",
     "pytorch_lightning.strategies.ipu",
     "pytorch_lightning.strategies.sharded",
 
@@ -4,7 +4,7 @@ tqdm>=4.57.0, <=4.63.0
 PyYAML>=5.4, <=6.0
 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0
 tensorboard>=2.9.1, <2.10.0
-torchmetrics>=0.7.0, <0.9.2  # needed for using fixed compare_version
+torchmetrics>=0.7.0, <0.9.3  # needed for using fixed compare_version
 pyDeprecate>=0.3.1, <=0.3.2
 packaging>=17.0, <=21.3
 typing-extensions>=4.0.0, <4.3.1
@@ -1,6 +1,6 @@
 # extended list of package dependencies to reach full functionality
 matplotlib>3.1, <3.5.3
-torchtext>=0.10.*, <=0.12.0
+torchtext>=0.10.*, <0.14.0
 omegaconf>=2.0.5, <2.3.0
 hydra-core>=1.0.5, <1.3.0
 jsonargparse[signatures]>=4.12.0, <=4.12.0
 
@@ -167,7 +167,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Updated Habana Accelerator's `auto_device_count`, `is_available` & `get_device_name` methods based on the latest torch habana package ([#13423](https://github.com/PyTorchLightning/pytorch-lightning/pull/13423))
 
 
--
+- Disallowed using `BatchSampler` when running on multiple IPUs ([#13854](https://github.com/PyTorchLightning/pytorch-lightning/pull/13854))
 
 
 ### Deprecated
@@ -348,6 +348,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Improved support for custom `DataLoader`s when instantiated in `*_dataloader` hook ([#12981](https://github.com/PyTorchLightning/pytorch-lightning/pull/12981))
 
+- Allowed custom `BatchSampler`s when instantiated in `*_dataloader` hook ([#13640](https://github.com/PyTorchLightning/pytorch-lightning/pull/13640))
+
 
 - Fixed an issue with unsupported torch.inference_mode() on hpu backends by making it use no_grad ([#13014](https://github.com/PyTorchLightning/pytorch-lightning/pull/13014))
 
@@ -379,6 +381,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed main progress bar counter when `val_check_interval=int` and `check_val_every_n_epoch=None` ([#12832](https://github.com/Lightning-AI/lightning/pull/12832))
 
 
+- Improved support for custom `ReduceLROnPlateau` scheduler if `reduce_on_plateau` is set by the user in scheduler config ([#13838](https://github.com/Lightning-AI/lightning/pull/13838))
+
+
+- Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/Lightning-AI/lightning/pull/13645))
+
+
+- Fixed error handling in learning rate finder when not enough data points are available to give a good suggestion ([#13845](https://github.com/Lightning-AI/lightning/pull/13845))
+
+
+- Fixed an issue that caused the learning rate finder to set the model's learning rate to None when no suggestion was possible ([#13845](https://github.com/Lightning-AI/lightning/pull/13845))
+
+
+
 ## [1.6.5] - 2022-07-13
 
 ### Fixed
@@ -389,9 +404,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed the restoration of log step during restart ([#13467](https://github.com/PyTorchLightning/pytorch-lightning/pull/13467))
 
 
-- Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/PyTorchLightning/pytorch-lightning/pull/13645))
-
-
 ## [1.6.4] - 2022-06-01
 
 ### Added