
Commit 72cfe50

Merge branch 'master' into hooks/outputs
2 parents: 4832b56 + 8dabc30

File tree: 207 files changed (+3661 / -2709 lines)


.github/workflows/ci_test-conda.yml

Lines changed: 3 additions & 8 deletions

@@ -27,18 +27,13 @@ jobs:
       run: |
         conda info
         conda list
+        # adjust versions according to the installed Torch version
+        python ./requirements/adjust_versions.py requirements/extra.txt
+        python ./requirements/adjust_versions.py requirements/examples.txt
         pip install --requirement requirements/devel.txt --upgrade-strategy only-if-needed
         pip list

-    - name: Cache datasets
-      # todo: this probably does not work with docker images, rather cache dockers
-      uses: actions/cache@v2
-      with:
-        path: Datasets
-        key: pl-dataset
-
     - name: Pull checkpoints from S3
-      # todo: consider adding coma caching, but ATM all models have less than 100KB
       run: |
         # enter legacy and update checkpoints from S3
         cd legacy
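For context, the new `adjust_versions.py` step rewrites requirement pins so that companion packages match the Torch build already present in the environment (here, the one conda installed). The actual script ships in `requirements/` and is not part of this diff; a minimal sketch of the idea, with a hypothetical Torch-to-torchvision mapping, might look like:

# Minimal sketch of a version-adjustment script; the real
# requirements/adjust_versions.py is not shown in this diff and may differ.
import re
import sys

import torch

# hypothetical mapping: installed Torch minor version -> compatible torchvision pin
TORCH_TO_TORCHVISION = {"1.8": "0.9.*", "1.7": "0.8.*", "1.6": "0.7.*"}


def adjust(requirements_path: str) -> None:
    # read the minor version of the Torch build that is already installed
    torch_minor = ".".join(torch.__version__.split(".")[:2])
    pin = TORCH_TO_TORCHVISION.get(torch_minor)
    if pin is None:
        return  # unknown Torch version: leave the requirements file untouched
    text = open(requirements_path).read()
    # rewrite any torchvision requirement line to the compatible pinned version
    text = re.sub(r"torchvision[^\n]*", f"torchvision=={pin}", text)
    open(requirements_path, "w").write(text)


if __name__ == "__main__":
    adjust(sys.argv[1])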

.github/workflows/ci_test-full.yml

Lines changed: 11 additions & 4 deletions

@@ -104,11 +104,14 @@ jobs:
         HOROVOD_WITHOUT_MXNET: 1
         HOROVOD_WITHOUT_TENSORFLOW: 1
       run: |
-        # python -m pip install --upgrade --user pip
-        pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade
-        pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade
         python --version
         pip --version
+        # python -m pip install --upgrade --user pip
+        pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
+        # adjust versions according to the installed Torch version
+        python ./requirements/adjust_versions.py requirements/extra.txt
+        python ./requirements/adjust_versions.py requirements/examples.txt
+        pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
         pip list
       shell: bash

@@ -135,7 +138,11 @@ jobs:
     - name: Tests
       run: |
         # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003
-        coverage run --source pytorch_lightning -m pytest pytorch_lightning tests pl_examples -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml
+        coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml
+
+    - name: Examples
+      run: |
+        python -m pytest pl_examples -v --durations=10

     - name: Upload pytest test results
       uses: actions/upload-artifact@v2

.github/workflows/docs-checks.yml

Lines changed: 5 additions & 5 deletions

@@ -41,15 +41,15 @@ jobs:

     - name: Install dependencies
       run: |
+        python --version
+        pip --version
         # remove Horovod from requirements
         python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)"
         # python -m pip install --upgrade --user pip
         pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet
         pip install --requirement requirements/extra.txt
         pip install --requirement requirements/loggers.txt
         pip install --requirement requirements/docs.txt
-        python --version
-        pip --version
         pip list
       shell: bash

@@ -84,12 +84,12 @@ jobs:

     - name: Install dependencies
       run: |
-        pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet
+        python --version
+        pip --version
+        # pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet
         pip install --requirement requirements/docs.txt
         # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux
         sudo apt-get update && sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures
-        python --version
-        pip --version
         pip list
       shell: bash
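The inline `python -c` filter above strips Horovod from `requirements/extra.txt` before installing; the same one-liner in expanded, more readable form:

# Expanded form of the workflow's inline filter: drop any requirement
# line that starts with "horovod" before installing the extras.
fname = "requirements/extra.txt"
with open(fname) as fp:
    lines = [line for line in fp.readlines() if not line.startswith("horovod")]
with open(fname, "w") as fp:
    fp.writelines(lines)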

.github/workflows/events-nightly.yml

Lines changed: 0 additions & 2 deletions

@@ -102,8 +102,6 @@ jobs:
         id: extend

     - name: Publish CUDA to Docker Hub
-      # ToDo: extend also building for Nightly from pip
-      if: matrix.pytorch_version < 1.8
       # publish master/release
       uses: docker/build-push-action@v2
       with:

.github/workflows/events-recurent.yml renamed to .github/workflows/events-recurrent.yml

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-name: Recurent events
+name: Recurrent events

 # https://jasonet.co/posts/scheduled-actions/
 # https://github.community/t/distinct-job-for-each-schedule/17811/2

CHANGELOG.md

Lines changed: 76 additions & 2 deletions

@@ -17,17 +17,46 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072))

+- Added `RunningStage.SANITY_CHECKING` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))
+
+- Added `TrainerState.{FITTING,VALIDATING,TESTING,PREDICTING,TUNING}` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))
+
+- Added `LightningEnvironment` for Lightning-specific DDP ([#5915](https://github.com/PyTorchLightning/pytorch-lightning/pull/5915))
+
+- Added an argument to `self.log` that enables users to give custom names when dealing with multiple dataloaders ([#6274](https://github.com/PyTorchLightning/pytorch-lightning/pull/6274))
+
+- Added a no-return warning to `predict` ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139))
+
 ### Changed

+- Renamed `pytorch_lightning.callbacks.swa` to `pytorch_lightning.callbacks.stochastic_weight_avg` ([#6259](https://github.com/PyTorchLightning/pytorch-lightning/pull/6259))
+
+- Refactored `RunningStage` and `TrainerState` usage ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))
+
+- Changed `trainer.evaluating` to return `True` if validating or testing ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))
+
 ### Deprecated

+- Deprecated `trainer.running_sanity_check` in favor of `trainer.sanity_checking` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))
+
 ### Removed

 - Removed support for passing a bool value to the `profiler` argument of `Trainer` ([#6164](https://github.com/PyTorchLightning/pytorch-lightning/pull/6164))

+- Removed the no-return warning from val/test step ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139))
+
 - Removed passing a `ModelCheckpoint` instance to `Trainer(checkpoint_callback)` ([#6166](https://github.com/PyTorchLightning/pytorch-lightning/pull/6166))

@@ -48,6 +77,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed deprecated `LightningModule` `hparams` setter ([#6207](https://github.com/PyTorchLightning/pytorch-lightning/pull/6207))

+- Removed the `optimizer_idx` argument from `training_step` in manual optimization ([#6093](https://github.com/PyTorchLightning/pytorch-lightning/pull/6093))
+
 ### Fixed

 - Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/PyTorchLightning/pytorch-lightning/pull/6011))

@@ -59,16 +91,58 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Do not print top-k verbose log with `ModelCheckpoint(monitor=None)` ([#6109](https://github.com/PyTorchLightning/pytorch-lightning/pull/6109))

+- Fixed `ModelCheckpoint(monitor=None, save_last=True)` not saving checkpoints ([#6136](https://github.com/PyTorchLightning/pytorch-lightning/pull/6136))
+
+- Fixed `ModelCheckpoint(save_top_k=0, save_last=True)` not saving the `last` checkpoint ([#6136](https://github.com/PyTorchLightning/pytorch-lightning/pull/6136))
+
 - Expose DeepSpeed loss parameters to allow users to fix loss instability ([#6115](https://github.com/PyTorchLightning/pytorch-lightning/pull/6115))

-- Fixed epoch level schedulers not being called when `val_check_interval < 1.0` ([#6075](https://github.com/PyTorchLightning/pytorch-lightning/pull/6075))
+- Fixed `AttributeError` when `logger=None` on TPU ([#6221](https://github.com/PyTorchLightning/pytorch-lightning/pull/6221))

-- Fixed multiple early stopping callbacks ([#6197](https://github.com/PyTorchLightning/pytorch-lightning/pull/6197))
+- Fixed `ModelPruning(make_pruning_permanent=True)` pruning buffers getting removed when saved during training ([#6073](https://github.com/PyTorchLightning/pytorch-lightning/pull/6073))
+
+- Fixed `trainer.test` from `best_path` hanging after calling `trainer.fit` ([#6272](https://github.com/PyTorchLightning/pytorch-lightning/pull/6272))
+
+- Fixed duplicate logs appearing in the console when using the Python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))
+
+- Fixed `SingleTPU` calling `all_gather` ([#6296](https://github.com/PyTorchLightning/pytorch-lightning/pull/6296))
+
+- Fixed DP reduction with collection ([#6324](https://github.com/PyTorchLightning/pytorch-lightning/pull/6324))
+
+- Fixed PyTorch Profiler with `emit_nvtx` ([#6260](https://github.com/PyTorchLightning/pytorch-lightning/pull/6260))
+
+- Fixed `Trainer` not resetting `lightning_optimizers` when calling `Trainer.fit()` multiple times ([#6372](https://github.com/PyTorchLightning/pytorch-lightning/pull/6372))

+## [1.2.2] - 2021-03-02
+
+### Added
+
+- Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072))
+
+### Changed
+
+- Changed the order of `backward`, `step`, `zero_grad` to `zero_grad`, `backward`, `step` ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147))
+- Changed the default for DeepSpeed CPU Offload to `False`, due to prohibitively slow speeds at smaller scale ([#6262](https://github.com/PyTorchLightning/pytorch-lightning/pull/6262))
+
+### Fixed
+
+- Fixed epoch level schedulers not being called when `val_check_interval < 1.0` ([#6075](https://github.com/PyTorchLightning/pytorch-lightning/pull/6075))
+- Fixed multiple early stopping callbacks ([#6197](https://github.com/PyTorchLightning/pytorch-lightning/pull/6197))
+- Fixed incorrect usage of `detach()`, `cpu()`, `to()` ([#6216](https://github.com/PyTorchLightning/pytorch-lightning/pull/6216))
+- Fixed LBFGS optimizer support, which didn't converge in automatic optimization ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147))
 - Prevent `WandbLogger` from dropping values ([#5931](https://github.com/PyTorchLightning/pytorch-lightning/pull/5931))
+- Fixed an error thrown when using a valid distributed mode in multi-node setups ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))

 ## [1.2.1] - 2021-02-23
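Two of the 1.2.2 entries cite the same PR (#6147): the loop order in automatic optimization was changed to `zero_grad`, `backward`, `step`, and LBFGS convergence was fixed. For reference, a minimal sketch of the conventional closure-based PyTorch pattern that ordering corresponds to (the model and data here are placeholders, not Lightning internals):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.LBFGS(model.parameters())  # LBFGS re-evaluates the loss via a closure


def closure():
    # order from #6147: zero_grad -> backward -> step
    optimizer.zero_grad()
    loss = model(torch.randn(8, 4)).pow(2).mean()
    loss.backward()
    return loss


optimizer.step(closure)  # the optimizer drives the step, calling the closure as needed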

MANIFEST.in

Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@ recursive-include docs/source/_static/images/general/ pl_overview* tf_* tutorial

 # Include the Requirements
 recursive-include requirements *.txt
-recursive-exclude requirements *.sh
+recursive-exclude requirements *.sh *.py
 include requirements.txt
 include pyproject.toml
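With helper scripts such as `adjust_versions.py` now living under `requirements/`, broadening the exclusion to `*.sh *.py` plausibly keeps those scripts out of the source distribution while the `*.txt` requirement files stay included.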

README.md

Lines changed: 2 additions & 2 deletions

@@ -318,9 +318,9 @@ class LitAutoEncoder(pl.LightningModule):
         super().__init__()
         self.automatic_optimization = False

-    def training_step(self, batch, batch_idx, optimizer_idx):
+    def training_step(self, batch, batch_idx):
         # access your optimizers with use_pl_optimizer=False. Default is True
-        (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)
+        opt_a, opt_b = self.optimizers(use_pl_optimizer=True)

         loss_a = ...
         self.manual_backward(loss_a, opt_a)
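This README change tracks the removal of the `optimizer_idx` argument from `training_step` in manual optimization (#6093, listed in the CHANGELOG above). A self-contained sketch of the resulting pattern; the layers and loss here are illustrative placeholders, not the README's actual model:

import torch
import torch.nn.functional as F
import pytorch_lightning as pl


class LitAutoEncoder(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.automatic_optimization = False  # opt in to manual optimization
        self.encoder = torch.nn.Linear(28 * 28, 3)
        self.decoder = torch.nn.Linear(3, 28 * 28)

    def training_step(self, batch, batch_idx):  # no optimizer_idx anymore
        opt_a, opt_b = self.optimizers(use_pl_optimizer=True)
        x, _ = batch
        x = x.view(x.size(0), -1)

        # drive the first optimizer with a reconstruction loss
        loss_a = F.mse_loss(self.decoder(self.encoder(x)), x)
        opt_a.zero_grad()
        self.manual_backward(loss_a, opt_a)  # the 1.2.x signature takes the optimizer
        opt_a.step()
        # ... the same zero_grad / manual_backward / step pattern applies to opt_b ...

    def configure_optimizers(self):
        opt_a = torch.optim.Adam(self.encoder.parameters(), lr=1e-3)
        opt_b = torch.optim.Adam(self.decoder.parameters(), lr=1e-3)
        return opt_a, opt_b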

azure-pipelines.yml

Lines changed: 8 additions & 4 deletions

@@ -23,7 +23,7 @@ jobs:
   # how much time to give 'run always even if cancelled tasks' before stopping them
   cancelTimeoutInMinutes: 2

-  pool: dsvm-spot-pool
+  pool: gridai-spot-pool

   #strategy:
   #  matrix:

@@ -58,7 +58,7 @@ jobs:
       export GIT_TERMINAL_PROMPT=1
       #sudo apt-get install -y cmake
       # python -m pip install "pip==20.1"
-      pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
+      pip install --requirement requirements.txt
       python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)"
       python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
       pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed

@@ -92,11 +92,15 @@ jobs:
     displayName: 'Statistics'

   - bash: |
-      python -m pytest benchmarks pl_examples -v --maxfail=2 --durations=0
-    displayName: 'Testing: extended'
+      python -m pytest benchmarks -v --maxfail=2 --durations=0
+    displayName: 'Testing: benchmarks'

   - bash: |
+      python -m pytest pl_examples -v --maxfail=2 --durations=0
       python setup.py install --user --quiet
       bash pl_examples/run_ddp-example.sh
+      cd pl_examples/basic_examples
+      bash submit_ddp_job.sh
+      bash submit_ddp2_job.sh
       pip uninstall -y pytorch-lightning
     displayName: 'Examples'

benchmarks/test_sharded_parity.py

Lines changed: 10 additions & 30 deletions

@@ -13,7 +13,6 @@
 # limitations under the License.

 import os
-import platform
 import time
 from typing import Type

@@ -22,25 +21,20 @@

 from pytorch_lightning import seed_everything, Trainer
 from pytorch_lightning.plugins import DDPSpawnShardedPlugin
-from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE
 from tests.accelerators import DDPLauncher
 from tests.helpers.boring_model import BoringModel, RandomDataset
+from tests.helpers.runif import RunIf


-@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
-@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
+@RunIf(min_gpus=1, skip_windows=True, fairscale=True)
 def test_ddp_sharded_plugin_correctness_one_gpu():
     plugin_parity_test(
         gpus=1,
         model_cls=SeedTrainLoaderModel,
     )


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP")
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
-@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
+@RunIf(min_gpus=1, skip_windows=True, fairscale=True, amp_native=True)
 def test_ddp_sharded_plugin_correctness_amp_one_gpu():
     plugin_parity_test(
         gpus=1,

@@ -50,9 +44,7 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu():


 @pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.")
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
+@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
 def test_ddp_sharded_plugin_correctness_multi_gpu():
     plugin_parity_test(
         gpus=2,

@@ -61,10 +53,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu():
     )


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP")
-@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
+@RunIf(min_gpus=2, skip_windows=True, fairscale=True, amp_native=True)
 def test_ddp_sharded_plugin_correctness_amp_multi_gpu():
     plugin_parity_test(
         gpus=2,

@@ -74,10 +63,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu():
     )


-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP")
-@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
+@RunIf(min_gpus=2, skip_windows=True, fairscale=True, amp_native=True)
 def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu():
     plugin_parity_test(
         gpus=2,

@@ -87,8 +73,7 @@ def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu():
     )


-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+@RunIf(min_gpus=2, fairscale=True)
 @pytest.mark.skipif(
     not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
 )

@@ -101,8 +86,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None):
     )


-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+@RunIf(min_gpus=2, fairscale=True)
 @pytest.mark.skipif(
     not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
 )

@@ -116,9 +100,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None):


 @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
+@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
 def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim():
     """
     Ensures same results using multiple optimizers across multiple GPUs

@@ -131,9 +113,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim():


 @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
-@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available")
+@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
 def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir):
     """
     Ensures using multiple optimizers across multiple GPUs with manual optimization
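The stacks of `@pytest.mark.skipif` above are collapsed into a single `RunIf` marker imported from `tests.helpers.runif`. That helper's real implementation is not part of this diff; a minimal sketch of the pattern it names, composing one `skipif` out of several requirement flags:

# Minimal sketch of a RunIf-style marker; the real tests/helpers/runif.py
# is not shown in this diff and may check more conditions.
import platform

import pytest
import torch


class RunIf:
    def __init__(self, min_gpus: int = 0, skip_windows: bool = False, fairscale: bool = False):
        conditions, reasons = [], []
        if min_gpus:
            conditions.append(torch.cuda.device_count() < min_gpus)
            reasons.append(f"requires at least {min_gpus} GPU(s)")
        if skip_windows:
            conditions.append(platform.system() == "Windows")
            reasons.append("does not run on Windows")
        if fairscale:
            try:
                import fairscale  # noqa: F401
                fairscale_available = True
            except ImportError:
                fairscale_available = False
            conditions.append(not fairscale_available)
            reasons.append("fairscale is not available")
        # skip when any single requirement is unmet
        self.marker = pytest.mark.skipif(any(conditions), reason=", ".join(reasons) or "n/a")

    def __call__(self, fn):
        return self.marker(fn)  # attach the composed skipif to the test function

Used as `@RunIf(min_gpus=2, skip_windows=True, fairscale=True)`, exactly as in the rewritten tests above.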
