Commit 0cd08b5

Merge branch 'master' into bugfix/duplicate-logs2
2 parents 42bfd40 + 052bc00 commit 0cd08b5

16 files changed: +386 additions, -32 deletions

.mergify.yml renamed to .github/mergify.yml

Lines changed: 3 additions & 2 deletions
@@ -38,7 +38,7 @@ pull_request_rules:
 
   - name: update PR
     conditions:
-      - conflict
+      - -conflict
       - -draft # filter-out GH draft PRs
       - label="0:] Ready-To-Go"
     actions:
@@ -50,7 +50,8 @@ pull_request_rules:
       - -draft # filter-out GH draft PRs
      - label="0:] Ready-To-Go"
       - "#approved-reviews-by<3" # number of review approvals
+      - "#review-requested<3" # number of requested reviews
     actions:
       request_reviews:
         teams:
-          - core-contributors
+          - "@PyTorchLightning/core-contributors"
Lines changed: 212 additions & 0 deletions
@@ -0,0 +1,212 @@
name: Multi Nodes GPU Tests

# Workflow Steps:
# 1. Checkout Pytorch Lightning
# 2. Set up Python
# 3. Configure AWS Credentials
# 4. Install AWS Client
# 5. Get Current Sha Commit
# 6. Create Job Name
# 7. Update Test Configuration File
# 8. Install EKSClient
# 9. Create Gpu Node Pool
# 10. Check Current Node Pool | Current Elatic Pods
# 11. Apply Elastic
# 12. Wait 5 sec
# 13. Find ETCD TCP Address
# 14. Update Test Configuration File
# 15. Apply Multi Node Testing
# 16. Wait 120 secs
# 17. Listen to Jobs Logging
# 18. Statistics
# 19. Upload coverage results
# 20. Upload coverage to Codecov
# 21. Delete Group Node

#on: push

on:
  push:
    branches:
      - master
      - release/*
  pull_request:
    types: [closed]

env:
  AWS_CLUSTER: pl-lightning-torchelastic
  NODE_TYPE: g4dn.xlarge
  NODES: 2
  NUM_GPUS: 1
  REGION: us-east-2
  MAX_CHECKS: 300
  CHECK_SPEEP: 2

jobs:
  multi-nodes-gpu-testing:
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        python-version: [3.7]
        pytorch-version: [1.5]
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50

    # runs only when merged happened.
    # if: github.event.pull_request.merged == true
    steps:

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          ref: ${{ github.event.base_ref }}

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      # Note: This uses an internal pip API and may not always work
      # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
      - name: Cache pip
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-multi-node
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          pip install awscli coverage
          # todo
          pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/[email protected] -q --no-cache-dir
          #pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/lightning-dtrun.git@mnodes -q --no-cache-dir

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
          aws-region: us-east-2

      - name: Get Current Sha Commit
        id: vars
        shell: bash
        run: |
          echo "::set-output name=SHA::$(git rev-parse --short HEAD)"
          echo $PWD

      - name: Create Job Name
        id: job
        shell: bash
        run: |
          echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )"
          echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )"

      - name: Install EKSClient
        run: |
          curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
        shell: bash

      - name: Create Gpu Node Pool
        run: |
          aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER
          eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES
          # eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES
        shell: bash

      - name: Check Current Node Pool | Current Elatic Pods
        run: |
          eksctl get nodegroups --cluster $AWS_CLUSTER
          kubectl get pods -n elastic-job

      - name: Apply Elastic
        run: |
          git clone https://github.com/pytorch/elastic.git
          cd elastic/kubernetes

          kubectl apply -k config/default

          kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml
          kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 5

      - name: Find ETCD TCP Address
        id: tcp
        shell: bash
        run: |
          echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' | head -1)"

      - name: Update Test Config. File
        run: |
          import os
          from dtrun.configs import prepare_multi_nodes_gpu_config

          assert os.path.isfile('./tests/mnode_tests.txt')
          prepare_multi_nodes_gpu_config(
              './.github/multi-nodes-gpu.yaml',
              './tests/mnode_tests.txt',
              sha="${{ steps.vars.outputs.SHA }}",
              tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",
              python_version="${{ matrix.python-version }}",
              torch_version="${{ matrix.pytorch-version }}",
              num_gpus=1,
          )
        shell: python

      - name: Apply Multi Node Testing
        run: |
          # cat ./.github/multi-nodes-gpu.yaml
          kubectl apply -f ./.github/multi-nodes-gpu.yaml
        shell: bash

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 400

      - name: Listen to Jobs Logging
        shell: bash
        run: |
          # todo: Enable automatic checking.
          # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
          # echo "Done waiting. Job status code: $status_code" && \
          kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
          if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
          cat xx00

      - name: Statistics
        if: success()
        run: |
          cat ./xx01 | tail -n +2 | base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage
          cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        if: always()
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: multi-nodes,pytest
          name: multi-nodes-coverage
          fail_ci_if_error: false

      - name: Delete Group Node
        if: always()
        run: |
          kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job
          eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER
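For reference, a minimal Python sketch of what the "Find ETCD TCP Address" step extracts: the same IPv4:port pattern the workflow greps out of the etcd pod logs, applied here to a hypothetical log line (the sample text is an assumption, not real pod output).

    import re

    # Same pattern the workflow greps from `kubectl logs etcd -n elastic-job`;
    # the sample log line below is hypothetical.
    ADDR_RE = re.compile(r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}")

    sample_log = "2021-01-19 12:00:00 etcd serving client requests on 10.0.12.34:2379"
    match = ADDR_RE.search(sample_log)
    print(match.group(0) if match else "no address found")  # -> 10.0.12.34:2379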

.github/workflows/release-pypi.yml

Lines changed: 15 additions & 7 deletions
@@ -29,14 +29,9 @@ jobs:
 
     - uses: actions/upload-artifact@v2
       with:
-        name: pypi-packages
+        name: pypi-packages-${{ github.sha }}
         path: dist
 
-  publish-package:
-    runs-on: ubuntu-20.04
-    needs: build-package
-    steps:
-    - uses: actions/checkout@v2
     - name: Upload to release
       if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
       uses: svenstaro/upload-release-action@v2
@@ -48,6 +43,19 @@ jobs:
         overwrite: false
         file_glob: true
 
+  publish-package:
+    runs-on: ubuntu-20.04
+    needs: build-package
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/download-artifact@v2
+      with:
+        name: pypi-packages-${{ github.sha }}
+        path: dist
+    - name: Show packages
+      run: |
+        ls -lh dist/
+
     - name: Delay releasing
       if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
       uses: juliangruber/sleep-action@v1
@@ -102,7 +110,7 @@ jobs:
 
     - uses: actions/download-artifact@v2
       with:
-        name: pypi-packages
+        name: pypi-packages-${{ github.sha }}
         path: dist
 
     - name: Pull files from S3

CHANGELOG.md

Lines changed: 5 additions & 4 deletions
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
+
 ## [unreleased.Bugfixes] - YYYY-MM-DD
 
 ### Added
@@ -21,15 +22,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Fixed
 
 
-- Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579))
 
 
-- Fixed logging on_train_batch_end in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521))
+## [1.1.5] - 2021-01-19
 
+### Fixed
 
+- Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579))
+- Fixed logging `on_train_batch_end` in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521))
 - Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519))
-
-
 - Fixed `val_check_interval` with `fast_dev_run` ([#5540](https://github.com/PyTorchLightning/pytorch-lightning/pull/5540))
README.md

Lines changed: 4 additions & 5 deletions
@@ -305,11 +305,10 @@ Lightning is also part of the [PyTorch ecosystem](https://pytorch.org/ecosystem/
 
 ### Asking for help
 If you have any questions please:
-1. [Read the docs](https://pytorch-lightning.rtfd.io/en/latest/).
-2. [Look it up in our forum (or add a new question)](https://forums.pytorchlightning.ai/)
-2. [Search through the issues](https://github.com/PytorchLightning/pytorch-lightning/issues?utf8=%E2%9C%93&q=my++question).
-3. [Join our slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A).
-4. [Ask on stackoverflow](https://stackoverflow.com/questions/ask?guided=false) with the tag pytorch-lightning.
+1. [Read the docs](https://pytorch-lightning.rtfd.io/en/latest).
+2. [Search through the Discussions](https://github.com/PyTorchLightning/pytorch-lightning/discussions).
+3. [Look it up in our forum (or add a new question)](https://forums.pytorchlightning.ai)
+4. [Join our slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A).
 
 ### Funding
 Building open-source software with only a few part-time people is hard!

pl_examples/domain_templates/semantic_segmentation.py

Lines changed: 1 addition & 1 deletion
@@ -284,7 +284,7 @@ def main(hparams: Namespace):
 
 if __name__ == '__main__':
     cli_lightning_logo()
-    parser = ArgumentParser()
+    parser = ArgumentParser(add_help=False)
     parser = SegModel.add_model_specific_args(parser)
     hparams = parser.parse_args()
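The `add_help=False` change matters because the base parser is handed to `SegModel.add_model_specific_args`, which in the Lightning examples typically builds a new `ArgumentParser` with `parents=[parser]`; argparse only allows `-h/--help` to be defined once, so the parent parser has to opt out of it. A minimal sketch of that behaviour, with hypothetical option names:

    from argparse import ArgumentParser

    # Parent parser opts out of -h/--help so it can be reused via `parents=`.
    parent = ArgumentParser(add_help=False)
    parent.add_argument('--data_path', type=str, default='./data')  # hypothetical option

    # The child parser adds its own -h/--help; had the parent kept add_help=True,
    # constructing the child would raise "conflicting option strings: -h/--help".
    child = ArgumentParser(parents=[parent])
    child.add_argument('--lr', type=float, default=0.01)  # hypothetical option

    args = child.parse_args(['--data_path', './some/dir', '--lr', '0.1'])
    print(args.data_path, args.lr)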

pytorch_lightning/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 import time
 
 _this_year = time.strftime("%Y")
-__version__ = '1.1.4'
+__version__ = '1.1.5'
 __author__ = 'William Falcon et al.'
 __author_email__ = '[email protected]'
 __license__ = 'Apache-2.0'

pytorch_lightning/cluster_environments/slurm_environment.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def __init__(self):
     def master_address(self):
         # figure out the root node addr
         try:
-            root_node = os.environ["SLURM_NODELIST"].split(" ")[0]
+            root_node = os.environ["SLURM_NODELIST"].split(" ")[0].split(",")[0]
         except Exception:
             root_node = "127.0.0.1"
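The extra `.split(",")[0]` covers the case where `SLURM_NODELIST` is comma-separated rather than space-separated. A small sketch with hypothetical node-list values (real SLURM may also use bracketed ranges such as `node[1-5]`, which are not expanded here):

    import os

    # Hypothetical SLURM_NODELIST values; the chained split picks the first host
    # whether the separator is a space or a comma.
    for nodelist in ("node1 node2", "node1,node2"):
        os.environ["SLURM_NODELIST"] = nodelist
        root_node = os.environ["SLURM_NODELIST"].split(" ")[0].split(",")[0]
        print(f"{nodelist!r} -> {root_node}")  # both print 'node1'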

pytorch_lightning/core/lightning.py

Lines changed: 7 additions & 4 deletions
@@ -464,8 +464,11 @@ def training_step(self, *args, **kwargs):
            Any of.

            - :class:`~torch.Tensor` - The loss tensor
-           - `dict` - A dictionary. Can include any keys, but must include the key 'loss'
-           - `None` - Training will skip to the next batch
+           - ``dict`` - A dictionary. Can include any keys, but must include the key ``'loss'``
+           - ``None`` - Training will skip to the next batch
+
+           Note:
+               Returning ``None`` is currently not supported for multi-GPU or TPU, or with 16-bit precision enabled.

        In this step you'd normally do the forward pass and calculate the loss for a batch.
        You can also do fancier things like multiple forward passes or something model specific.
@@ -640,7 +643,7 @@ def validation_step(self, *args, **kwargs):
            Any of.

            - Any object or value
-           - `None` - Validation will skip to the next batch
+           - ``None`` - Validation will skip to the next batch

            .. code-block:: python

@@ -825,7 +828,7 @@ def test_step(self, *args, **kwargs):
            Any of.

            - Any object or value
-           - `None` - Testing will skip to the next batch
+           - ``None`` - Testing will skip to the next batch

            .. code-block:: python
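As a companion to the updated docstring, a minimal sketch of a `training_step` that returns a ``dict``: per the documented contract the dict must carry the key ``'loss'``, while extra keys are free-form (the module and field names below are hypothetical, not taken from the repository).

    import torch
    import torch.nn.functional as F
    import pytorch_lightning as pl

    class LitClassifier(pl.LightningModule):  # hypothetical minimal module
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(28 * 28, 10)

        def forward(self, x):
            return self.layer(x.view(x.size(0), -1))

        def training_step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)
            loss = F.cross_entropy(logits, y)
            # A dict return must include the key 'loss'; extra keys are free-form.
            # Returning the bare loss Tensor is also valid; returning None skips
            # the batch (with the multi-GPU/TPU/16-bit caveat noted above).
            return {'loss': loss, 'acc': (logits.argmax(dim=1) == y).float().mean()}

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=1e-3)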

pytorch_lightning/loggers/neptune.py

Lines changed: 1 addition & 1 deletion
@@ -200,7 +200,7 @@ def __init__(
         self._prefix = prefix
         self._kwargs = kwargs
         self.experiment_id = experiment_id
-        self._experiment = self._create_or_get_experiment()
+        self._experiment = None
 
         log.info(f'NeptuneLogger will work in {"offline" if self.offline_mode else "online"} mode')
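The replaced line stops creating the Neptune experiment in the constructor, presumably so it is created lazily on first use, in line with the bugfix/duplicate-logs2 branch this merge targets. A minimal sketch of that lazy-initialization pattern, assuming the logger exposes an `experiment` property (class and method names here are illustrative, not the exact Neptune implementation):

    class LazyExperimentLogger:
        """Illustrative stand-in for a logger that creates its backend lazily."""

        def __init__(self):
            # Nothing is created at construction time, so merely instantiating
            # the logger (e.g. on every rank, or in tests) spawns no experiment.
            self._experiment = None

        def _create_or_get_experiment(self):
            print("creating experiment once")  # placeholder for the real API call
            return object()

        @property
        def experiment(self):
            # Created on first access and cached afterwards, which avoids
            # duplicate experiments and duplicate log entries.
            if self._experiment is None:
                self._experiment = self._create_or_get_experiment()
            return self._experiment

    logger = LazyExperimentLogger()
    assert logger.experiment is logger.experiment  # created exactly once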
