Commit 0cd08b5

Merge branch 'master' into bugfix/duplicate-logs2
2 parents 42bfd40 + 052bc00 commit 0cd08b5

16 files changed: +386 additions, -32 deletions

.mergify.yml renamed to .github/mergify.yml

Lines changed: 3 additions & 2 deletions
@@ -38,7 +38,7 @@ pull_request_rules:
 
   - name: update PR
     conditions:
-      - conflict
+      - -conflict
       - -draft # filter-out GH draft PRs
       - label="0:] Ready-To-Go"
     actions:
@@ -50,7 +50,8 @@ pull_request_rules:
       - -draft # filter-out GH draft PRs
      - label="0:] Ready-To-Go"
       - "#approved-reviews-by<3" # number of review approvals
+      - "#review-requested<3" # number of requested reviews
     actions:
       request_reviews:
         teams:
-          - core-contributors
+          - "@PyTorchLightning/core-contributors"
Lines changed: 212 additions & 0 deletions
@@ -0,0 +1,212 @@
name: Multi Nodes GPU Tests

# Workflow Steps:
# 1. Checkout Pytorch Lightning
# 2. Set up Python
# 3. Configure AWS Credentials
# 4. Install AWS Client
# 5. Get Current Sha Commit
# 6. Create Job Name
# 7. Update Test Configuration File
# 8. Install EKSClient
# 9. Create Gpu Node Pool
# 10. Check Current Node Pool | Current Elatic Pods
# 11. Apply Elastic
# 12. Wait 5 sec
# 13. Find ETCD TCP Address
# 14. Update Test Configuration File
# 15. Apply Multi Node Testing
# 16. Wait 120 secs
# 17. Listen to Jobs Logging
# 18. Statistics
# 19. Upload coverage results
# 20. Upload coverage to Codecov
# 21. Delete Group Node

#on: push

on:
  push:
    branches:
      - master
      - release/*
  pull_request:
    types: [closed]

env:
  AWS_CLUSTER: pl-lightning-torchelastic
  NODE_TYPE: g4dn.xlarge
  NODES: 2
  NUM_GPUS: 1
  REGION: us-east-2
  MAX_CHECKS: 300
  CHECK_SPEEP: 2

jobs:
  multi-nodes-gpu-testing:
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        python-version: [3.7]
        pytorch-version: [1.5]
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50

    # runs only when merged happened.
    # if: github.event.pull_request.merged == true
    steps:

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          ref: ${{ github.event.base_ref }}

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      # Note: This uses an internal pip API and may not always work
      # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
      - name: Cache pip
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-multi-node
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          pip install awscli coverage
          # todo
          pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/[email protected] -q --no-cache-dir
          #pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/lightning-dtrun.git@mnodes -q --no-cache-dir

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
          aws-region: us-east-2

      - name: Get Current Sha Commit
        id: vars
        shell: bash
        run: |
          echo "::set-output name=SHA::$(git rev-parse --short HEAD)"
          echo $PWD

      - name: Create Job Name
        id: job
        shell: bash
        run: |
          echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )"
          echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )"

      - name: Install EKSClient
        run: |
          curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
        shell: bash

      - name: Create Gpu Node Pool
        run: |
          aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER
          eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES
          # eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES
        shell: bash

      - name: Check Current Node Pool | Current Elatic Pods
        run: |
          eksctl get nodegroups --cluster $AWS_CLUSTER
          kubectl get pods -n elastic-job

      - name: Apply Elastic
        run: |
          git clone https://github.com/pytorch/elastic.git
          cd elastic/kubernetes

          kubectl apply -k config/default

          kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml
          kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 5

      - name: Find ETCD TCP Address
        id: tcp
        shell: bash
        run: |
          echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' | head -1)"

      - name: Update Test Config. File
        run: |
          import os
          from dtrun.configs import prepare_multi_nodes_gpu_config

          assert os.path.isfile('./tests/mnode_tests.txt')
          prepare_multi_nodes_gpu_config(
              './.github/multi-nodes-gpu.yaml',
              './tests/mnode_tests.txt',
              sha="${{ steps.vars.outputs.SHA }}",
              tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",
              python_version="${{ matrix.python-version }}",
              torch_version="${{ matrix.pytorch-version }}",
              num_gpus=1,
          )
        shell: python

      - name: Apply Multi Node Testing
        run: |
          # cat ./.github/multi-nodes-gpu.yaml
          kubectl apply -f ./.github/multi-nodes-gpu.yaml
        shell: bash

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 400

      - name: Listen to Jobs Logging
        shell: bash
        run: |
          # todo: Enable automatic checking.
          # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
          # echo "Done waiting. Job status code: $status_code" && \
          kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
          if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
          cat xx00

      - name: Statistics
        if: success()
        run: |
          cat ./xx01 | tail -n +2 | base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage
          cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        if: always()
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: multi-nodes,pytest
          name: multi-nodes-coverage
          fail_ci_if_error: false

      - name: Delete Group Node
        if: always()
        run: |
          kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job
          eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER
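For reference, a minimal Python sketch of what the "Find ETCD TCP Address" step extracts: the same IPv4:port pattern the workflow greps out of the etcd pod logs, applied here to a hypothetical log line (the sample text is an assumption, not real pod output).

    import re

    # Same pattern the workflow greps from `kubectl logs etcd -n elastic-job`;
    # the sample log line below is hypothetical.
    ADDR_RE = re.compile(r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}")

    sample_log = "2021-01-19 12:00:00 etcd serving client requests on 10.0.12.34:2379"
    match = ADDR_RE.search(sample_log)
    print(match.group(0) if match else "no address found")  # -> 10.0.12.34:2379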

.github/workflows/release-pypi.yml

Lines changed: 15 additions & 7 deletions
@@ -29,14 +29,9 @@ jobs:
 
     - uses: actions/upload-artifact@v2
       with:
-        name: pypi-packages
+        name: pypi-packages-${{ github.sha }}
         path: dist
 
-  publish-package:
-    runs-on: ubuntu-20.04
-    needs: build-package
-    steps:
-    - uses: actions/checkout@v2
     - name: Upload to release
       if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
       uses: svenstaro/upload-release-action@v2
@@ -48,6 +43,19 @@ jobs:
         overwrite: false
         file_glob: true
 
+  publish-package:
+    runs-on: ubuntu-20.04
+    needs: build-package
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/download-artifact@v2
+      with:
+        name: pypi-packages-${{ github.sha }}
+        path: dist
+    - name: Show packages
+      run: |
+        ls -lh dist/
+
     - name: Delay releasing
       if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
       uses: juliangruber/sleep-action@v1
@@ -102,7 +110,7 @@ jobs:
 
     - uses: actions/download-artifact@v2
       with:
-        name: pypi-packages
+        name: pypi-packages-${{ github.sha }}
         path: dist
 
     - name: Pull files from S3

CHANGELOG.md

Lines changed: 5 additions & 4 deletions
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
+
 ## [unreleased.Bugfixes] - YYYY-MM-DD
 
 ### Added
@@ -21,15 +22,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Fixed
 
 
-- Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579))
 
 
-- Fixed logging on_train_batch_end in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521))
+## [1.1.5] - 2021-01-19
 
+### Fixed
 
+- Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579))
+- Fixed logging `on_train_batch_end` in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521))
 - Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519))
-
-
 - Fixed `val_check_interval` with `fast_dev_run` ([#5540](https://github.com/PyTorchLightning/pytorch-lightning/pull/5540))
README.md

Lines changed: 4 additions & 5 deletions
@@ -305,11 +305,10 @@ Lightning is also part of the [PyTorch ecosystem](https://pytorch.org/ecosystem/
 
 ### Asking for help
 If you have any questions please:
-1. [Read the docs](https://pytorch-lightning.rtfd.io/en/latest/).
-2. [Look it up in our forum (or add a new question)](https://forums.pytorchlightning.ai/)
-2. [Search through the issues](https://github.com/PytorchLightning/pytorch-lightning/issues?utf8=%E2%9C%93&q=my++question).
-3. [Join our slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A).
-4. [Ask on stackoverflow](https://stackoverflow.com/questions/ask?guided=false) with the tag pytorch-lightning.
+1. [Read the docs](https://pytorch-lightning.rtfd.io/en/latest).
+2. [Search through the Discussions](https://github.com/PyTorchLightning/pytorch-lightning/discussions).
+3. [Look it up in our forum (or add a new question)](https://forums.pytorchlightning.ai)
+4. [Join our slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A).
 
 ### Funding
 Building open-source software with only a few part-time people is hard!

pl_examples/domain_templates/semantic_segmentation.py

Lines changed: 1 addition & 1 deletion
@@ -284,7 +284,7 @@ def main(hparams: Namespace):
 
 if __name__ == '__main__':
     cli_lightning_logo()
-    parser = ArgumentParser()
+    parser = ArgumentParser(add_help=False)
     parser = SegModel.add_model_specific_args(parser)
     hparams = parser.parse_args()
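The `add_help=False` change matters because the base parser is handed to `SegModel.add_model_specific_args`, which in the Lightning examples typically builds a new `ArgumentParser` with `parents=[parser]`; argparse only allows `-h/--help` to be defined once, so the parent parser has to opt out of it. A minimal sketch of that behaviour, with hypothetical option names:

    from argparse import ArgumentParser

    # Parent parser opts out of -h/--help so it can be reused via `parents=`.
    parent = ArgumentParser(add_help=False)
    parent.add_argument('--data_path', type=str, default='./data')  # hypothetical option

    # The child parser adds its own -h/--help; had the parent kept add_help=True,
    # constructing the child would raise "conflicting option strings: -h/--help".
    child = ArgumentParser(parents=[parent])
    child.add_argument('--lr', type=float, default=0.01)  # hypothetical option

    args = child.parse_args(['--data_path', './some/dir', '--lr', '0.1'])
    print(args.data_path, args.lr)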

pytorch_lightning/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 import time
 
 _this_year = time.strftime("%Y")
-__version__ = '1.1.4'
+__version__ = '1.1.5'
 __author__ = 'William Falcon et al.'
 __author_email__ = '[email protected]'
 __license__ = 'Apache-2.0'

pytorch_lightning/cluster_environments/slurm_environment.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def __init__(self):
     def master_address(self):
         # figure out the root node addr
         try:
-            root_node = os.environ["SLURM_NODELIST"].split(" ")[0]
+            root_node = os.environ["SLURM_NODELIST"].split(" ")[0].split(",")[0]
         except Exception:
             root_node = "127.0.0.1"
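The extra `.split(",")[0]` covers the case where `SLURM_NODELIST` is comma-separated rather than space-separated. A small sketch with hypothetical node-list values (real SLURM may also use bracketed ranges such as `node[1-5]`, which are not expanded here):

    import os

    # Hypothetical SLURM_NODELIST values; the chained split picks the first host
    # whether the separator is a space or a comma.
    for nodelist in ("node1 node2", "node1,node2"):
        os.environ["SLURM_NODELIST"] = nodelist
        root_node = os.environ["SLURM_NODELIST"].split(" ")[0].split(",")[0]
        print(f"{nodelist!r} -> {root_node}")  # both print 'node1'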

pytorch_lightning/core/lightning.py

Lines changed: 7 additions & 4 deletions
@@ -464,8 +464,11 @@ def training_step(self, *args, **kwargs):
            Any of.

            - :class:`~torch.Tensor` - The loss tensor
-           - `dict` - A dictionary. Can include any keys, but must include the key 'loss'
-           - `None` - Training will skip to the next batch
+           - ``dict`` - A dictionary. Can include any keys, but must include the key ``'loss'``
+           - ``None`` - Training will skip to the next batch
+
+           Note:
+               Returning ``None`` is currently not supported for multi-GPU or TPU, or with 16-bit precision enabled.

        In this step you'd normally do the forward pass and calculate the loss for a batch.
        You can also do fancier things like multiple forward passes or something model specific.
@@ -640,7 +643,7 @@ def validation_step(self, *args, **kwargs):
            Any of.

            - Any object or value
-           - `None` - Validation will skip to the next batch
+           - ``None`` - Validation will skip to the next batch

            .. code-block:: python

@@ -825,7 +828,7 @@ def test_step(self, *args, **kwargs):
            Any of.

            - Any object or value
-           - `None` - Testing will skip to the next batch
+           - ``None`` - Testing will skip to the next batch

            .. code-block:: python
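As a companion to the updated docstring, a minimal sketch of a `training_step` that returns a ``dict``: per the documented contract the dict must carry the key ``'loss'``, while extra keys are free-form (the module and field names below are hypothetical, not taken from the repository).

    import torch
    import torch.nn.functional as F
    import pytorch_lightning as pl

    class LitClassifier(pl.LightningModule):  # hypothetical minimal module
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(28 * 28, 10)

        def forward(self, x):
            return self.layer(x.view(x.size(0), -1))

        def training_step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)
            loss = F.cross_entropy(logits, y)
            # A dict return must include the key 'loss'; extra keys are free-form.
            # Returning the bare loss Tensor is also valid; returning None skips
            # the batch (with the multi-GPU/TPU/16-bit caveat noted above).
            return {'loss': loss, 'acc': (logits.argmax(dim=1) == y).float().mean()}

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=1e-3)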

pytorch_lightning/loggers/neptune.py

Lines changed: 1 addition & 1 deletion
@@ -200,7 +200,7 @@ def __init__(
         self._prefix = prefix
         self._kwargs = kwargs
         self.experiment_id = experiment_id
-        self._experiment = self._create_or_get_experiment()
+        self._experiment = None
 
         log.info(f'NeptuneLogger will work in {"offline" if self.offline_mode else "online"} mode')
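The replaced line stops creating the Neptune experiment in the constructor, presumably so it is created lazily on first use, in line with the bugfix/duplicate-logs2 branch this merge targets. A minimal sketch of that lazy-initialization pattern, assuming the logger exposes an `experiment` property (class and method names here are illustrative, not the exact Neptune implementation):

    class LazyExperimentLogger:
        """Illustrative stand-in for a logger that creates its backend lazily."""

        def __init__(self):
            # Nothing is created at construction time, so merely instantiating
            # the logger (e.g. on every rank, or in tests) spawns no experiment.
            self._experiment = None

        def _create_or_get_experiment(self):
            print("creating experiment once")  # placeholder for the real API call
            return object()

        @property
        def experiment(self):
            # Created on first access and cached afterwards, which avoids
            # duplicate experiments and duplicate log entries.
            if self._experiment is None:
                self._experiment = self._create_or_get_experiment()
            return self._experiment

    logger = LazyExperimentLogger()
    assert logger.experiment is logger.experiment  # created exactly once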
