Skip to content

Commit cc40781

Browse files
authored
Merge branch 'master' into tests/update-deepspeed-lite
2 parents edff47c + fff62f0 commit cc40781

File tree

46 files changed

+816
-384
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+816
-384
lines changed

.azure/gpu-tests.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,15 @@ jobs:
117117
timeoutInMinutes: "35"
118118
condition: eq(variables['continue'], '1')
119119

120+
- bash: bash run_standalone_tasks.sh
121+
workingDirectory: tests/tests_pytorch
122+
env:
123+
PL_USE_MOCKED_MNIST: "1"
124+
PL_RUN_CUDA_TESTS: "1"
125+
displayName: 'Testing: PyTorch standalone tasks'
126+
timeoutInMinutes: "10"
127+
condition: eq(variables['continue'], '1')
128+
120129
- bash: |
121130
python -m coverage report
122131
python -m coverage xml

.azure/hpu-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ jobs:
8484
8585
- task: PublishTestResults@2
8686
inputs:
87-
testResultsFiles: 'hpu*_test-results.xml'
87+
testResultsFiles: 'tests/tests_pytorch/hpu*_test-results.xml'
8888
testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
8989
condition: succeededOrFailed()
9090
displayName: 'Publish test results'

.circleci/config.yml

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ references:
8181
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \
8282
job_name=${job_name#job.batch/}
8383
job_name=${job_name% created}
84+
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
85+
echo "GKE pod name: $pod_name"
8486
echo "Waiting on kubernetes job: $job_name"
8587
i=0 && \
8688
# N checks spaced 30s apart = 900s total.
@@ -92,8 +94,6 @@ references:
9294
printf "Waiting for job to finish: " && \
9395
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
9496
echo "Done waiting. Job status code: $status_code" && \
95-
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
96-
echo "GKE pod name: $pod_name" && \
9797
kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
9898
if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
9999
# First portion is the test logs. Print these to Github Action stdout.
@@ -106,10 +106,6 @@ references:
106106
name: Statistics
107107
command: |
108108
mv ./xx01 coverage.xml
109-
# TODO: add human readable report
110-
cat coverage.xml
111-
sudo pip install pycobertura
112-
pycobertura show coverage.xml
113109
114110
jobs:
115111

@@ -119,7 +115,7 @@ jobs:
119115
environment:
120116
- XLA_VER: 1.9
121117
- PYTHON_VER: 3.7
122-
- MAX_CHECKS: 240
118+
- MAX_CHECKS: 1000
123119
- CHECK_SPEEP: 5
124120
steps:
125121
- checkout

.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,17 @@ python collect_env_details.py
4646
You can also fill out the list below manually.
4747
-->
4848

49+
- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow):
4950
- PyTorch Lightning Version (e.g., 1.5.0):
51+
- Lightning App Version (e.g., 0.5.2):
5052
- PyTorch Version (e.g., 1.10):
5153
- Python version (e.g., 3.9):
5254
- OS (e.g., Linux):
5355
- CUDA/cuDNN version:
5456
- GPU models and configuration:
5557
- How you installed PyTorch (`conda`, `pip`, source):
5658
- If compiling from source, the output of `torch.__config__.show()`:
59+
- Running environment of LightningApp (e.g. local, cloud):
5760
- Any other relevant information:
5861

5962
### Additional context

dockers/tpu-tests/tpu_test_cases.jsonnet

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ local tputests = base.BaseTest {
88
mode: 'postsubmit',
99
configMaps: [],
1010

11-
timeout: 1200, # 20 minutes, in seconds.
11+
timeout: 6000, # 100 minutes, in seconds.
1212

1313
image: 'pytorchlightning/pytorch_lightning',
1414
imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}',
@@ -34,16 +34,11 @@ local tputests = base.BaseTest {
3434
pip install -e .[test]
3535
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
3636
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
37+
export PL_RUN_TPU_TESTS=1
3738
cd tests/tests_pytorch
38-
echo $PWD
39-
# TODO (@kaushikb11): Add device stats tests here
40-
coverage run --source pytorch_lightning -m pytest -v --capture=no \
41-
strategies/test_tpu_spawn.py \
42-
profilers/test_xla_profiler.py \
43-
accelerators/test_tpu.py \
44-
models/test_tpu.py \
45-
plugins/environments/test_xla_environment.py \
46-
utilities/test_xla_device_utils.py
39+
coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
40+
echo "\n||| Running standalone tests |||\n"
41+
bash run_standalone_tests.sh -b 1
4742
test_exit_code=$?
4843
echo "\n||| END PYTEST LOGS |||\n"
4944
coverage xml

docs/source-pytorch/accelerators/ipu_basic.rst

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ Currently there are some known limitations that are being addressed in the near
6262

6363
Please see the `MNIST example <https://github.com/Lightning-AI/lightning/blob/master/examples/pl_ipu/mnist_sample.py>`__ which displays most of the limitations and how to overcome them till they are resolved.
6464

65-
* ``self.log`` is not supported in the ``training_step``, ``validation_step``, ``test_step`` or ``predict_step``. This is due to the step function being traced and sent to the IPU devices. We're actively working on fixing this
66-
* Multiple optimizers are not supported. ``training_step`` only supports returning one loss from the ``training_step`` function as a result
67-
* Since the step functions are traced, branching logic or any form of primitive values are traced into constants. Be mindful as this could lead to errors in your custom code
68-
* Clipping gradients is not supported
65+
* ``self.log`` is not supported in the ``training_step``, ``validation_step``, ``test_step`` or ``predict_step``. This is due to the step function being traced and sent to the IPU devices. We're actively working on fixing this.
66+
* Multiple optimizers are not supported. ``training_step`` only supports returning one loss from the ``training_step`` function as a result.
67+
* Since the step functions are traced, branching logic or any form of primitive values are traced into constants. Be mindful as this could lead to errors in your custom code.
68+
* Clipping gradients is not supported.
69+
* It is not possible to use :class:`torch.utils.data.BatchSampler` in your dataloaders if you are using multiple IPUs.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@ profile = "black"
1515
line_length = 120
1616
force_sort_within_sections = "False"
1717
order_by_type = "False"
18+
skip = ["_notebooks"]
1819

1920

2021
[tool.black]
2122
line-length = 120
23+
exclude = '(_notebooks/.*)'
2224

2325

2426
[tool.mypy]
@@ -61,7 +63,6 @@ module = [
6163
"pytorch_lightning.profilers.simple",
6264
"pytorch_lightning.strategies.ddp",
6365
"pytorch_lightning.strategies.ddp_spawn",
64-
"pytorch_lightning.strategies.deepspeed",
6566
"pytorch_lightning.strategies.fully_sharded",
6667
"pytorch_lightning.strategies.ipu",
6768
"pytorch_lightning.strategies.sharded",

requirements/pytorch/base.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ tqdm>=4.57.0, <=4.63.0
44
PyYAML>=5.4, <=6.0
55
fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0
66
tensorboard>=2.9.1, <2.10.0
7-
torchmetrics>=0.7.0, <0.9.2 # needed for using fixed compare_version
7+
torchmetrics>=0.7.0, <0.9.3 # needed for using fixed compare_version
88
pyDeprecate>=0.3.1, <=0.3.2
99
packaging>=17.0, <=21.3
1010
typing-extensions>=4.0.0, <4.3.1

requirements/pytorch/extra.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# extended list of package dependencies to reach full functionality
22
matplotlib>3.1, <3.5.3
3-
torchtext>=0.10.*, <=0.12.0
3+
torchtext>=0.10.*, <0.14.0
44
omegaconf>=2.0.5, <2.3.0
55
hydra-core>=1.0.5, <1.3.0
66
jsonargparse[signatures]>=4.12.0, <=4.12.0

src/pytorch_lightning/CHANGELOG.md

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
167167
- Updated Habana Accelerator's `auto_device_count`, `is_available` & `get_device_name` methods based on the latest torch habana package ([#13423](https://github.com/PyTorchLightning/pytorch-lightning/pull/13423))
168168

169169

170-
-
170+
- Disallowed using `BatchSampler` when running on multiple IPUs ([#13854](https://github.com/PyTorchLightning/pytorch-lightning/pull/13854))
171171

172172

173173
### Deprecated
@@ -348,6 +348,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
348348

349349
- Improved support for custom `DataLoader`s when instantiated in `*_dataloader` hook ([#12981](https://github.com/PyTorchLightning/pytorch-lightning/pull/12981))
350350

351+
- Allowed custom `BatchSampler`s when instantiated in `*_dataloader` hook ([#13640](https://github.com/PyTorchLightning/pytorch-lightning/pull/13640))
352+
351353

352354
- Fixed an issue with unsupported torch.inference_mode() on hpu backends by making it use no_grad ([#13014](https://github.com/PyTorchLightning/pytorch-lightning/pull/13014))
353355

@@ -379,6 +381,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
379381
- Fixed main progress bar counter when `val_check_interval=int` and `check_val_every_n_epoch=None` ([#12832](https://github.com/Lightning-AI/lightning/pull/12832))
380382

381383

384+
- Improved support for custom `ReduceLROnPlateau` scheduler if `reduce_on_plateau` is set by the user in scheduler config ([#13838](https://github.com/Lightning-AI/lightning/pull/13838))
385+
386+
387+
- Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/Lightning-AI/lightning/pull/13645))
388+
389+
390+
- Fixed error handling in learning rate finder when not enough data points are available to give a good suggestion ([#13845](https://github.com/Lightning-AI/lightning/pull/13845))
391+
392+
393+
- Fixed an issue that caused the learning rate finder to set the model's learning rate to None when no suggestion was possible ([#13845](https://github.com/Lightning-AI/lightning/pull/13845))
394+
395+
396+
382397
## [1.6.5] - 2022-07-13
383398

384399
### Fixed
@@ -389,9 +404,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
389404
- Fixed the restoration of log step during restart ([#13467](https://github.com/PyTorchLightning/pytorch-lightning/pull/13467))
390405

391406

392-
- Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/PyTorchLightning/pytorch-lightning/pull/13645))
393-
394-
395407
## [1.6.4] - 2022-06-01
396408

397409
### Added

0 commit comments

Comments
 (0)