From 64a4f6a6821b5ac860209dca8144022786e13e0b Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 5 May 2022 17:24:44 +0200 Subject: [PATCH 01/20] CI: Azure - multiple configs --- .azure-pipelines/gpu-tests.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index d9b59c5b2cfc0..946f04c5bfe86 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -18,6 +18,14 @@ pr: jobs: - job: pytest + strategy: + matrix: + 'PyTorch - stable': + pyVersion: '3.7' + ptVersion: '1.8' + 'PyTorch - latest': + pyVersion: '3.9' + ptVersion: '1.11' # how long to run the job before automatically cancelling timeoutInMinutes: "55" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -30,7 +38,7 @@ jobs: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 # run on torch 1.8 as it's the LTS version # TODO: Unpin sha256 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" + image: "pytorchlightning/pytorch_lightning:base-cuda-py$(pyVersion)-torch$(ptVersion)@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" # default shm size is 64m. 
Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" From db3ebb28b6951d4b43e6fa967903942d1d44b864 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 5 May 2022 17:28:58 +0200 Subject: [PATCH 02/20] short --- .github/workflows/ci_schema.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml index acf210c58da06..03b230124085d 100644 --- a/.github/workflows/ci_schema.yml +++ b/.github/workflows/ci_schema.yml @@ -16,9 +16,9 @@ jobs: pip install "check-jsonschema>=0.10" - name: GH Workflows - run: | - check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows" + run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows" - name: Azure Pipelines - run: | - check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json" + env: + SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json + run: check-jsonschema .azure/*.yml --schemafile "$SCHEMA_FILE" From 39c8add090a514501b04117b60b775b0b2a527b1 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 5 May 2022 17:34:12 +0200 Subject: [PATCH 03/20] dir --- .github/workflows/ci_schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml index 03b230124085d..54efaff27a201 100644 --- a/.github/workflows/ci_schema.yml +++ b/.github/workflows/ci_schema.yml @@ -21,4 +21,4 @@ jobs: - name: Azure Pipelines env: SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json - run: check-jsonschema .azure/*.yml --schemafile "$SCHEMA_FILE" + run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE" From 
987ebc61005a3ee7e1a53d45a10720f9047a085c Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 5 May 2022 17:41:20 +0200 Subject: [PATCH 04/20] ubuntu --- dockers/base-cuda/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 41dea516ae378..1bfb949eb73c6 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +ARG UBUNTU_VERSION=20.04 ARG CUDA_VERSION=11.3.1 -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.8 From 1696b9aa9df9f98642316730df45e05bef3986d3 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 6 May 2022 05:03:56 +0200 Subject: [PATCH 05/20] names --- .azure-pipelines/gpu-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 946f04c5bfe86..c20d88391dd61 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -20,10 +20,10 @@ jobs: - job: pytest strategy: matrix: - 'PyTorch - stable': + 'PyTorch - LTS': pyVersion: '3.7' ptVersion: '1.8' - 'PyTorch - latest': + 'PyTorch - stable': pyVersion: '3.9' ptVersion: '1.11' # how long to run the job before automatically cancelling From 5059e1d26e2fe22f5bf17d1d5beebd63094d93df Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 6 May 2022 05:04:31 +0200 Subject: [PATCH 06/20] benchmark --- .azure-pipelines/gpu-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml index f9b8406d2194f..07aef0725941b 100644 --- a/.azure-pipelines/gpu-benchmark.yml +++ b/.azure-pipelines/gpu-benchmark.yml @@ -29,7 +29,7 @@ jobs: pool: azure-gpus-spot container: # TODO: Unpin sha256 - image: 
"pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all From 1b928d56ed238d814139ce904fb418611c148c9c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 6 May 2022 06:40:42 +0200 Subject: [PATCH 07/20] Apply suggestions from code review Co-authored-by: Akihiro Nitta --- .azure-pipelines/gpu-tests.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 22277cbbf230c..59dd7472e1b2a 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -21,11 +21,9 @@ jobs: strategy: matrix: 'PyTorch - LTS': - pyVersion: '3.7' - ptVersion: '1.8' + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" 'PyTorch - stable': - pyVersion: '3.9' - ptVersion: '1.11' + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" # how long to run the job before automatically cancelling timeoutInMinutes: "55" # how much time to give 'run always even if cancelled tasks' before stopping them From 28f677a48bfdc0eb07794a8c724b643a609bbcc1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 6 May 2022 11:57:08 +0200 Subject: [PATCH 08/20] ... 
--- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 59dd7472e1b2a..c051d3fee272f 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -35,7 +35,7 @@ jobs: container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 # run on torch 1.8 as it's the LTS version - image: "pytorchlightning/pytorch_lightning:base-cuda-py$(pyVersion)-torch$(ptVersion)" + image: $(image) # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" From 338476d3362f5ac8387f1d5b747e59c90d7da9eb Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 6 May 2022 14:02:07 +0200 Subject: [PATCH 09/20] prune MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- .azure-pipelines/gpu-tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index c051d3fee272f..bbec7935a24af 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -33,8 +33,6 @@ jobs: # ToDo: this need to have installed docker in the base image... container: - # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - # run on torch 1.8 as it's the LTS version image: $(image) # default shm size is 64m. 
Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' From 727b22670c0efa2763967adcc89ea385ab0cfb2a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 9 May 2022 17:39:42 +0900 Subject: [PATCH 10/20] Temporarily don't fail fast in standalone tests --- tests/standalone_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/standalone_tests.sh b/tests/standalone_tests.sh index 7b7dd361ab0b1..2cb39aafa184d 100755 --- a/tests/standalone_tests.sh +++ b/tests/standalone_tests.sh @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -set -e +# set -e # FIXME # this environment variable allows special tests to run export PL_RUN_STANDALONE_TESTS=1 From 3f7df1bc9dacc529f878d6bb15399efbd1df4cf2 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 9 May 2022 19:34:00 +0900 Subject: [PATCH 11/20] Don't use amp with fsdp native --- tests/strategies/test_ddp_fully_sharded_native.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/strategies/test_ddp_fully_sharded_native.py b/tests/strategies/test_ddp_fully_sharded_native.py index cf4973e5ae035..ddf6fecb99016 100644 --- a/tests/strategies/test_ddp_fully_sharded_native.py +++ b/tests/strategies/test_ddp_fully_sharded_native.py @@ -112,7 +112,6 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): accelerator="gpu", devices=2, strategy="fsdp_native", - precision=16, max_epochs=1, sync_batchnorm=True, ) @@ -125,7 +124,11 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir): model = TestFSDPModel() trainer = Trainer( - default_root_dir=tmpdir, accelerator="gpu", devices=1, strategy="fsdp_native", precision=16, max_epochs=1 + default_root_dir=tmpdir, + accelerator="gpu", + devices=1, + strategy="fsdp_native", + max_epochs=1, ) _run_multiple_stages(trainer, model, 
os.path.join(tmpdir, "last.ckpt")) @@ -141,7 +144,6 @@ def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir): accelerator="gpu", devices=2, strategy="fsdp_native", - precision=16, max_epochs=1, callbacks=[ck], ) From fd5237d03a5b8d60c6c4dbdeb2e1fe6cad4a713f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 9 May 2022 19:42:48 +0900 Subject: [PATCH 12/20] Increase timeout for PyTorch 1.11 testing --- .azure-pipelines/gpu-tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index bbec7935a24af..8be6773170518 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -22,10 +22,12 @@ jobs: matrix: 'PyTorch - LTS': image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" + timeoutInMinutes: "55" 'PyTorch - stable': image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" + timeoutInMinutes: "65" # how long to run the job before automatically cancelling - timeoutInMinutes: "55" + timeoutInMinutes: $(timeoutInMinutes) # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" From 73e659e3ddf0a356f4e358df580f77b508281ea6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 10 May 2022 08:57:30 +0900 Subject: [PATCH 13/20] Remove TODO --- .azure-pipelines/gpu-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 8be6773170518..6726139cd5c6a 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -33,7 +33,6 @@ jobs: pool: azure-gpus-spot - # ToDo: this need to have installed docker in the base image... container: image: $(image) # default shm size is 64m. 
Increase it to avoid: From 6c7202696d406076f78d835d84a6c65cdad4b885 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 10 May 2022 09:07:28 +0900 Subject: [PATCH 14/20] Revert "Increase timeout for PyTorch 1.11 testing" This reverts commit fd5237d03a5b8d60c6c4dbdeb2e1fe6cad4a713f. --- .azure-pipelines/gpu-tests.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 6726139cd5c6a..1b8dbbed3e489 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -22,12 +22,10 @@ jobs: matrix: 'PyTorch - LTS': image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" - timeoutInMinutes: "55" 'PyTorch - stable': image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" - timeoutInMinutes: "65" # how long to run the job before automatically cancelling - timeoutInMinutes: $(timeoutInMinutes) + timeoutInMinutes: "55" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" From 0890cdb8075ef98d460a1f62b652528d06220cdd Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 10 May 2022 09:08:01 +0900 Subject: [PATCH 15/20] Increase timeout --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 1b8dbbed3e489..1b1b584eba9fd 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -25,7 +25,7 @@ jobs: 'PyTorch - stable': image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" # how long to run the job before automatically cancelling - timeoutInMinutes: "55" + timeoutInMinutes: "65" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" From 02d89585aa7b77e886ca3c69da3916b2bd711c79 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 10 May 2022 12:44:43 +0900 Subject: [PATCH 16/20] 
Revert "Don't use amp with fsdp native" This reverts commit 3f7df1bc9dacc529f878d6bb15399efbd1df4cf2. --- tests/strategies/test_ddp_fully_sharded_native.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/strategies/test_ddp_fully_sharded_native.py b/tests/strategies/test_ddp_fully_sharded_native.py index ddf6fecb99016..cf4973e5ae035 100644 --- a/tests/strategies/test_ddp_fully_sharded_native.py +++ b/tests/strategies/test_ddp_fully_sharded_native.py @@ -112,6 +112,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): accelerator="gpu", devices=2, strategy="fsdp_native", + precision=16, max_epochs=1, sync_batchnorm=True, ) @@ -124,11 +125,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir): model = TestFSDPModel() trainer = Trainer( - default_root_dir=tmpdir, - accelerator="gpu", - devices=1, - strategy="fsdp_native", - max_epochs=1, + default_root_dir=tmpdir, accelerator="gpu", devices=1, strategy="fsdp_native", precision=16, max_epochs=1 ) _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) @@ -144,6 +141,7 @@ def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir): accelerator="gpu", devices=2, strategy="fsdp_native", + precision=16, max_epochs=1, callbacks=[ck], ) From 2bef593cac85fb217146e21c99b9dd04394c41fa Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 10 May 2022 12:48:48 +0900 Subject: [PATCH 17/20] Don't test fsdp_native with PyTorch1.11 --- pytorch_lightning/utilities/imports.py | 2 ++ tests/strategies/test_ddp_fully_sharded_native.py | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index f2f73a7d89c89..4a2085b8b44fc 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -94,6 +94,8 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _TORCH_GREATER_EQUAL_1_10 = 
_compare_version("torch", operator.ge, "1.10.0") _TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2") _TORCH_GREATER_EQUAL_1_11 = _compare_version("torch", operator.ge, "1.11.0") +# todo: remove "dev" when PyTorch 1.12 is released +_TORCH_GREATER_EQUAL_1_12 = _compare_version("torch", operator.ge, "1.12.0dev") _APEX_AVAILABLE = _module_available("apex.amp") _BAGUA_AVAILABLE = _package_available("bagua") diff --git a/tests/strategies/test_ddp_fully_sharded_native.py b/tests/strategies/test_ddp_fully_sharded_native.py index cf4973e5ae035..934e55af14fc3 100644 --- a/tests/strategies/test_ddp_fully_sharded_native.py +++ b/tests/strategies/test_ddp_fully_sharded_native.py @@ -9,16 +9,16 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf -if _TORCH_GREATER_EQUAL_1_11: +if _TORCH_GREATER_EQUAL_1_12: from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel from torch.distributed.fsdp.wrap import wrap -@RunIf(min_torch="1.11") +@RunIf(min_torch="1.12dev") def test_invalid_on_cpu(tmpdir): """Test to ensure that to raise Misconfiguration for Native FSDP on CPU.""" with pytest.raises( @@ -34,7 +34,7 @@ def test_invalid_on_cpu(tmpdir): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("torch.cuda.is_available", return_value=True) -@RunIf(min_torch="1.11") +@RunIf(min_torch="1.12dev") def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): """Test to ensure that plugin native amp plugin raises Misconfiguration error.""" with 
pytest.raises( @@ -102,7 +102,7 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer.module[2].reshard_after_forward is True -@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11") +@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev") def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run.""" @@ -119,7 +119,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.11") +@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev") def test_fully_sharded_native_strategy_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" @@ -130,7 +130,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11") +@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev") def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" From c51480576c9dee177ca2147be353954f8b048353 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 10 May 2022 15:18:03 +0200 Subject: [PATCH 18/20] use_base_version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pytorch_lightning/utilities/imports.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 4a2085b8b44fc..f648c83ca0ecc 100644 --- a/pytorch_lightning/utilities/imports.py +++ 
b/pytorch_lightning/utilities/imports.py @@ -94,8 +94,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") _TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2") _TORCH_GREATER_EQUAL_1_11 = _compare_version("torch", operator.ge, "1.11.0") -# todo: remove "dev" when PyTorch 1.12 is released -_TORCH_GREATER_EQUAL_1_12 = _compare_version("torch", operator.ge, "1.12.0dev") +_TORCH_GREATER_EQUAL_1_12 = _compare_version("torch", operator.ge, "1.12.0", use_base_version=True) _APEX_AVAILABLE = _module_available("apex.amp") _BAGUA_AVAILABLE = _package_available("bagua") From e34787cbe995509716b82696d98bb1ef782ab80a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 11 May 2022 19:16:10 +0900 Subject: [PATCH 19/20] Don't set materialized child to child's child --- pytorch_lightning/utilities/meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index d14f111e8759a..a5edcfb300188 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -186,7 +186,7 @@ def materialize_module(root_module: nn.Module) -> nn.Module: if not materialize_fn or isinstance(child, (Sequential, ModuleList, ModuleDict)): materialize_module(child) else: - setattr(child, name, materialize_fn()) + setattr(root_module, name, materialize_fn()) return root_module From deba69f8283afb11e429d3fc87b3a9ae8d73e71c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 12 May 2022 13:46:38 +0200 Subject: [PATCH 20/20] Apply suggestions from code review --- tests/standalone_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/standalone_tests.sh b/tests/standalone_tests.sh index 2cb39aafa184d..7b7dd361ab0b1 100755 --- a/tests/standalone_tests.sh +++ b/tests/standalone_tests.sh @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# set -e # FIXME +set -e # this environment variable allows special tests to run export PL_RUN_STANDALONE_TESTS=1