From 695a01e654213292bb04a49a6dd108c67ed081a0 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Wed, 10 Aug 2022 00:31:57 +0200
Subject: [PATCH 01/30] update version and changelog for 1.7.2 release

---
 src/pytorch_lightning/CHANGELOG.md   | 7 +++++++
 src/pytorch_lightning/__version__.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 835838342e610..3aee9a5ce4a90 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.

 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

+## [1.7.2] - 2022-08-16
+
+### Fixed
+
+-
+
+
 ## [1.7.1] - 2022-08-09

 ### Fixed

diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py
index 116d5667841f3..2196826f840ed 100644
--- a/src/pytorch_lightning/__version__.py
+++ b/src/pytorch_lightning/__version__.py
@@ -1 +1 @@
-version = "1.7.1"
+version = "1.7.2"

From f7a189fb20955effc309d11891d94756dfee6ea7 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Wed, 10 Aug 2022 00:32:37 +0200
Subject: [PATCH 02/30] Reset all results on epoch end (#14061)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 src/pytorch_lightning/CHANGELOG.md            |  2 +-
 .../logger_connector/logger_connector.py      |  3 +-
 .../logging_/test_train_loop_logging.py       | 29 +++++++++++++++++--
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 3aee9a5ce4a90..8ff6b04e355db 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Fixed

--
+- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))


 ## [1.7.1] - 2022-08-09

 ### Fixed

diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index ff882912625d0..02e17a8d93494 100644
--- a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -163,8 +163,7 @@ def update_train_epoch_metrics(self) -> None:
         self.log_metrics(self.metrics["log"])

         # reset result collection for next epoch
-        assert self.trainer._results is not None
-        self.trainer._results.reset(metrics=True)
+        self.reset_results()

     """
     Utilities and properties
     """

diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
index 5855eba4c86af..d16be306b9365 100644
--- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
+++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
@@ -569,11 +569,12 @@ def on_train_epoch_end(self, trainer, pl_module):
     "accelerator",
     [
         pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)),
+        "cpu",
     ],
 )
 def test_metric_are_properly_reduced(tmpdir, accelerator):
     class TestingModel(BoringModel):
-        def __init__(self, *args, **kwargs) -> None:
+        def __init__(self) -> None:
             super().__init__()
             self.val_acc = Accuracy()
@@ -592,7 +593,6 @@ def validation_step(self, batch, batch_idx):
         return super().validation_step(batch, batch_idx)

     early_stop = EarlyStopping(monitor="val_acc", mode="max")
-    checkpoint = ModelCheckpoint(monitor="val_acc", save_last=True, save_top_k=2, mode="max")

     model = TestingModel()
@@ -812,3 +812,28 @@ def training_step(self, batch, batch_idx):
             call(metrics={"foo_epoch": 0.0, "epoch": 1}, step=3),
         ]
     )
+
+
+@mock.patch("pytorch_lightning.loggers.TensorBoardLogger.log_metrics")
+def test_log_on_train_start(mock_log_metrics, tmpdir):
+    """Tests that metrics logged in ``on_train_start`` get reset after the first epoch."""
+
+    class MyModel(BoringModel):
+        def on_train_start(self):
+            self.log("foo", 123)
+
+    model = MyModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        max_epochs=2,
+        log_every_n_steps=1,
+        enable_model_summary=False,
+        enable_checkpointing=False,
+        enable_progress_bar=False,
+    )
+    trainer.fit(model)
+
+    assert mock_log_metrics.mock_calls == [call(metrics={"foo": 123.0, "epoch": 0}, step=0)]
+    assert trainer.max_epochs > 1
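[Editor's sketch, not part of the patch] A minimal user-level illustration of the behaviour this commit restores, mirroring the new test: a value logged in `on_train_start` is emitted once at step 0 and then cleared by `reset_results()`, instead of being re-emitted at every epoch end. Assumes a release that includes this fix:

    import pytorch_lightning as pl
    from pytorch_lightning.demos.boring_classes import BoringModel

    class OnceLogger(BoringModel):
        def on_train_start(self):
            # Emitted once; the result collection is reset at each epoch end.
            self.log("foo", 123)

    trainer = pl.Trainer(
        max_epochs=2,
        limit_train_batches=1,
        limit_val_batches=0,
        enable_checkpointing=False,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(OnceLogger())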
From afe40c0c551c410df8ae20cb089d1efca71c25a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 10 Aug 2022 00:54:10 +0200
Subject: [PATCH 03/30] Skip ddp fork tests on windows (#14121)

---
 .../strategies/test_ddp_spawn_strategy.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
index f485060833320..7fb22206c45c6 100644
--- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
+++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
@@ -184,11 +184,17 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group):
     "strategy_name,expected_ddp_kwargs",
     [
         ("ddp_spawn", {}),
-        ("ddp_fork", {}),
-        ("ddp_notebook", {}),
+        pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)),
+        pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)),
         ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}),
-        ("ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}),
-        ("ddp_notebook_find_unused_parameters_false", {"find_unused_parameters": False}),
+        pytest.param(
+            "ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True)
+        ),
+        pytest.param(
+            "ddp_notebook_find_unused_parameters_false",
+            {"find_unused_parameters": False},
+            marks=RunIf(skip_windows=True),
+        ),
     ],
 )
 def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):

From c76c381dd67f10f30ba7374b6d6f671d692855e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 10 Aug 2022 11:23:20 +0200
Subject: [PATCH 04/30] Fix device placement when `.cuda()` called without
 specifying index (#14128)

---
 src/pytorch_lightning/CHANGELOG.md           |  3 +++
 .../core/mixins/device_dtype_mixin.py        | 10 ++++----
 .../utilities/test_dtype_device_mixin.py     | 24 ++++++++++++++++++-
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 8ff6b04e355db..089ea9ec2ec15 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -25,6 +25,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095))

+- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128))
+
+
 ## [1.7.0] - 2022-08-02

 ### Added

diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py
index b12e1cf042a1f..98fd9c7074c28 100644
--- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py
+++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py
@@ -118,14 +118,16 @@ def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self:  # ty
         while being optimized.

         Arguments:
-            device: if specified, all parameters will be
-                copied to that device
+            device: If specified, all parameters will be copied to that device. If `None`, the current CUDA device
+                index will be used.
         Returns:
             Module: self
         """
-        if device is None or isinstance(device, int):
-            device = torch.device("cuda", index=(device or 0))
+        if device is None:
+            device = torch.device("cuda", torch.cuda.current_device())
+        elif isinstance(device, int):
+            device = torch.device("cuda", index=device)
         self.__update_properties(device=device)
         return super().cuda(device=device)

diff --git a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py
index 38f72b555d52d..7c17b3d9f7642 100644
--- a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py
+++ b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py
@@ -113,7 +113,7 @@ def test_submodules_multi_gpu_ddp_spawn(tmpdir):
     ],
 )
 @RunIf(min_cuda_gpus=1)
-def test_gpu_cuda_device(device):
+def test_cuda_device(device):
     model = TopModule()
     model.cuda(device)

@@ -122,3 +122,25 @@ def test_cuda_device(device):
     assert device.type == "cuda"
     assert device.index is not None
     assert device.index == torch.cuda.current_device()
+
+
+@RunIf(min_cuda_gpus=2)
+def test_cuda_current_device():
+    """Test that calling .cuda() moves the model to the correct device and respects current cuda device setting."""
+
+    class CudaModule(DeviceDtypeModuleMixin):
+        def __init__(self):
+            super().__init__()
+            self.layer = nn.Linear(1, 1)
+
+    model = CudaModule()
+
+    torch.cuda.set_device(0)
+    model.cuda(1)
+    assert model.device == torch.device("cuda", 1)
+    assert model.layer.weight.device == torch.device("cuda", 1)
+
+    torch.cuda.set_device(1)
+    model.cuda()  # model is already on device 1, and calling .cuda() without device index should not move model
+    assert model.device == torch.device("cuda", 1)
+    assert model.layer.weight.device == torch.device("cuda", 1)
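[Editor's sketch, not part of the patch] The device-resolution rule introduced above, isolated for clarity. `resolve_cuda_device` is a hypothetical helper that mirrors the patched `DeviceDtypeModuleMixin.cuda()` branch; running the assertions assumes at least two visible CUDA devices:

    import torch

    def resolve_cuda_device(device=None):
        # Mirrors the patched logic: `None` now resolves to the *current*
        # CUDA device instead of always falling back to index 0.
        if device is None:
            return torch.device("cuda", torch.cuda.current_device())
        if isinstance(device, int):
            return torch.device("cuda", index=device)
        return device

    torch.cuda.set_device(1)
    assert resolve_cuda_device() == torch.device("cuda", 1)   # previously resolved to cuda:0
    assert resolve_cuda_device(0) == torch.device("cuda", 0)  # explicit index is unchanged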
From e385ea49166c1934995d712d1e4327721a0de12f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 10 Aug 2022 23:15:12 +0200
Subject: [PATCH 05/30] Convert subprocess test to standalone test (#14101)

---
 tests/tests_pytorch/run_standalone_tasks.sh  | 10 ++-
 tests/tests_pytorch/serve/__init__.py        |  0
 tests/tests_pytorch/strategies/ddp_model.py  | 58 ----------------
 .../strategies/scripts/__init__.py           |  0
 .../strategies/scripts/cli_script.py         | 24 +++++++
 tests/tests_pytorch/strategies/test_ddp.py   | 67 +++++++-------
 tests/tests_pytorch/utilities/distributed.py | 45 -------------
 7 files changed, 55 insertions(+), 149 deletions(-)
 create mode 100644 tests/tests_pytorch/serve/__init__.py
 delete mode 100644 tests/tests_pytorch/strategies/ddp_model.py
 create mode 100644 tests/tests_pytorch/strategies/scripts/__init__.py
 create mode 100644 tests/tests_pytorch/strategies/scripts/cli_script.py
 delete mode 100644 tests/tests_pytorch/utilities/distributed.py

diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh
index 960bd867ceaa4..698ed7863ab96 100644
--- a/tests/tests_pytorch/run_standalone_tasks.sh
+++ b/tests/tests_pytorch/run_standalone_tasks.sh
@@ -34,6 +34,10 @@ fi
 # test that a user can manually launch individual processes
 echo "Running manual ddp launch test"
 export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
-MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} &
-MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args}
+args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
+MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} &
+MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args}
+
+# test that ddp can be launched as a module (-m option)
+echo "Running ddp example as module"
+python -m strategies.scripts.cli_script ${args}

diff --git a/tests/tests_pytorch/serve/__init__.py b/tests/tests_pytorch/serve/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d

diff --git a/tests/tests_pytorch/strategies/ddp_model.py b/tests/tests_pytorch/strategies/ddp_model.py
deleted file mode 100644
index 76d1f3f2f6866..0000000000000
--- a/tests/tests_pytorch/strategies/ddp_model.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright The PyTorch Lightning team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Runs either `.fit()` or `.test()` on a single node across multiple gpus."""
-import os
-from argparse import ArgumentParser
-
-import torch
-
-from pytorch_lightning import seed_everything, Trainer
-from tests_pytorch.helpers.datamodules import ClassifDataModule
-from tests_pytorch.helpers.simple_models import ClassificationModel
-
-
-def main():
-    seed_everything(4321)
-
-    parser = ArgumentParser(add_help=False)
-    parser = Trainer.add_argparse_args(parser)
-    parser.add_argument("--trainer_method", default="fit")
-    parser.add_argument("--tmpdir")
-    parser.add_argument("--workdir")
-    parser.set_defaults(accelerator="gpu", devices=2)
-    parser.set_defaults(strategy="ddp")
-    args = parser.parse_args()
-
-    dm = ClassifDataModule()
-    model = ClassificationModel()
-    trainer = Trainer.from_argparse_args(args)
-
-    if args.trainer_method == "fit":
-        trainer.fit(model, datamodule=dm)
-        result = None
-    elif args.trainer_method == "test":
-        result = trainer.test(model, datamodule=dm)
-    elif args.trainer_method == "fit_test":
-        trainer.fit(model, datamodule=dm)
-        result = trainer.test(model, datamodule=dm)
-    else:
-        raise ValueError(f"Unsupported: {args.trainer_method}")
-
-    result_ext = {"status": "complete", "method": args.trainer_method, "result": result}
-    file_path = os.path.join(args.tmpdir, "ddp.result")
-    torch.save(result_ext, file_path)
-
-
-if __name__ == "__main__":
-    main()

diff --git a/tests/tests_pytorch/strategies/scripts/__init__.py b/tests/tests_pytorch/strategies/scripts/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d

diff --git a/tests/tests_pytorch/strategies/scripts/cli_script.py b/tests/tests_pytorch/strategies/scripts/cli_script.py
new file mode 100644
index 0000000000000..17f0d29392eb9
--- /dev/null
+++ b/tests/tests_pytorch/strategies/scripts/cli_script.py
@@ -0,0 +1,24 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A trivial script that wraps a LightningCLI around the BoringModel and BoringDataModule."""
+from pytorch_lightning.cli import LightningCLI
+from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel
+
+if __name__ == "__main__":
+    LightningCLI(
+        BoringModel,
+        BoringDataModule,
+        seed_everything_default=42,
+        save_config_overwrite=True,
+    )

diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py
index 1a2a0475e7ed6..9b196f3e2a97f 100644
--- a/tests/tests_pytorch/strategies/test_ddp.py
+++ b/tests/tests_pytorch/strategies/test_ddp.py
@@ -21,60 +21,41 @@
 from torch.nn.parallel.distributed import DistributedDataParallel

 import pytorch_lightning as pl
-from pytorch_lightning import Trainer
+from pytorch_lightning import seed_everything, Trainer
 from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.demos.boring_classes import BoringModel
 from pytorch_lightning.strategies import DDPStrategy
+from tests_pytorch.helpers.datamodules import ClassifDataModule
 from tests_pytorch.helpers.runif import RunIf
-from tests_pytorch.strategies import ddp_model
-from tests_pytorch.utilities.distributed import call_training_script
+from tests_pytorch.helpers.simple_models import ClassificationModel

-CLI_ARGS = "--max_epochs 1 --accelerator gpu --devices 2 --strategy ddp"

+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_fit_only(tmpdir):
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+    trainer.fit(model, datamodule=dm)

-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_fit_only(tmpdir, as_module):
-    # call the script
-    call_training_script(ddp_model, CLI_ARGS, "fit", tmpdir, timeout=120, as_module=as_module)

-    # load the results of the script
-    result_path = os.path.join(tmpdir, "ddp.result")
-    result = torch.load(result_path)
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_test_only(tmpdir):
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+    trainer.test(model, datamodule=dm)

-    # verify the file wrote the expected outputs
-    assert result["status"] == "complete"

+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_fit_test(tmpdir):
+    seed_everything(4321)
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+    trainer.fit(model, datamodule=dm)
+    result = trainer.test(model, datamodule=dm)

-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_test_only(tmpdir, as_module):
-    # call the script
-    call_training_script(ddp_model, CLI_ARGS, "test", tmpdir, as_module=as_module)
-
-    # load the results of the script
-    result_path = os.path.join(tmpdir, "ddp.result")
-    result = torch.load(result_path)
-
-    # verify the file wrote the expected outputs
-    assert result["status"] == "complete"
-
-
-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module):
-    # call the script
-    call_training_script(ddp_model, CLI_ARGS, "fit_test", tmpdir, timeout=20, as_module=as_module)
-
-    # load the results of the script
-    result_path = os.path.join(tmpdir, "ddp.result")
-    result = torch.load(result_path)
-
-    # verify the file wrote the expected outputs
-    assert result["status"] == "complete"
-
-    model_outs = result["result"]
-    for out in model_outs:
+    for out in result:
         assert out["test_acc"] > 0.7

diff --git a/tests/tests_pytorch/utilities/distributed.py b/tests/tests_pytorch/utilities/distributed.py
deleted file mode 100644
index 38a50edcc7177..0000000000000
--- a/tests/tests_pytorch/utilities/distributed.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright The PyTorch Lightning team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import subprocess
-import sys
-from pathlib import Path
-from subprocess import TimeoutExpired
-
-import pytorch_lightning
-
-
-def call_training_script(module_file, cli_args, method, tmpdir, timeout=60, as_module=False):
-    file = Path(module_file.__file__).absolute()
-    cli_args = cli_args.split(" ") if cli_args else []
-    cli_args += ["--tmpdir", str(tmpdir)]
-    cli_args += ["--trainer_method", method]
-    file_args = ["-m", module_file.__spec__.name] if as_module else [str(file)]
-    command = [sys.executable] + file_args + cli_args
-
-    # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment
-    env = os.environ.copy()
-    env["PYTHONPATH"] = env.get("PYTHONPATH", "") + f"{pytorch_lightning.__file__}:"
-
-    # for running in ddp mode, we need to launch it's own process or pytest will get stuck
-    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
-    try:
-        std, err = p.communicate(timeout=timeout)
-        err = str(err.decode("utf-8"))
-        if "Exception" in err:
-            raise Exception(err)
-    except TimeoutExpired:
-        p.kill()
-        std, err = p.communicate()
-    return std, err

From 1b320a570684a3568dadeb3b4b929af6afe4cadf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 11 Aug 2022 01:32:32 +0200
Subject: [PATCH 06/30] Fix entry point test for Python 3.10 (#14154)

---
 .../trainer/connectors/test_callback_connector.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py
index d6d5018aa1dd0..02e846425a2a0 100644
--- a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py
@@ -30,7 +30,7 @@
 )
 from pytorch_lightning.demos.boring_classes import BoringModel
 from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
-from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0
+from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0


 def test_checkpoint_callbacks_are_last(tmpdir):
@@ -265,7 +265,10 @@ def _make_entry_point_query_mock(callback_factory):
     entry_point = Mock()
     entry_point.name = "mocked"
     entry_point.load.return_value = callback_factory
-    if _PYTHON_GREATER_EQUAL_3_8_0:
+    if _PYTHON_GREATER_EQUAL_3_10_0:
+        query_mock.return_value = [entry_point]
+        import_path = "importlib.metadata.entry_points"
+    elif _PYTHON_GREATER_EQUAL_3_8_0:
         query_mock().get.return_value = [entry_point]
         import_path = "importlib.metadata.entry_points"
     else:
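[Editor's sketch, not part of the patch] Background for the new branch: from Python 3.10, `importlib.metadata.entry_points` accepts a `group=` keyword and returns the matching entry points directly, whereas on 3.8/3.9 it returns a dict-like mapping to query with `.get()`. The group name below matches Lightning's callback-factory entry point, but treat it as illustrative:

    import sys

    GROUP = "pytorch_lightning.callbacks_factory"  # illustrative group name

    if sys.version_info >= (3, 10):
        from importlib.metadata import entry_points
        factories = entry_points(group=GROUP)         # returns the entry points directly
    elif sys.version_info >= (3, 8):
        from importlib.metadata import entry_points
        factories = entry_points().get(GROUP, [])     # returns a mapping keyed by group
    else:
        from pkg_resources import iter_entry_points   # pre-3.8 fallback
        factories = list(iter_entry_points(GROUP))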
From 552f4967e6dc0425710d3469399d4ba49df8e91e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 11 Aug 2022 09:33:19 +0200
Subject: [PATCH 07/30] Fix flaky test caused by weak reference (#14157)

---
 tests/tests_pytorch/trainer/connectors/test_data_connector.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py
index 52ef4c4db6d8d..2650e46b7fa60 100644
--- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py
@@ -445,7 +445,8 @@ def test_dataloader_source_direct_access():
 def test_dataloader_source_request_from_module():
     """Test requesting a dataloader from a module works."""
     module = BoringModel()
-    module.trainer = Trainer()
+    trainer = Trainer()
+    module.trainer = trainer
     module.foo = Mock(return_value=module.train_dataloader())

     source = _DataLoaderSource(module, "foo")

From a59ee10552b433864c5ab1aebc3b7a456d04375d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 11 Aug 2022 17:49:46 +0200
Subject: [PATCH 08/30] Fix saving hyperparameters in a composition where
 parent is not a LM or LDM (#14151)

Co-authored-by: Rohit Gupta
---
 src/pytorch_lightning/CHANGELOG.md         |  4 ++++
 src/pytorch_lightning/utilities/parsing.py | 17 ++++++++++++-----
 tests/tests_pytorch/models/test_hparams.py | 19 +++++++++++++++++++
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 089ea9ec2ec15..d6889662a9d49 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -11,6 +11,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))

+- Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151))
+
+
+
 ## [1.7.1] - 2022-08-09

 ### Fixed

diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py
index 9f5fe2d6b6841..b619c5cb698b0 100644
--- a/src/pytorch_lightning/utilities/parsing.py
+++ b/src/pytorch_lightning/utilities/parsing.py
@@ -160,7 +160,10 @@ def get_init_args(frame: types.FrameType) -> Dict[str, Any]:


 def collect_init_args(
-    frame: types.FrameType, path_args: List[Dict[str, Any]], inside: bool = False
+    frame: types.FrameType,
+    path_args: List[Dict[str, Any]],
+    inside: bool = False,
+    classes: Tuple[Type, ...] = (),
 ) -> List[Dict[str, Any]]:
     """Recursively collects the arguments passed to the child constructors in the inheritance tree.

@@ -168,6 +171,7 @@
         frame: the current stack frame
         path_args: a list of dictionaries containing the constructor args in all parent classes
         inside: track if we are inside inheritance path, avoid terminating too soon
+        classes: the classes in which to inspect the frames

     Return:
         A list of dictionaries where each dictionary contains the arguments passed to the
@@ -179,13 +183,13 @@
     if not isinstance(frame.f_back, types.FrameType):
         return path_args

-    if "__class__" in local_vars:
+    if "__class__" in local_vars and (not classes or issubclass(local_vars["__class__"], classes)):
         local_args = get_init_args(frame)
         # recursive update
         path_args.append(local_args)
-        return collect_init_args(frame.f_back, path_args, inside=True)
+        return collect_init_args(frame.f_back, path_args, inside=True, classes=classes)
     if not inside:
-        return collect_init_args(frame.f_back, path_args, inside)
+        return collect_init_args(frame.f_back, path_args, inside, classes=classes)
     return path_args

@@ -223,7 +227,10 @@
             init_args = {f.name: getattr(obj, f.name) for f in fields(obj)}
     else:
         init_args = {}
-        for local_args in collect_init_args(frame, []):
+
+        from pytorch_lightning.core.mixins import HyperparametersMixin
+
+        for local_args in collect_init_args(frame, [], classes=(HyperparametersMixin,)):
             init_args.update(local_args)

     if ignore is None:

diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py
index c064d0f8c055e..90d9d1eb0e902 100644
--- a/tests/tests_pytorch/models/test_hparams.py
+++ b/tests/tests_pytorch/models/test_hparams.py
@@ -29,6 +29,7 @@
 from pytorch_lightning import LightningModule, Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.core.datamodule import LightningDataModule
+from pytorch_lightning.core.mixins import HyperparametersMixin
 from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml
 from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset
 from pytorch_lightning.utilities import _HYDRA_EXPERIMENTAL_AVAILABLE, _OMEGACONF_AVAILABLE, AttributeDict, is_picklable
@@ -401,6 +402,24 @@ def _raw_checkpoint_path(trainer) -> str:
     return raw_checkpoint_path


+@pytest.mark.parametrize("base_class", (HyperparametersMixin, LightningModule, LightningDataModule))
+def test_save_hyperparameters_under_composition(base_class):
+    """Test that in a composition where the parent is not a Lightning-like module, the parent's arguments don't get
+    collected."""
+
+    class ChildInComposition(base_class):
+        def __init__(self, same_arg):
+            super().__init__()
+            self.save_hyperparameters()
+
+    class NotPLSubclass:  # intentionally not subclassing LightningModule/LightningDataModule
+        def __init__(self, same_arg="parent_default", other_arg="other"):
+            self.child = ChildInComposition(same_arg="cocofruit")
+
+    parent = NotPLSubclass()
+    assert parent.child.hparams == dict(same_arg="cocofruit")
+
+
 class LocalVariableModelSuperLast(BoringModel):
     """This model has the super().__init__() call at the end."""
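[Editor's sketch, not part of the patch] In user terms, the `classes=(HyperparametersMixin,)` filter means a stack frame only contributes constructor arguments when the class that owns it is a `HyperparametersMixin` subclass. Both Lightning base classes inherit the mixin, so their frames are still collected, while a plain composing class is now skipped:

    from pytorch_lightning import LightningDataModule, LightningModule
    from pytorch_lightning.core.mixins import HyperparametersMixin

    # The Lightning base classes subclass the mixin, so their __init__
    # frames still contribute arguments to save_hyperparameters() ...
    assert issubclass(LightningModule, HyperparametersMixin)
    assert issubclass(LightningDataModule, HyperparametersMixin)

    class PlainParent:  # ... while a plain parent's frame is now ignored
        pass

    assert not issubclass(PlainParent, HyperparametersMixin)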
From 9ffbe079d372c92e1e2b8005c687a2693fabba65 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Thu, 11 Aug 2022 22:37:06 +0200
Subject: [PATCH 09/30] Remove DeepSpeed version restriction from Lite (#13967)

---
 .azure/gpu-tests.yml                  |  2 +-
 requirements/pytorch/strategies.txt   |  2 +-
 src/pytorch_lightning/CHANGELOG.md    |  5 +++--
 src/pytorch_lightning/lite/lite.py    | 15 ---------------
 tests/tests_pytorch/lite/test_lite.py | 13 +------------
 5 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index f37c17613affc..7d9a02226899f 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -75,7 +75,7 @@ jobs:
           CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
           pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
           pip install -e .[strategies]
-          pip install deepspeed>0.6.4 # TODO: remove when docker images are upgraded
+          pip install -U deepspeed # TODO: remove when docker images are upgraded
           pip install --requirement requirements/pytorch/devel.txt
           pip list
         env:

diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 4e916fbc6c61f..c5fc92a67a837 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,7 +2,7 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

 fairscale>=0.4.5, <=0.4.6
-deepspeed>=0.6.0, <0.7.0
+deepspeed>=0.6.0, <=0.7.0
 # no need to install with [pytorch] as pytorch is already installed
 horovod>=0.21.2, !=0.24.0, <0.25.1
 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index d6889662a9d49..8c95200e02146 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -9,10 +9,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Fixed

 - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))
-
-
 - Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151))

+### Changed
+
+- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([#13967](https://github.com/Lightning-AI/lightning/pull/13967))

 ## [1.7.1] - 2022-08-09

diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py
index 5125bf4486a9d..981eed30635f6 100644
--- a/src/pytorch_lightning/lite/lite.py
+++ b/src/pytorch_lightning/lite/lite.py
@@ -40,7 +40,6 @@
     has_iterable_dataset,
 )
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _RequirementAvailable
 from pytorch_lightning.utilities.seed import seed_everything


@@ -106,8 +105,6 @@ def __init__(
         self._precision_plugin = self._strategy.precision_plugin
         self._models_setup: int = 0

-        self._check_deepspeed_support()
-
         # wrap the run method so we can inject setup logic or spawn processes for the user
         setattr(self, "run", partial(self._run_impl, self.run))

@@ -459,18 +456,6 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N
                 f" Choose one of {supported} or pass in a `Strategy` instance."
             )

-    def _check_deepspeed_support(self) -> None:
-        if (
-            isinstance(self._strategy, DeepSpeedStrategy)
-            and self._strategy.zero_stage_3
-            and _RequirementAvailable("deepspeed>=0.6.5")
-        ):
-            # https://github.com/microsoft/DeepSpeed/issues/2139
-            raise RuntimeError(
-                "DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite and `deepspeed>=0.6.5`."
-                " Please downgrade deepspeed to 0.6.4 or check if a newer version of Lightning is available."
-            )
-
     @staticmethod
     def _supported_device_types() -> Sequence[_AcceleratorType]:
         return (

diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py
index 2215ab3129780..86a0a5a82195a 100644
--- a/tests/tests_pytorch/lite/test_lite.py
+++ b/tests/tests_pytorch/lite/test_lite.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import contextlib
 import os
 from copy import deepcopy
 from unittest import mock
@@ -30,7 +29,6 @@
 from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy
 from pytorch_lightning.utilities import _StrategyType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _RequirementAvailable
 from pytorch_lightning.utilities.seed import pl_worker_init_function
 from tests_pytorch.helpers.runif import RunIf

@@ -480,13 +478,4 @@ def run(self):
             assert self.broadcast(True)
             assert self.is_global_zero == (self.local_rank == 0)

-    if _RequirementAvailable("deepspeed>=0.6.5"):
-        # https://github.com/microsoft/DeepSpeed/issues/2139
-        raise_if_deepspeed_incompatible = pytest.raises(
-            RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite"
-        )
-    else:
-        raise_if_deepspeed_incompatible = contextlib.suppress()
-
-    with raise_if_deepspeed_incompatible:
-        Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()
+    Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()

From c6f2c368a2ef3eacd1ef85d8e89ffad16dd44601 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 11 Aug 2022 18:55:01 +0200
Subject: [PATCH 10/30] Configure the check-group app (#14165)

Co-authored-by: Jirka
---
 .github/checkgroup.yml | 165 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 165 insertions(+)
 create mode 100644 .github/checkgroup.yml

diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
new file mode 100644
index 0000000000000..8f1d3c6fb5e86
--- /dev/null
+++ b/.github/checkgroup.yml
@@ -0,0 +1,165 @@
+custom_service_name: "Lightning CI required checker"
+subprojects:
+  - id: "CI: CircleCI"
+    paths:
+      - ".circleci/**"
+    checks:
+      - "test-on-tpus"
+
+  - id: "CI: Azure"
+    paths:
+      - ".azure/**"
+    checks:
+      - "pytorch-lightning (GPUs)"
+      - "pytorch-lightning (GPUs) (testing PyTorch - stable)"
+      - "pytorch-lightning (HPUs)"
+      - "pytorch-lightning (IPUs)"
+
+  - id: "pytorch_lightning"
+    paths:
+      # all examples don't need to be added because they aren't used in CI, but these are
+      - "examples/run_ddp_examples.sh"
+      - "examples/convert_from_pt_to_pl/**"
+      - "examples/run_pl_examples.sh"
+      - "examples/pl_basics/backbone_image_classifier.py"
+      - "examples/pl_basics/autoencoder.py"
+      - "examples/pl_loops/mnist_lite.py"
+      - "examples/pl_fault_tolerant/automatic.py"
+      - "examples/test_pl_examples.py"
+      - "examples/pl_integrations/dali_image_classifier.py"
+      - "requirements/pytorch/**"
+      - "src/pytorch_lightning/**"
+      - "tests/tests_pytorch/**"
+      - "setup.cfg" # includes pytest config
+      - ".github/workflows/ci-pytorch*.yml"
+      - ".github/workflows/docs-*.yml"
+    checks:
+      - "conda (3.8, 1.10)"
+      - "conda (3.8, 1.9)"
+      - "conda (3.9, 1.11)"
+      - "conda (3.9, 1.12)"
+      - "cpu (macOS-11, 3.10, latest, stable)"
+      - "cpu (macOS-11, 3.7, latest, stable)"
+      - "cpu (macOS-11, 3.7, oldest, stable)"
+      - "cpu (ubuntu-20.04, 3.10, latest, stable)"
+      - "cpu (ubuntu-20.04, 3.7, latest, stable)"
+      - "cpu (ubuntu-20.04, 3.7, oldest, stable)"
+      - "cpu (windows-2022, 3.10, latest, stable)"
+      - "cpu (windows-2022, 3.7, latest, stable)"
+      - "cpu (windows-2022, 3.7, oldest, stable)"
+      - "doctest (pytorch)"
+      - "make-docs (pytorch)"
+      - "mypy"
+      - "PR Gatekeeper (pytorch)"
+      - "pytorch-lightning (GPUs)"
+      - "pytorch-lightning (GPUs) (testing PyTorch - stable)"
+      - "pytorch-lightning (HPUs)"
+      - "pytorch-lightning (IPUs)"
+      - "slow (macOS-11, 3.7, 1.11)"
+      - "slow (ubuntu-20.04, 3.7, 1.11)"
+      - "slow (windows-2022, 3.7, 1.11)"
+      - "test-on-tpus"
+
+  - id: "pytorch_lightning: Docs"
+    paths:
+      - "docs/source-pytorch/**"
+      - ".github/workflows/docs-*.yml"
+      - "requirements/pytorch/**"
+    checks:
+      - "doctest (pytorch)"
+      - "make-docs (pytorch)"
+
+  - id: "pytorch_lightning: Docker"
+    paths:
+      - "dockers/**"
+    checks:
+      - "build-conda (3.8, 1.10)"
+      - "build-conda (3.8, 1.9)"
+      - "build-conda (3.9, 1.11)"
+      - "build-conda (3.9, 1.12)"
+      - "build-cuda (3.8, 1.9, 11.1.1)"
+      - "build-cuda (3.9, 1.10, 11.3.1)"
+      - "build-cuda (3.9, 1.11, 11.3.1)"
+      - "build-cuda (3.9, 1.12, 11.3.1)"
+      - "build-cuda (3.9, 1.9, 11.1.1)"
+      - "build-hpu (1.5.0, 1.11.0)"
+      - "build-ipu (3.9, 1.9)"
+      - "build-NGC"
+      - "build-pl (3.9, 1.10, 11.3.1)"
+      - "build-pl (3.9, 1.11, 11.3.1)"
+      - "build-pl (3.9, 1.12, 11.3.1)"
+      - "build-pl (3.9, 1.9, 11.1.1)"
+      - "build-xla (3.7, 1.12)"
+
+  - id: "pytorch_lightning: mypy"
+    paths:
+      - ".github/workflows/code-checks.yml"
+      - "pyproject.toml" # includes mypy config
+    checks:
+      - "mypy"
+
+  - id: "lightning_app"
+    paths:
+      - ".github/workflows/ci-app*.yml"
+      - "examples/app_**"
+      - "requirements/app/**"
+      - "src/lightning_app/**"
+      - "tests/tests_app/**"
+      - "tests/tests_app_examples/**"
+      - "tests/tests_clusters/**"
+      # the examples are used in the app CI
+      - "examples/app_*"
+    checks:
+      - "Cloud Test (boring_app)"
+      - "Cloud Test (collect_failures)"
+      - "Cloud Test (commands_and_api)"
+      - "Cloud Test (custom_work_dependencies)"
+      - "Cloud Test (drive)"
+      - "Cloud Test (idle_timeout)"
+      - "Cloud Test (payload)"
+      - "Cloud Test (template_jupyterlab)"
+      - "Cloud Test (template_react_ui)"
+      - "Cloud Test (template_streamlit_ui)"
+      - "Cloud Test (v0_app)"
+      - "doctest (app)"
+      - "make-docs (app)"
+      - "pytest (macOS-11, 3.8, latest)"
+      - "pytest (macOS-11, 3.8, oldest)"
+      - "pytest (ubuntu-20.04, 3.8, latest)"
+      - "pytest (ubuntu-20.04, 3.8, oldest)"
+      - "pytest (windows-2022, 3.8, latest)"
+      - "pytest (windows-2022, 3.8, oldest)"
+
+  - id: "lightning_app: Docs"
+    paths:
+      - "docs/source-app/**"
+      - ".github/workflows/docs-*.yml"
+      - "requirements/app/**"
+    checks:
+      - "doctest (app)"
+      - "make-docs (app)"
+
+  - id: "install"
+    paths:
+      - ".actions/setup_tools.py"
+      - ".github/workflows/ci-pkg-install.yml"
+      - "setup.py"
+      - "src/lightning/**"
+      # all __about__, __version__, __setup__
+      - "src/*/__*.py"
+    checks:
+      - "install-meta-pypi (macOS-11, 3.8)"
+      - "install-meta-pypi (ubuntu-20.04, 3.8)"
+      - "install-meta-pypi (windows-2022, 3.8)"
+      - "install-meta-src (macOS-11, 3.8)"
+      - "install-meta-src (macOS-11, lightning, 3.8)"
+      - "install-meta-src (ubuntu-20.04, 3.8)"
+      - "install-meta-src (ubuntu-20.04, lightning, 3.8)"
+      - "install-meta-src (windows-2022, 3.8)"
+      - "install-meta-src (windows-2022, lightning, 3.8)"
+      - "install-standalone (macOS-11, app, 3.8)"
+      - "install-standalone (macOS-11, pytorch, 3.8)"
+      - "install-standalone (ubuntu-20.04, app, 3.8)"
+      - "install-standalone (ubuntu-20.04, pytorch, 3.8)"
+      - "install-standalone (windows-2022, app, 3.8)"
+      - "install-standalone (windows-2022, pytorch, 3.8)"

From 8cb0098709239d7f843544a7226f52baf3a3cc96 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 10 Aug 2022 14:27:35 -0400
Subject: [PATCH 11/30] Update onnxruntime requirement from <=1.12.0 to
 <1.13.0 in /requirements (#14083)

Updates the requirements on [onnxruntime](https://github.com/microsoft/onnxruntime) to permit the latest version.
- [Release notes](https://github.com/microsoft/onnxruntime/releases)
- [Changelog](https://github.com/microsoft/onnxruntime/blob/master/docs/ReleaseManagement.md)
- [Commits](https://github.com/microsoft/onnxruntime/compare/v0.1.4...v1.12.1)

---
updated-dependencies:
- dependency-name: onnxruntime
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements/pytorch/test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt
index c155400a3d35f..f8bd5793a0af6 100644
--- a/requirements/pytorch/test.txt
+++ b/requirements/pytorch/test.txt
@@ -10,7 +10,7 @@ mypy==0.971
 # needed in tests
 cloudpickle>=1.3, <=2.1.0
 scikit-learn>0.22.1, <=1.1.1
-onnxruntime<=1.12.0
+onnxruntime<1.13.0
 psutil<=5.9.1 # for `DeviceStatsMonitor`
 pandas>1.0, <=1.4.3 # needed in benchmarks
 fastapi<=0.79.0

From fbe63d2c5f8edd9475fe8e4bcd4d765175ebd985 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 10 Aug 2022 19:40:34 +0200
Subject: [PATCH 12/30] Update gcsfs requirement from <2022.6.0,>=2021.5.0 to
 >=2021.5.0,<2022.8.0 in /requirements (#14079)

Update gcsfs requirement in /requirements

Updates the requirements on [gcsfs](https://github.com/fsspec/gcsfs) to permit the latest version.
- [Release notes](https://github.com/fsspec/gcsfs/releases)
- [Commits](https://github.com/fsspec/gcsfs/compare/2021.05.0...2022.7.1)

---
updated-dependencies:
- dependency-name: gcsfs
  dependency-type: direct:production
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements/pytorch/extra.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt
index c386c5581cc42..20b6c1b8dbc12 100644
--- a/requirements/pytorch/extra.txt
+++ b/requirements/pytorch/extra.txt
@@ -7,5 +7,5 @@ torchtext>=0.10.*, <0.14.0
 omegaconf>=2.0.5, <2.3.0
 hydra-core>=1.0.5, <1.3.0
 jsonargparse[signatures]>=4.12.0, <=4.12.0
-gcsfs>=2021.5.0, <2022.6.0
+gcsfs>=2021.5.0, <2022.8.0
 rich>=10.14.0, !=10.15.0.a, <13.0.0

From fcf9c5c48e869fd7d9b7b866ef4f0520b2a8a838 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Mon, 15 Aug 2022 18:48:55 +0200
Subject: [PATCH 13/30] Fix a bug that caused spurious `AttributeError` when
 multiple `DataLoader` classes are imported (#14117)

fix
---
 src/pytorch_lightning/CHANGELOG.md         |  3 +++
 src/pytorch_lightning/utilities/data.py    | 10 +++++----
 src/pytorch_lightning/utilities/imports.py |  1 +
 tests/tests_pytorch/utilities/test_data.py | 25 ++++++++++++++++++++++
 4 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 8c95200e02146..4139dc469dbd8 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

+- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117))
+
+
 - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))

 - Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151))

diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py
index 862c7f2de905b..f2d3040125141 100644
--- a/src/pytorch_lightning/utilities/data.py
+++ b/src/pytorch_lightning/utilities/data.py
@@ -514,15 +514,17 @@ def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = Non

     It patches the ``__init__`` method.
""" classes = _get_all_subclasses(base_cls) | {base_cls} - wrapped = set() for cls in classes: - if cls.__init__ not in wrapped: + # Check that __init__ belongs to the class + # https://stackoverflow.com/a/5253424 + if "__init__" in cls.__dict__: cls._old_init = cls.__init__ cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) - wrapped.add(cls.__init__) yield for cls in classes: - if hasattr(cls, "_old_init"): + # Check that _old_init belongs to the class + # https://stackoverflow.com/a/5253424 + if "_old_init" in cls.__dict__: cls.__init__ = cls._old_init del cls._old_init diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index 7784741ca87c1..96dd62982439a 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -124,6 +124,7 @@ def __repr__(self) -> str: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) +_PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) _TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1") _TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") _TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2") diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index ffb898efaa815..5b0087a245924 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -1,3 +1,4 @@ +import random from dataclasses import dataclass import pytest @@ -173,6 +174,30 @@ def __init__(self, randomize, *args, **kwargs): assert isinstance(new_dataloader, GoodImpl) +def test_replace_init_method_multiple_loaders_without_init(): + """In case of a class, that inherits from a class that we are patching, but doesn't define its own `__init__` + method (the one we are wrapping), it can happen, that `hasattr(cls, "_old_init")` is True because of parent + class, but it is impossible to delete, because that method is owned by parent class. Furthermore, the error + occured only sometimes because it depends on the order in which we are iterating over a set of classes we are + patching. + + This test simulates the behavior by generating sufficient number of dummy classes, which do not define `__init__` + and are children of `DataLoader`. We are testing that a) context manager `_replace_init_method` exits cleanly, and + b) the mechanism checking for presence of `_old_init` works as expected. 
+ """ + classes = [DataLoader] + for i in range(100): + classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {})) + + with _replace_init_method(DataLoader, "dataset"): + for cls in classes[1:]: # First one is `DataLoader` + assert "_old_init" not in cls.__dict__ + assert hasattr(cls, "_old_init") + + assert "_old_init" in DataLoader.__dict__ + assert hasattr(DataLoader, "_old_init") + + class DataLoaderSubclass1(DataLoader): def __init__(self, attribute1, *args, **kwargs): self.at1 = attribute1 From 803444bdefc8c18bca59b272c19e99b0602ff155 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 11:35:41 +0900 Subject: [PATCH 14/30] CI: Replace `_` of in GHA workflow filenames with `-` (#13917) * Rename workflow files * Update docs * Fix azure badges * Update the main readme * bad rebase * Update doc --- .actions/setup_tools.py | 5 ++- .github/workflows/README.md | 34 +++++++++---------- ...e2e_test.yml => ci-app-cloud-e2e-test.yml} | 0 ...i-app_examples.yml => ci-app-examples.yml} | 0 .../{ci-app_tests.yml => ci-app-tests.yml} | 0 ...{ci_pkg-install.yml => ci-pkg-install.yml} | 0 ...pr-gatekeeper.yml => ci-pr-gatekeeper.yml} | 0 ...st-conda.yml => ci-pytorch-test-conda.yml} | 0 ...test-full.yml => ci-pytorch-test-full.yml} | 0 ...test-slow.yml => ci-pytorch-test-slow.yml} | 0 .../{ci_schema.yml => ci-schema.yml} | 0 ...h_dockers.yml => cicd-pytorch-dockers.yml} | 0 README.md | 25 ++++++++------ src/pytorch_lightning/README.md | 26 +++++++------- 14 files changed, 46 insertions(+), 44 deletions(-) rename .github/workflows/{ci-app_cloud_e2e_test.yml => ci-app-cloud-e2e-test.yml} (100%) rename .github/workflows/{ci-app_examples.yml => ci-app-examples.yml} (100%) rename .github/workflows/{ci-app_tests.yml => ci-app-tests.yml} (100%) rename .github/workflows/{ci_pkg-install.yml => ci-pkg-install.yml} (100%) rename .github/workflows/{ci_pr-gatekeeper.yml => ci-pr-gatekeeper.yml} (100%) rename .github/workflows/{ci-pytorch_test-conda.yml => ci-pytorch-test-conda.yml} (100%) rename .github/workflows/{ci-pytorch_test-full.yml => ci-pytorch-test-full.yml} (100%) rename .github/workflows/{ci-pytorch_test-slow.yml => ci-pytorch-test-slow.yml} (100%) rename .github/workflows/{ci_schema.yml => ci-schema.yml} (100%) rename .github/workflows/{cicd-pytorch_dockers.yml => cicd-pytorch-dockers.yml} (100%) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 5088be2020738..a76e81246798c 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -94,11 +94,10 @@ def load_readme_description(path_dir: str, homepage: str, version: str) -> str: text = text.replace("pytorch-lightning.readthedocs.io/en/stable/", f"pytorch-lightning.readthedocs.io/en/{version}") # codecov badge text = text.replace("/branch/master/graph/badge.svg", f"/release/{version}/graph/badge.svg") - # replace github badges for release ones + # github actions badge text = text.replace("badge.svg?branch=master&event=push", f"badge.svg?tag={version}") - # Azure... 
From 803444bdefc8c18bca59b272c19e99b0602ff155 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Wed, 10 Aug 2022 11:35:41 +0900
Subject: [PATCH 14/30] CI: Replace `_` in GHA workflow filenames with `-`
 (#13917)

* Rename workflow files
* Update docs
* Fix azure badges
* Update the main readme
* bad rebase
* Update doc
---
 .actions/setup_tools.py                       |  5 ++-
 .github/workflows/README.md                   | 34 +++++++++----------
 ...e2e_test.yml => ci-app-cloud-e2e-test.yml} |  0
 ...i-app_examples.yml => ci-app-examples.yml} |  0
 .../{ci-app_tests.yml => ci-app-tests.yml}    |  0
 ...{ci_pkg-install.yml => ci-pkg-install.yml} |  0
 ...pr-gatekeeper.yml => ci-pr-gatekeeper.yml} |  0
 ...st-conda.yml => ci-pytorch-test-conda.yml} |  0
 ...test-full.yml => ci-pytorch-test-full.yml} |  0
 ...test-slow.yml => ci-pytorch-test-slow.yml} |  0
 .../{ci_schema.yml => ci-schema.yml}          |  0
 ...h_dockers.yml => cicd-pytorch-dockers.yml} |  0
 README.md                                     | 25 ++++++++------
 src/pytorch_lightning/README.md               | 26 +++++++-------
 14 files changed, 46 insertions(+), 44 deletions(-)
 rename .github/workflows/{ci-app_cloud_e2e_test.yml => ci-app-cloud-e2e-test.yml} (100%)
 rename .github/workflows/{ci-app_examples.yml => ci-app-examples.yml} (100%)
 rename .github/workflows/{ci-app_tests.yml => ci-app-tests.yml} (100%)
 rename .github/workflows/{ci_pkg-install.yml => ci-pkg-install.yml} (100%)
 rename .github/workflows/{ci_pr-gatekeeper.yml => ci-pr-gatekeeper.yml} (100%)
 rename .github/workflows/{ci-pytorch_test-conda.yml => ci-pytorch-test-conda.yml} (100%)
 rename .github/workflows/{ci-pytorch_test-full.yml => ci-pytorch-test-full.yml} (100%)
 rename .github/workflows/{ci-pytorch_test-slow.yml => ci-pytorch-test-slow.yml} (100%)
 rename .github/workflows/{ci_schema.yml => ci-schema.yml} (100%)
 rename .github/workflows/{cicd-pytorch_dockers.yml => cicd-pytorch-dockers.yml} (100%)

diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py
index 5088be2020738..a76e81246798c 100644
--- a/.actions/setup_tools.py
+++ b/.actions/setup_tools.py
@@ -94,11 +94,10 @@ def load_readme_description(path_dir: str, homepage: str, version: str) -> str:
     text = text.replace("pytorch-lightning.readthedocs.io/en/stable/", f"pytorch-lightning.readthedocs.io/en/{version}")
     # codecov badge
     text = text.replace("/branch/master/graph/badge.svg", f"/release/{version}/graph/badge.svg")
-    # replace github badges for release ones
+    # github actions badge
     text = text.replace("badge.svg?branch=master&event=push", f"badge.svg?tag={version}")
-    # Azure...
+    # azure pipelines badge
     text = text.replace("?branchName=master", f"?branchName=refs%2Ftags%2F{version}")
-    text = re.sub(r"\?definitionId=\d+&branchName=master", f"?definitionId=2&branchName=refs%2Ftags%2F{version}", text)

     skip_begin = r""
     skip_end = r""

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index f559551e1237f..4ed903c0f3a93 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -4,16 +4,16 @@

 ## Unit and Integration Testing

| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS |
| -------------------------- | ------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------- | ------------------- |
| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows |
| Test PyTorch with Conda | .github/workflows/ci-pytorch-test-conda.yml | Same as ci-pytorch-test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.9), (3.8, 1.10), (3.8, 1.11), (3.9, 1.12) | linux |
| Test slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.11) | linux, mac, windows |
| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux |
| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests.
| HPU | (3.8, 1.10) | linux | +| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | +| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux | - \*Accelerators used in CI - GPU: 2 x NVIDIA Tesla V100 @@ -33,15 +33,15 @@ | --------------------------------- | ----------------------------------------------------------------------------------------- | | .codecov.yml | Measure test coverage with [codecov.io](https://app.codecov.io/gh/Lightning-AI/lightning) | | .github/workflows/code-checks.yml | Check Python typing with [MyPy](https://mypy.readthedocs.io/en/stable/). | -| .github/workflows/ci_schema.yml | Validate the syntax of workflow files. | +| .github/workflows/ci-schema.yml | Validate the syntax of workflow files. | ## Others -| workflow file | action | -| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| .github/workflows/ci_dockers.yml | Build docker images used for testing in CI without pushing to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). Publishing these built images takes place in `.github/workflows/release-docker.yml` which only runs in master. | -| .github/workflows/ci_pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. | -| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. | +| workflow file | action | +| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| .github/workflows/cicd-pytorch-dockers.yml | Build docker images used for testing in CI. If run on nightly schedule, push to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). | +| .github/workflows/ci-pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. | +| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. | ## Deployment @@ -60,4 +60,4 @@ | .github/stale.yml | Close inactive issues/PRs sometimes after adding the "won't fix" label to them. | | .github/workflows/probot-auto-cc.yml, .github/lightning-probot.yml | Notify maintainers of interest depending on labels added to an issue We utilize lightning-probot forked from PyTorch’s probot. | | .pre-commit-config.yaml | pre-commit.ci runs a set of linters and formatters, such as black, flake8 and isort. When formatting is applied, the bot pushes a commit with its change. This configuration is also used for running pre-commit locally. | -| .github/workflows/ci_pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. | +| .github/workflows/ci-pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. 
| diff --git a/.github/workflows/ci-app_cloud_e2e_test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml similarity index 100% rename from .github/workflows/ci-app_cloud_e2e_test.yml rename to .github/workflows/ci-app-cloud-e2e-test.yml diff --git a/.github/workflows/ci-app_examples.yml b/.github/workflows/ci-app-examples.yml similarity index 100% rename from .github/workflows/ci-app_examples.yml rename to .github/workflows/ci-app-examples.yml diff --git a/.github/workflows/ci-app_tests.yml b/.github/workflows/ci-app-tests.yml similarity index 100% rename from .github/workflows/ci-app_tests.yml rename to .github/workflows/ci-app-tests.yml diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci-pkg-install.yml similarity index 100% rename from .github/workflows/ci_pkg-install.yml rename to .github/workflows/ci-pkg-install.yml diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml similarity index 100% rename from .github/workflows/ci_pr-gatekeeper.yml rename to .github/workflows/ci-pr-gatekeeper.yml diff --git a/.github/workflows/ci-pytorch_test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-conda.yml rename to .github/workflows/ci-pytorch-test-conda.yml diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch-test-full.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-full.yml rename to .github/workflows/ci-pytorch-test-full.yml diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-slow.yml rename to .github/workflows/ci-pytorch-test-slow.yml diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci-schema.yml similarity index 100% rename from .github/workflows/ci_schema.yml rename to .github/workflows/ci-schema.yml diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml similarity index 100% rename from .github/workflows/cicd-pytorch_dockers.yml rename to .github/workflows/cicd-pytorch-dockers.yml diff --git a/README.md b/README.md index 2fef343425f17..9c03e3707ec24 100644 --- a/README.md +++ b/README.md @@ -80,21 +80,24 @@ ______________________________________________________________________ ## Continuous Integration -Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major Python and PyTorch versions. +Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs and against major Python and PyTorch versions.
<details>
  <summary>Current build statuses</summary>
-| System / PyTorch ver. | 1.8 (LTS, min. req.) | 1.9 | 1.10 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 (with Conda | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. 
| 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -136,8 +139,8 @@ conda install pytorch-lightning -c conda-forge The actual status of 1.7 \[stable\] is the following: -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml?query=branch%3Arelease%2Fpytorch) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml?query=branch%3Arelease%2Fpytorch) [![TPU tests](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) [![Check Docs](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml?query=branch%3Arelease%2Fpytorch) diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index eb1a42730b5f0..b57aea6fae147 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -78,17 +78,17 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
-| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | -| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | - | -| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. 
| 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -130,8 +130,8 @@ conda install pytorch-lightning -c conda-forge The actual status of stable is the following: -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) [![GPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=24&branchName=release%2Fpytorch) [![TPU](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) [![IPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=release%2Fpytorch) From 0f246ab453f7c031257df5ad2957973b6ad0a0d4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 18:02:54 +0900 Subject: [PATCH 15/30] CI: Update Windows version from 2019 to 2022 (#14129) Update windows --- .github/workflows/ci-app-examples.yml | 2 +- .github/workflows/ci-app-tests.yml | 6 +++--- .github/workflows/ci-pkg-install.yml | 6 +++--- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index ec8becd5f70d1..01570f59c2c77 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] requires: ["oldest", "latest"] diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index 1678dab257301..fe3cc36dc16d3 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -21,7 +21,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] requires: ["oldest", "latest"] @@ -126,7 +126,7 @@ jobs: # - name: Clone Quick Start Example Repo # uses: actions/checkout@v3 # # TODO: this needs to be git submodule -# if: matrix.os == 'windows-2019' # because the install doesn't work on windows +# if: matrix.os == 'windows-2022' # because the install doesn't work on windows # with: # repository: Lightning-AI/lightning-quick-start # ref: 'main' @@ -134,6 +134,6 @@ jobs: # # - name: Lightning Install quick-start # shell: bash -# if: matrix.os != 'windows-2019' # because the install doesn't work on windows +# if: matrix.os != 'windows-2022' # because the install doesn't work on windows # 
run: | # python -m lightning install app lightning/quick-start -y diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 342e027b07cfe..a9fdd36693a67 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -33,7 +33,7 @@ jobs: fail-fast: true max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] pkg: ["app", "pytorch"] python-version: [3.8] # , 3.9 @@ -67,7 +67,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] pkg: ["", "lightning"] python-version: [3.8] # , 3.9 @@ -100,7 +100,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] # , 3.9 steps: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 445707d340c4b..7409ce25a5128 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-20.04, windows-2022, macOS-11] python-version: ["3.7", "3.10"] # minimum, maximum requires: ["oldest", "latest"] release: ["stable"] diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index b3756bbe8c2f7..36007d3311451 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-20.04, windows-2022, macOS-11] # same config as '.azure-pipelines/gpu-tests.yml' python-version: ["3.7"] pytorch-version: ["1.11"] From 5a955df5ece9a579b7579d8e573ad0bcf0ea432b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 19:37:50 +0900 Subject: [PATCH 16/30] CI/CD: Add CUDA version to docker image tags (#13831) * append cuda version to tags * revertme: push to hub * Update docker readme * Build base-conda-py3.9-torch1.12-cuda11.3.1 * Use new images in conda tests * revertme: push to hub * Revert "revertme: push to hub" This reverts commit 0f7d534b2ae41e4bd227961a929c333c88e35f59. * Revert "revertme: push to hub" This reverts commit 46a05fccbb9b596aa98d5d68424917b5811c5b4f. * Run conda if workflow edited * Run gpu testing if workflow edited * Use new tags in release/Dockerfile * Build base-cuda and PL release images with all combinations * Update release docker * Update conda from py3.9-torch1.12 to py3.10-torch.1.12 * Fix ubuntu version * Revert conda * revertme: push to hub * Don't build Python 3.10 for now... * Fix pl release builder * updating version contribute to the error? 
https://github.com/docker/buildx/issues/456 * Update actions' versions * Update slack user to notify * Don't use 11.6.0 to avoid bagua incompatibility * Don't use 11.1, and use 11.1.1 * Update .github/workflows/ci-pytorch_test-conda.yml Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> * Update trigger * Ignore artfacts from tutorials * Trim docker images to distribute * Add an image for tutorials * Update conda image 3.8x1.10 * Try different conda variants * No need to set cuda for conda jobs * Update who to notify ipu failure * Don't push * update filenaem Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 4 +- .github/workflows/ci-pytorch-test-conda.yml | 4 +- .github/workflows/cicd-pytorch-dockers.yml | 80 +++++++++++---------- .github/workflows/release-docker.yml | 31 +++++--- .gitignore | 6 ++ dockers/README.md | 45 +++--------- dockers/release/Dockerfile | 3 +- 8 files changed, 87 insertions(+), 88 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index ac5ca6f60a6b4..0de590f2c54a6 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 7d9a02226899f..8ae670d265ced 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 777ec2af759a0..2bbdb699c2c1e 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -22,13 +22,11 @@ jobs: strategy: fail-fast: false matrix: - # nightly: add when there's a release candidate include: - {python-version: "3.8", pytorch-version: "1.9"} - {python-version: "3.8", pytorch-version: "1.10"} - {python-version: "3.9", pytorch-version: "1.11"} - {python-version: "3.9", pytorch-version: "1.12"} - timeout-minutes: 30 steps: @@ -45,7 +43,7 @@ jobs: id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/cicd-pytorch-dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml index a6ba2ac4aa5f4..84051cafd82d8 100644 --- a/.github/workflows/cicd-pytorch-dockers.yml +++ b/.github/workflows/cicd-pytorch-dockers.yml @@ -29,17 +29,22 @@ jobs: strategy: fail-fast: false matrix: - # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image - python_version: ["3.9"] - pytorch_version: ["1.12"] + include: + # We only release one docker image per PyTorch version. + # The matrix here is the same as the one in release-docker.yml. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/release/Dockerfile push: false # pushed in release-docker.yml only when PL is released timeout-minutes: 50 @@ -53,14 +58,14 @@ jobs: python_version: ["3.7"] xla_version: ["1.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -85,30 +90,31 @@ jobs: fail-fast: false matrix: include: - # the config used in '.azure-pipelines/gpu-tests.yml' - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"} - # latest (used in Tutorials) - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} + # These are the base images for PL release docker images, + # so include at least all of the combinations in release-dockers.yml. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + # Used in Lightning-AI/tutorials + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ matrix.cuda_version }} - UBUNTU_VERSION=${{ matrix.ubuntu_version }} file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 95 - uses: ravsamhq/notify-slack-action@v1 if: failure() && env.PUSH_TO_HUB == 'true' @@ -126,25 +132,23 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - # nightly: add when there's a release candidate - # - {python_version: "3.9", pytorch_version: "1.12"} + - {python_version: "3.8", pytorch_version: "1.9"} + - {python_version: "3.8", pytorch_version: "1.10"} + - {python_version: "3.9", pytorch_version: "1.11"} + - {python_version: "3.9", pytorch_version: "1.12"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} @@ -168,14 +172,14 @@ jobs: # the config used in 'dockers/ci-runner-ipu/Dockerfile' - {python_version: "3.9", pytorch_version: "1.9"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -184,7 +188,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 100 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 
with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -199,7 +203,7 @@ jobs: status: ${{ job.status }} token: ${{ secrets.GITHUB_TOKEN }} notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>' # SeanNaren + message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} @@ -212,14 +216,14 @@ jobs: # the config used in 'dockers/ci-runner-hpu/Dockerfile' - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | DIST=latest @@ -243,10 +247,10 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build Conda Docker # publish master/release - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v3 with: file: dockers/nvidia/Dockerfile push: false diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 9d87f1a582fb1..6901a24204683 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -1,6 +1,5 @@ name: Docker -# https://www.docker.com/blog/first-docker-github-action-is-here -# https://github.com/docker/build-push-action + on: push: branches: [master, "release/*"] @@ -15,8 +14,12 @@ jobs: strategy: fail-fast: false matrix: - python_version: ["3.7", "3.8", "3.9"] - pytorch_version: ["1.9", "1.10"] + include: + # We only release one docker image per PyTorch version. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - name: Checkout uses: actions/checkout@v2 @@ -32,19 +35,29 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} - tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + tags: | + ${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} + latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 55 - name: Publish Latest to Docker uses: docker/build-push-action@v1.1.0 - # only on releases and latest Python and PyTorch - if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10' + # Only latest Python and PyTorch + if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12' with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} tags: "latest" timeout-minutes: 55 diff --git a/.gitignore b/.gitignore index 719f291a492ca..259d9f271189c 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,9 @@ hars* artifacts/* *docs/examples* *docs/source-app/api* + +# tutorials +our_model.tar +test.png +saved_models +data/ diff --git a/dockers/README.md b/dockers/README.md index 533c85739f528..b1ff9826b6c1f 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -1,36 +1,17 @@ # Docker images -## Builds images form attached Dockerfiles +## Build images from Dockerfiles You can build it on your own, note it takes lots of time, be prepared. ```bash -git clone -docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile . -``` - -or with specific arguments - -```bash -git clone -docker image build \ - -t pytorch-lightning:base-cuda-py3.9-pt1.10 \ - -f dockers/base-cuda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.10 \ - . -``` +git clone https://github.com/Lightning-AI/lightning.git -or nightly version from Conda +# build with the default arguments +docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . 
-```bash -git clone -docker image build \ - -t pytorch-lightning:base-conda-py3.9-pt1.11 \ - -f dockers/base-conda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.11 \ - . +# build with specific arguments +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 . ``` To run your docker use @@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest ## Run docker image with GPUs -To run docker image with access to you GPUs you need to install +To run docker image with access to your GPUs, you need to install ```bash # Add the package repositories @@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit sudo systemctl restart docker ``` -and later run the docker image with `--gpus all` so for example +and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 ``` ## Run Jupyter server @@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in- 1. Build the docker image: ```bash - docker image build \ - -t pytorch-lightning:v1.3.1 \ - -f dockers/nvidia/Dockerfile \ - --build-arg LIGHTNING_VERSION=1.3.1 \ - . + docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 . ``` 1. start the server and map ports: ```bash - docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1 + docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5 ``` 1. Connect in local browser: - copy the generated path e.g. `http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6` diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index cb393c91dfbe0..c39e66509188c 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -14,8 +14,9 @@ ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.11 +ARG CUDA_VERSION=11.3.1 -FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} LABEL maintainer="Lightning-AI " From 4f5e10116da8ab7183809f9a613fd946507e0ebe Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 15 Aug 2022 18:59:27 +0200 Subject: [PATCH 17/30] Avoid entry_points deprecation warning (#14052) Co-authored-by: Adam J. Stewart Co-authored-by: Akihiro Nitta --- src/pytorch_lightning/CHANGELOG.md | 6 ++++++ .../trainer/connectors/callback_connector.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4139dc469dbd8..9f8f61eb890db 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -19,6 +19,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967)) +- Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) + + +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py index 83881905beeb1..3c76e734db189 100644 --- a/src/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py @@ -30,7 +30,7 @@ from pytorch_lightning.callbacks.rich_model_summary import RichModelSummary from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0 from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info _log = logging.getLogger(__name__) @@ -256,14 +256,19 @@ def _configure_external_callbacks() -> List[Callback]: Return: A list of all callbacks collected from external factories. """ + group = "pytorch_lightning.callbacks_factory" + if _PYTHON_GREATER_EQUAL_3_8_0: from importlib.metadata import entry_points - factories = entry_points().get("pytorch_lightning.callbacks_factory", ()) + if _PYTHON_GREATER_EQUAL_3_10_0: + factories = entry_points(group=group) # type: ignore[call-arg] + else: + factories = entry_points().get(group, {}) # type: ignore[assignment] else: from pkg_resources import iter_entry_points - factories = iter_entry_points("pytorch_lightning.callbacks_factory") + factories = iter_entry_points(group) # type: ignore[assignment] external_callbacks = [] for factory in factories: From af9841ce1ee9d7b60f985fc7b5b891cb641415f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 11 Aug 2022 18:55:01 +0200 Subject: [PATCH 18/30] Configure the check-group app (#14165) Co-authored-by: Jirka From f3f282b110f342c842ec0027f2aa09103140e04d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 15 Aug 2022 19:02:50 +0200 Subject: [PATCH 19/30] Profile batch transfer and gradient clipping hooks (#14069) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 21 +++++++-------- src/pytorch_lightning/core/module.py | 27 ++++++++++++------- .../plugins/precision/precision_plugin.py | 4 ++- .../trainer/connectors/data_connector.py | 22 +++++++-------- .../logger_connector/fx_validator.py | 5 ++++ .../trainer/connectors/test_data_connector.py | 26 +++++++++--------- .../trainer/logging_/test_logger_connector.py | 10 +++---- 7 files changed, 63 insertions(+), 52 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 9f8f61eb890db..d5ec8e9fb222f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -6,23 +6,23 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
## [1.7.2] - 2022-08-16 -### Fixed - -- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117)) - +### Added -- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) -- Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) +- Added profiling to these hooks: `on_before_batch_transfer`, `transfer_batch_to_device`, `on_after_batch_transfer`, `configure_gradient_clipping`, `clip_gradients` ([#14069](https://github.com/Lightning-AI/lightning/pull/14069)) ### Changed - Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967)) +- Raised a `MisconfigurationException` if batch transfer hooks are overriden with `IPUAccelerator` ([13961](https://github.com/Lightning-AI/lightning/pull/13961)) +### Fixed -- Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) - - +- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117)) - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) +- Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) ## [1.7.1] - 2022-08-09 @@ -39,9 +39,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095)) -- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) - - ## [1.7.0] - 2022-08-02 ### Added diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index b8cc1d91cde18..30e3562067ba7 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -38,7 +38,6 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.core.saving import ModelIO from pytorch_lightning.loggers import Logger, LoggerCollection -from pytorch_lightning.trainer.connectors.data_connector import _DataHookSelector from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType, warnings from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors @@ -293,16 +292,24 @@ def _apply_batch_transfer_handler( self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0 ) -> Any: device = device or self.device - datahook_selector = ( - _DataHookSelector(self, None) if self._trainer is None else self.trainer._data_connector._datahook_selector - ) - hook = datahook_selector.get_hook("on_before_batch_transfer") - batch = hook(batch, dataloader_idx) - hook = datahook_selector.get_hook("transfer_batch_to_device") - batch = hook(batch, device, dataloader_idx) - hook = datahook_selector.get_hook("on_after_batch_transfer") - batch = hook(batch, dataloader_idx) + def call_hook(hook_name, *args): + if self._trainer: + datahook_selector = self._trainer._data_connector._datahook_selector + obj = datahook_selector.get_instance(hook_name) + trainer_method = ( + self._trainer._call_lightning_module_hook + if isinstance(obj, self.__class__) + else self._trainer._call_lightning_datamodule_hook + ) + return trainer_method(hook_name, *args) + else: + hook = getattr(self, hook_name) + return hook(*args) + + batch = call_hook("on_before_batch_transfer", batch, dataloader_idx) + batch = call_hook("transfer_batch_to_device", batch, device, dataloader_idx) + batch = call_hook("on_after_batch_transfer", batch, dataloader_idx) return batch def print(self, *args, **kwargs) -> None: diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py index 02d343a0876b4..b529568d1a04e 100644 --- a/src/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py @@ -178,7 +178,9 @@ def _clip_gradients( if not isinstance(model, pl.LightningModule) or not model.automatic_optimization: # the configuration validator disallows clipping on manual return - model.configure_gradient_clipping( + + model.trainer._call_lightning_module_hook( + "configure_gradient_clipping", optimizer, optimizer_idx, gradient_clip_val=clip_val, diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index e1aca404722db..1de8bee90d18f 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -14,7 +14,7 @@ 
import multiprocessing import os from dataclasses import dataclass, field -from typing import Any, Callable, Collection, List, Optional, Tuple, Union +from typing import Any, Collection, List, Optional, Tuple, Union from weakref import proxy from torch.utils.data import BatchSampler, DataLoader, Sampler, SequentialSampler @@ -527,16 +527,16 @@ def is_module(self) -> bool: @dataclass class _DataHookSelector: - """Stores the info about the shared DataHooks within LightningModule and LightningDataModule. + """Stores the info about the shared DataHooks within ``LightningModule`` and ``LightningDataModule``. - The hook source can be + The hook source can be: - 1. a method from the :class:`~pytorch_lightning.core.module.LightningModule`, - 2. a method from the :class:`~pytorch_lightning.core.datamodule.LightningDataModule`, + 1. the :class:`~pytorch_lightning.core.module.LightningModule`, + 2. the :class:`~pytorch_lightning.core.datamodule.LightningDataModule`, Arguments: - model: A LightningModule - datamodule: A LightningDataModule + model: A ``LightningModule`` + datamodule: A ``LightningDataModule`` """ model: "pl.LightningModule" @@ -545,7 +545,7 @@ class _DataHookSelector: default=("on_before_batch_transfer", "transfer_batch_to_device", "on_after_batch_transfer") ) - def get_hook(self, hook_name: str) -> Callable: + def get_instance(self, hook_name: str) -> Union["pl.LightningModule", "pl.LightningDataModule"]: if hook_name not in self._valid_hooks: raise ValueError( f"`{hook_name}` is not a shared hook within `LightningModule` and `LightningDataModule`." @@ -553,7 +553,7 @@ def get_hook(self, hook_name: str) -> Callable: ) if self.datamodule is None: - return getattr(self.model, hook_name) + return self.model if is_overridden(hook_name, self.datamodule): if is_overridden(hook_name, self.model): @@ -561,11 +561,11 @@ def get_hook(self, hook_name: str) -> Callable: f"You have overridden `{hook_name}` in both `LightningModule` and `LightningDataModule`." " It will use the implementation from `LightningDataModule` instance." ) - return getattr(self.datamodule, hook_name) + return self.datamodule if is_overridden(hook_name, self.model): warning_cache.warn( f"You have overridden `{hook_name}` in `LightningModule` but have passed in a" " `LightningDataModule`. It will use the implementation from `LightningModule` instance." 
) - return getattr(self.model, hook_name) + return self.model diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index 6f60ba6f1aa2f..56ad53ef4ba04 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -44,6 +44,8 @@ class _LogOptions(TypedDict): allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), "lr_scheduler_step": None, + "configure_gradient_clipping": None, + "clip_gradients": None, "on_before_zero_grad": _LogOptions( allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), @@ -98,6 +100,9 @@ class _LogOptions(TypedDict): "on_epoch_end": _LogOptions( allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True ), + "on_before_batch_transfer": None, + "transfer_batch_to_device": None, + "on_after_batch_transfer": None, "on_batch_start": _LogOptions( allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 2650e46b7fa60..7273d7719834e 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -471,34 +471,34 @@ def test_no_datamodule_no_overridden(self, hook_name): model, _, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=None) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_with_datamodule_no_overridden(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_override_model_hook(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_override_datamodule_hook(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) setattr(dm, hook_name, self.overridden_func) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(dm, hook_name) + assert instance is dm def test_override_both_model_and_datamodule(self, hook_name): model, dm, trainer = self.reset_instances() @@ -506,24 +506,24 @@ def 
test_override_both_model_and_datamodule(self, hook_name): setattr(model, hook_name, self.overridden_func) setattr(dm, hook_name, self.overridden_func) with pytest.warns(UserWarning, match=f"have overridden `{hook_name}` in both"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(dm, hook_name) + assert instance is dm def test_with_datamodule_override_model(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) setattr(model, hook_name, self.overridden_func) with pytest.warns(UserWarning, match=f"have overridden `{hook_name}` in `LightningModule`"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_invalid_hook_passed_in_datahook_selector(): dh_selector = _DataHookSelector(BoringModel(), None) with pytest.raises(ValueError, match="is not a shared hook"): - dh_selector.get_hook("setup") + dh_selector.get_instance("setup") def test_eval_distributed_sampler_warning(tmpdir): diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py index 760e8eea2a85c..c2be22c61244b 100644 --- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py +++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py @@ -187,11 +187,6 @@ def __init__(self, not_supported): { "log", "log_dict", - # the following are problematic as they do have `self._current_fx_name` defined some times but - # not others depending on where they were called. 
So we cannot reliably `self.log` in them - "on_before_batch_transfer", - "transfer_batch_to_device", - "on_after_batch_transfer", } ) # remove `nn.Module` hooks @@ -227,6 +222,9 @@ def test_fx_validator_integration(tmpdir): "on_pretrain_routine_end": "You can't", "train_dataloader": "You can't", "val_dataloader": "You can't", + "on_before_batch_transfer": "You can't", + "transfer_batch_to_device": "You can't", + "on_after_batch_transfer": "You can't", "on_validation_end": "You can't", "on_train_end": "You can't", "on_fit_end": "You can't", @@ -238,6 +236,8 @@ def test_fx_validator_integration(tmpdir): "on_validation_model_eval": "You can't", "on_validation_model_train": "You can't", "lr_scheduler_step": "You can't", + "configure_gradient_clipping": "You can't", + "clip_gradients": "You can't", "on_save_checkpoint": "You can't", "on_load_checkpoint": "You can't", "on_exception": "You can't", From 3dd0faf1ac0852071677ea2dfb502c16915b936a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 15 Aug 2022 19:04:30 +0200 Subject: [PATCH 20/30] Avoid false positive warning about using `sync_dist` when using torchmetrics (#14143) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 2 +- .../connectors/logger_connector/result.py | 2 +- .../core/test_metric_result_integration.py | 22 ++++++++++++++----- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index d5ec8e9fb222f..fc9877f090d8a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -22,9 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
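For readers unfamiliar with the distinction this patch relies on, here is a minimal, hypothetical sketch (the module, the metric attribute, and the logged values are invented for illustration): a plain tensor logged with `sync_dist=False` still needs cross-rank reduction, whereas a `torchmetrics.Metric` synchronizes its own state internally, which is why the warning is now restricted to tensor results.

```python
import torch
import torchmetrics
from pytorch_lightning import LightningModule


class LitSketch(LightningModule):
    def __init__(self):
        super().__init__()
        self.val_acc = torchmetrics.Accuracy()

    def validation_step(self, batch, batch_idx):
        # A plain tensor: without sync_dist=True this value is not reduced
        # across ranks, so the PossibleUserWarning is still justified here.
        self.log("val_loss", torch.tensor(0.5))

        # A torchmetrics object: it accumulates state per rank and syncs it
        # internally, so no sync_dist warning should be raised for it.
        self.val_acc(torch.tensor([0, 1]), torch.tensor([0, 0]))
        self.log("val_acc", self.val_acc)
```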
- Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) - Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) +- Avoided false positive warning about using `sync_dist` when using torchmetrics ([#14143](https://github.com/Lightning-AI/lightning/pull/14143)) - Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) - ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 9eb88fda4891e..a28599b5f20be 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -525,7 +525,7 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: elif not on_step and result_metric.meta.on_epoch: if result_metric._computed is None: should = result_metric.meta.sync.should - if not result_metric.meta.sync.should and distributed_available(): + if not should and distributed_available() and result_metric.is_tensor: # ensure sync happens for FT since during a failure, the metrics are synced and saved to the # checkpoint, so during restart, metrics on rank 0 are from the accumulated ones from the previous # run, and on other ranks, they are 0. 
So we need to make sure they are synced in further training diff --git a/tests/tests_pytorch/core/test_metric_result_integration.py b/tests/tests_pytorch/core/test_metric_result_integration.py index cb8a51c5bf9ba..9672bb75b51f1 100644 --- a/tests/tests_pytorch/core/test_metric_result_integration.py +++ b/tests/tests_pytorch/core/test_metric_result_integration.py @@ -21,9 +21,11 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp +import torchmetrics from torch.nn import ModuleDict, ModuleList from torchmetrics import Metric, MetricCollection +import pytorch_lightning as pl import tests_pytorch.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint @@ -666,14 +668,22 @@ def on_train_start(self): @pytest.mark.parametrize("distributed_env", [True, False]) -def test_logger_sync_dist(distributed_env): - # self.log('bar', 7, ..., sync_dist=False) +@pytest.mark.parametrize("log_val", [torch.tensor(0.5), torchmetrics.Accuracy()]) +def test_logger_sync_dist(distributed_env, log_val): + pl.trainer.connectors.logger_connector.result.warning_cache.clear() + + # self.log('bar', 0.5, ..., sync_dist=False) meta = _Metadata("foo", "bar") meta.sync = _Sync(_should=False) - result_metric = _ResultMetric(metadata=meta, is_tensor=True) - result_metric.update(torch.tensor(7.0), 10) + is_tensor = isinstance(log_val, torch.Tensor) + + if not is_tensor: + log_val.update(torch.tensor([0, 1]), torch.tensor([0, 0], dtype=torch.long)) + + result_metric = _ResultMetric(metadata=meta, is_tensor=is_tensor) + result_metric.update(log_val, 10) - warning_ctx = pytest.warns if distributed_env else no_warning_call + warning_ctx = pytest.warns if distributed_env and is_tensor else no_warning_call with mock.patch( "pytorch_lightning.trainer.connectors.logger_connector.result.distributed_available", @@ -681,4 +691,4 @@ def test_logger_sync_dist(distributed_env): ): with warning_ctx(PossibleUserWarning, match=r"recommended to use `self.log\('bar', ..., sync_dist=True\)`"): value = _ResultCollection._get_cache(result_metric, on_step=False) - assert value == 7.0 + assert value == 0.5 From 6a78b2a54f2426b14e97481ec0a74a8d241060a0 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 15 Aug 2022 19:05:04 +0200 Subject: [PATCH 21/30] Avoid raising the sampler warning if num_replicas=1 (#14097) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- src/pytorch_lightning/CHANGELOG.md | 2 ++ .../trainer/connectors/data_connector.py | 10 +++++++--- .../trainer/connectors/test_data_connector.py | 9 +++++---- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index fc9877f090d8a..df2d1b94b31a9 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -24,6 +24,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
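As a quick sanity check (not part of the patch itself), the single-replica case can be exercised directly with the upstream sampler: every index is produced exactly once, so the duplicated-samples concern behind the warning cannot apply.

```python
import torch
from torch.utils.data import DistributedSampler, TensorDataset

dataset = TensorDataset(torch.arange(10))
sampler = DistributedSampler(dataset, num_replicas=1, rank=0, shuffle=False)

# With a single replica there is no padding or duplication: the sampler
# simply enumerates the full dataset, so the warning would be a false alarm.
assert list(sampler) == list(range(10))
```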
- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) - Avoided false positive warning about using `sync_dist` when using torchmetrics ([#14143](https://github.com/Lightning-AI/lightning/pull/14143)) - Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) +- Avoid raising the sampler warning if num_replicas=1 ([#14097](https://github.com/Lightning-AI/lightning/pull/14097)) + ## [1.7.1] - 2022-08-09 diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index 1de8bee90d18f..6e592b9f6d310 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -298,10 +298,14 @@ def _resolve_sampler(self, dataloader: DataLoader, shuffle: bool, mode: Optional # update docs too once this is resolved trainer_fn = self.trainer.state.fn - if isinstance(sampler, DistributedSampler) and trainer_fn in (TrainerFn.VALIDATING, TrainerFn.TESTING): + if ( + isinstance(sampler, DistributedSampler) + and sampler.num_replicas > 1 + and trainer_fn in (TrainerFn.VALIDATING, TrainerFn.TESTING) + ): rank_zero_warn( - f"Using `DistributedSampler` with the dataloaders. During `trainer.{trainer_fn.value}()`," - " it is recommended to use `Trainer(devices=1)` to ensure each sample/batch gets evaluated" + f"Using `DistributedSampler` with the dataloaders. During `trainer.{trainer_fn.value}()`, it is" + " recommended to use `Trainer(devices=1, num_nodes=1)` to ensure each sample/batch gets evaluated" " exactly once. 
Otherwise, multi-device settings use `DistributedSampler` that replicates" " some samples to make sure all devices have same batch size in case of uneven inputs.", category=PossibleUserWarning, diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 7273d7719834e..379a3248a1535 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -526,19 +526,20 @@ def test_invalid_hook_passed_in_datahook_selector(): dh_selector.get_instance("setup") -def test_eval_distributed_sampler_warning(tmpdir): +@pytest.mark.parametrize("devices, warn_context", [(1, no_warning_call), (2, pytest.warns)]) +def test_eval_distributed_sampler_warning(devices, warn_context): """Test that a warning is raised when `DistributedSampler` is used with evaluation.""" model = BoringModel() - trainer = Trainer(strategy="ddp", devices=2, accelerator="cpu", fast_dev_run=True) + trainer = Trainer(strategy="ddp", devices=devices, accelerator="cpu") trainer._data_connector.attach_data(model) trainer.state.fn = TrainerFn.VALIDATING - with pytest.warns(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): + with warn_context(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): trainer.reset_val_dataloader(model) trainer.state.fn = TrainerFn.TESTING - with pytest.warns(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): + with warn_context(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): trainer.reset_test_dataloader(model) From c9aa3e533dca0c9bb656fdc1c2166e2791250fe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 12 Aug 2022 13:24:35 +0200 Subject: [PATCH 22/30] Remove skipping logic in favor of path filtering (#14170) --- .azure/gpu-tests.yml | 53 ++++++++------------ .github/checkgroup.yml | 1 - .github/file-filters.yml | 9 ---- .github/workflows/ci-app-cloud-e2e-test.yml | 28 ++--------- .github/workflows/ci-app-examples.yml | 7 +++ .github/workflows/ci-app-tests.yml | 4 +- .github/workflows/ci-pytorch-test-conda.yml | 35 +++---------- .github/workflows/ci-pytorch-test-full.yml | 54 +++++---------------- .github/workflows/ci-pytorch-test-slow.yml | 38 ++++----------- 9 files changed, 65 insertions(+), 164 deletions(-) delete mode 100644 .github/file-filters.yml diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 8ae670d265ced..8444468c0c58a 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -12,15 +12,31 @@ trigger: - "master" - "release/*" - "refs/tags/*" + paths: + include: + - ".azure/**" + - "examples/run_ddp_examples.sh" + - "examples/convert_from_pt_to_pl/**" + - "examples/run_pl_examples.sh" + - "examples/pl_basics/backbone_image_classifier.py" + - "examples/pl_basics/autoencoder.py" + - "examples/pl_loops/mnist_lite.py" + - "examples/pl_fault_tolerant/automatic.py" + - "examples/test_pl_examples.py" + - "examples/pl_integrations/dali_image_classifier.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" + - "pyproject.toml" + - ".github/workflows/ci-pytorch*.yml" + - ".github/workflows/docs-*.yml" + pr: - "master" - "release/*" -variables: - - name: continue - value: '1' - jobs: - job: testing strategy: @@ -41,22 +57,6 @@ jobs: clean: all steps: - - - bash: | - CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml' - echo $CHANGED_FILES > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "##vso[task.setvariable variable=continue]0" - else - echo "Continue" - echo "##vso[task.setvariable variable=continue]1" - fi - displayName: Skipper - - bash: | lspci | egrep 'VGA|3D' whereis nvidia @@ -66,7 +66,6 @@ jobs: pip --version pip list displayName: 'Image info & NVIDIA' - condition: eq(variables['continue'], '1') - bash: | set -e @@ -82,7 +81,6 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' - condition: eq(variables['continue'], '1') - bash: | set -e @@ -91,16 +89,13 @@ jobs: python requirements/pytorch/check-avail-strategies.py python requirements/pytorch/check-avail-extras.py displayName: 'Env details' - condition: eq(variables['continue'], '1') - bash: bash .actions/pull_legacy_checkpoints.sh displayName: 'Get legacy checkpoints' - condition: eq(variables['continue'], '1') - bash: python -m coverage run --source pytorch_lightning -m pytest workingDirectory: src/pytorch_lightning displayName: 'Testing: PyTorch doctests' - condition: eq(variables['continue'], '1') - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 env: @@ -108,7 +103,6 @@ jobs: workingDirectory: tests/tests_pytorch displayName: 'Testing: PyTorch standard' timeoutInMinutes: "35" - condition: eq(variables['continue'], '1') - bash: bash run_standalone_tests.sh workingDirectory: tests/tests_pytorch @@ -117,7 +111,6 @@ jobs: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch standalone tests' timeoutInMinutes: "35" - condition: eq(variables['continue'], '1') - bash: bash run_standalone_tasks.sh workingDirectory: tests/tests_pytorch @@ -126,7 +119,6 @@ jobs: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch standalone tasks' timeoutInMinutes: "10" - condition: eq(variables['continue'], '1') - bash: | python -m coverage report @@ -136,14 +128,13 @@ jobs: ls -l workingDirectory: tests/tests_pytorch displayName: 'Statistics' - condition: eq(variables['continue'], '1') - task: PublishTestResults@2 displayName: 'Publish test results' inputs: testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: and(succeededOrFailed(), eq(variables['continue'], '1')) + condition: succeededOrFailed() - script: | set -e @@ -155,11 +146,9 @@ jobs: env: PL_USE_MOCKED_MNIST: "1" displayName: 'Testing: PyTorch examples' - condition: eq(variables['continue'], '1') - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0 workingDirectory: tests/tests_pytorch env: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch benchmarks' - condition: eq(variables['continue'], '1') diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 8f1d3c6fb5e86..0cb80d6e34bd8 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -101,7 +101,6 @@ subprojects: - id: "lightning_app" paths: - ".github/workflows/ci-app*.yml" - - "examples/app_**" - "requirements/app/**" - "src/lightning_app/**" - "tests/tests_app/**" diff --git a/.github/file-filters.yml b/.github/file-filters.yml deleted file mode 100644 index e621cd83881e4..0000000000000 --- a/.github/file-filters.yml +++ /dev/null @@ -1,9 +0,0 @@ 
-# This file contains filters to be used in the CI to detect file changes and run the required CI jobs. - -app_examples: - - "src/lightning_app/**" - - "tests/tests_app_examples/**" - - "requirements/app/**" - - "examples/app_*" - - "setup.py" - - "src/pytorch_lightning/__version__.py" diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml index 3ad455650a117..c50fee4caa285 100644 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -7,37 +7,19 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: branches: [master, "release/*"] + paths: + - ".github/workflows/ci-app-cloud-e2e-test.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "examples/app_*" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/master' }} jobs: - # This is job should once only once per PR to detect file changes so run required jobs. - # see .github/file-filters.yml to define file filters and run the jobs based on the output of each filter. - # More info: https://github.com/marketplace/actions/paths-changes-filter - - changes: - runs-on: ubuntu-latest - # Set job outputs to the values from filter step - outputs: - app_examples: ${{ steps.filter.outputs.app_examples }} - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - - uses: dorny/paths-filter@v2 - id: filter - with: - filters: .github/file-filters.yml - cloud-test: name: Cloud Test - needs: changes - if: ${{ needs.changes.outputs.app_examples == 'true' }} runs-on: ubuntu-20.04 strategy: fail-fast: false diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 01570f59c2c77..8114f59b01aaa 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -6,6 +6,13 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: branches: [master, "release/*"] + paths: + - ".github/workflows/ci-app-examples.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "tests/tests_app_examples/**" + # the examples are used in the app CI + - "examples/app_*" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index fe3cc36dc16d3..fb2cdbda69079 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -6,10 +6,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: paths: + - ".github/workflows/ci-app-tests.yml" + - "requirements/app/**" - "src/lightning_app/**" - "tests/tests_app/**" - - "requirements/app/**" - - "setup.py" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 2bbdb699c2c1e..d314a742bbdcb 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -6,6 +6,12 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] pull_request: branches: [master, "release/*"] + paths: + - "requirements/pytorch/**" + - 
"src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch-test-conda.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -35,28 +41,7 @@ jobs: - uses: actions/checkout@v2 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v24 - - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - name: Update base dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -70,12 +55,10 @@ jobs: run: pip install "Pillow<9.0" # It messes with torchvision - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Update all dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -95,11 +78,9 @@ jobs: python requirements/pytorch/check-avail-extras.py - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml @@ -111,7 +92,7 @@ jobs: if: failure() - name: Statistics - if: ${{ success() && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -119,7 +100,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ success() && (steps.skip.outputs.continue == '1') }} + if: success() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 7409ce25a5128..386bb012b8cc6 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -7,6 +7,12 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] + paths: + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch-test-full.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -37,67 +43,42 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v24 - - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-full.yml' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat 
changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - name: Set up Python ${{ matrix.python-version }} - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Reset caching - if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: basic setup - if: ${{ (steps.skip.outputs.continue == '1') }} run: | pip --version pip install -q fire # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS - if: ${{ (runner.os == 'macOS') && (steps.skip.outputs.continue == '1') }} + if: ${{ (runner.os == 'macOS') }} run: | brew install openmpi libuv # Horovod on macOS requires OpenMPI, Gloo not currently supported - name: Setup Windows - if: ${{ (runner.os == 'windows') && (steps.skip.outputs.continue == '1') }} + if: ${{ (runner.os == 'windows') }} run: | python .actions/assistant.py requirements_prune_pkgs horovod - name: Set min. dependencies - if: ${{ (matrix.requires == 'oldest') && (steps.skip.outputs.continue == '1') }} + if: ${{ (matrix.requires == 'oldest') }} run: | python .actions/assistant.py replace_oldest_ver # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Get pip cache dir - if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -106,11 +87,9 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -122,12 +101,10 @@ jobs: shell: bash - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Install extra dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt @@ -136,7 +113,7 @@ jobs: shell: bash - name: Reinstall Horovod if necessary - if: ${{ (runner.os != 'windows') && (steps.skip.outputs.continue == '1') }} + if: ${{ (runner.os != 'windows') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -153,43 +130,38 @@ jobs: shell: bash - name: Cache datasets - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: Datasets key: pl-dataset - name: Sanity check - if: ${{ (steps.skip.outputs.continue == '1') }} run: python requirements/pytorch/check-avail-extras.py - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os 
}}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Upload pytest results - if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} + if: failure() uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Prepare Examples - if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - name: Run Examples - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./examples run: python -m pytest test_pl_examples.py -v --durations=10 - name: Statistics - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -197,7 +169,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ (always()) && (steps.skip.outputs.continue == '1') }} + if: always() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 36007d3311451..8e97ea90b2bc4 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -7,6 +7,12 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] + paths: + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch-test-slow.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -28,43 +34,19 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v24 - - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-slow.yml' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - uses: actions/setup-python@v4 - if: ${{ (steps.skip.outputs.continue == '1') }} with: python-version: ${{ matrix.python-version }} - name: Reset caching - if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: Get pip cache - if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: | python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)" - name: Cache pip - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -73,7 +55,6 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ 
matrix.python-version }}- - name: Install dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -85,21 +66,20 @@ jobs: shell: bash - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}.xml env: PL_RUN_SLOW_TESTS: 1 - name: Upload pytest test results - if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} + if: failure() uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}.xml - name: Statistics - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -107,7 +87,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: From d03a7e9fc9d89e72cfd8d24e3ebcbfac6b3dc557 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 9 Aug 2022 16:18:21 -0700 Subject: [PATCH 23/30] Support checkpoint save and load with Stochastic Weight Averaging (#9938) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: thomas chaton Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adrian Wälchli Co-authored-by: Carlos Mocholi Co-authored-by: Kushashwa Ravi Shrimali Co-authored-by: Jirka Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 1 + .../callbacks/stochastic_weight_avg.py | 78 ++++++++++- .../callbacks/test_stochastic_weight_avg.py | 128 +++++++++++++++++- 3 files changed, 193 insertions(+), 14 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index df2d1b94b31a9..85d538d3e2b46 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
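In practical terms, the behaviour this patch enables can be sketched as follows (the checkpoint path is a placeholder; `BoringModel` is the demo model used throughout the tests): resuming a run whose checkpoint was written after `swa_epoch_start` now restores `n_averaged`, the averaged weights, and the `SWALR` scheduler state instead of silently restarting the averaging.

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import StochasticWeightAveraging
from pytorch_lightning.demos.boring_classes import BoringModel

swa = StochasticWeightAveraging(swa_epoch_start=3, swa_lrs=0.1)
trainer = Trainer(max_epochs=5, callbacks=[swa])

# Resume from a mid-SWA checkpoint; the callback's state_dict/load_state_dict
# added in this patch carry the averaging state across the restart.
trainer.fit(BoringModel(), ckpt_path="lightning_logs/version_0/checkpoints/epoch=3.ckpt")
```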
- Avoided false positive warning about using `sync_dist` when using torchmetrics ([#14143](https://github.com/Lightning-AI/lightning/pull/14143)) - Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) - Avoid raising the sampler warning if num_replicas=1 ([#14097](https://github.com/Lightning-AI/lightning/pull/14097)) +- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) ## [1.7.1] - 2022-08-09 diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 20a3dcc3f0f26..6650bb3f0c479 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -16,7 +16,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ """ from copy import deepcopy -from typing import Any, Callable, cast, List, Optional, Union +from typing import Any, Callable, cast, Dict, List, Optional, Union import torch from torch import nn, Tensor @@ -24,6 +24,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback +from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _LRScheduler, LRSchedulerConfig @@ -112,15 +113,22 @@ def __init__( if device is not None and not isinstance(device, (torch.device, str)): raise MisconfigurationException(f"device is expected to be a torch.device or a str. Found {device}") + self.n_averaged: Optional[torch.Tensor] = None self._swa_epoch_start = swa_epoch_start self._swa_lrs = swa_lrs self._annealing_epochs = annealing_epochs self._annealing_strategy = annealing_strategy self._avg_fn = avg_fn or self.avg_fn self._device = device - self._max_epochs: int - self._model_contains_batch_norm: bool + self._model_contains_batch_norm: Optional[bool] = None self._average_model: "pl.LightningModule" + self._initialized = False + self._swa_scheduler: Optional[_LRScheduler] = None + self._scheduler_state: Optional[Dict] = None + self._init_n_averaged = 0 + self._latest_update_epoch = -1 + self.momenta: Optional[Dict[nn.modules.batchnorm._BatchNorm, float]] = None + self._max_epochs: int @property def swa_start(self) -> int: @@ -147,6 +155,9 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - if len(trainer.lr_scheduler_configs) > 1: raise MisconfigurationException("SWA currently not supported for more than 1 `lr_scheduler`.") + if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DeepSpeedStrategy)): + raise MisconfigurationException("SWA does not currently support sharded models.") + if isinstance(self._swa_epoch_start, float): self._swa_epoch_start = int(trainer.max_epochs * self._swa_epoch_start) @@ -158,8 +169,13 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - assert trainer.fit_loop.max_epochs is not None trainer.fit_loop.max_epochs += 1 + if self._scheduler_state is not None: + self._clear_schedulers(trainer) + def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - if trainer.current_epoch == self.swa_start: + if (not self._initialized) and (self.swa_start <= trainer.current_epoch <= self.swa_end): + self._initialized = True + # 
move average model to request device. self._average_model = self._average_model.to(self._device or pl_module.device) @@ -180,6 +196,17 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1, ), ) + if self._scheduler_state is not None: + # Restore scheduler state from checkpoint + self._swa_scheduler.load_state_dict(self._scheduler_state) + elif trainer.current_epoch != self.swa_start: + # Log a warning if we're initializing after start without any checkpoint data, + # as behaviour will be different compared to having checkpoint data. + rank_zero_warn( + "SWA is initializing after swa_start without any checkpoint data. " + "This may be caused by loading a checkpoint from an older version of PyTorch Lightning." + ) + # We assert that there is only one optimizer on fit start, so know opt_idx is always 0 default_scheduler_cfg = LRSchedulerConfig(self._swa_scheduler, opt_idx=0) assert default_scheduler_cfg.interval == "epoch" and default_scheduler_cfg.frequency == 1 @@ -196,14 +223,18 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo else: trainer.lr_scheduler_configs.append(default_scheduler_cfg) - self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device) + if self.n_averaged is None: + self.n_averaged = torch.tensor(self._init_n_averaged, dtype=torch.long, device=pl_module.device) - if self.swa_start <= trainer.current_epoch <= self.swa_end: + if (self.swa_start <= trainer.current_epoch <= self.swa_end) and ( + trainer.current_epoch > self._latest_update_epoch + ): + assert self.n_averaged is not None self.update_parameters(self._average_model, pl_module, self.n_averaged, self._avg_fn) + self._latest_update_epoch = trainer.current_epoch # Note: No > here in case the callback is saved with the model and training continues if trainer.current_epoch == self.swa_end + 1: - # Transfer weights from average model to pl_module self.transfer_weights(self._average_model, pl_module) @@ -265,6 +296,7 @@ def reset_batch_norm_and_save_state(self, pl_module: "pl.LightningModule") -> No def reset_momenta(self) -> None: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165.""" + assert self.momenta is not None for bn_module in self.momenta: bn_module.momentum = self.momenta[bn_module] @@ -285,3 +317,35 @@ def update_parameters( def avg_fn(averaged_model_parameter: Tensor, model_parameter: Tensor, num_averaged: Tensor) -> Tensor: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97.""" return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1) + + def state_dict(self) -> Dict[str, Any]: + return { + "n_averaged": 0 if self.n_averaged is None else self.n_averaged.item(), + "latest_update_epoch": self._latest_update_epoch, + "scheduler_state": None if self._swa_scheduler is None else self._swa_scheduler.state_dict(), + "average_model_state": None if self._average_model is None else self._average_model.state_dict(), + } + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + self._init_n_averaged = state_dict["n_averaged"] + self._latest_update_epoch = state_dict["latest_update_epoch"] + self._scheduler_state = state_dict["scheduler_state"] + self._load_average_model_state(state_dict["average_model_state"]) + + @staticmethod + def _clear_schedulers(trainer: "pl.Trainer") -> None: + # If we have scheduler state saved, 
clear the scheduler configs so that we don't try to + # load state into the wrong type of schedulers when restoring scheduler checkpoint state. + # We'll configure the scheduler and re-load its state in on_train_epoch_start. + # Note that this relies on the callback state being restored before the scheduler state is + # restored, and doesn't work if restore_checkpoint_after_setup is True, but at the time of + # writing that is only True for deepspeed which is already not supported by SWA. + # See https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 for background. + if trainer.lr_scheduler_configs: + assert len(trainer.lr_scheduler_configs) == 1 + trainer.lr_scheduler_configs.clear() + + def _load_average_model_state(self, model_state: Any) -> None: + if self._average_model is None: + return + self._average_model.load_state_dict(model_state) diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index 859cf2fa98c0c..65a0fea2fb4a5 100644 --- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +import os +from pathlib import Path +from typing import ContextManager, Optional from unittest import mock import pytest import torch from torch import nn +from torch.optim.lr_scheduler import LambdaLR from torch.optim.swa_utils import SWALR from torch.utils.data import DataLoader @@ -30,7 +34,9 @@ class SwaTestModel(BoringModel): - def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False): + def __init__( + self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False, crash_on_epoch=None + ): super().__init__() layers = [nn.Linear(32, 32)] if batchnorm: @@ -39,17 +45,18 @@ def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dat self.layer = nn.Sequential(*layers) self.interval = interval self.iterable_dataset = iterable_dataset + self.crash_on_epoch = crash_on_epoch def training_step(self, batch, batch_idx): + if self.crash_on_epoch and self.trainer.current_epoch >= self.crash_on_epoch: + raise Exception("SWA crash test") output = self.forward(batch) loss = self.loss(batch, output) return {"loss": loss} def train_dataloader(self): - dset_cls = RandomIterableDataset if self.iterable_dataset else RandomDataset dset = dset_cls(32, 64) - return DataLoader(dset, batch_size=2) def configure_optimizers(self): @@ -66,6 +73,8 @@ def configure_optimizers(self): class SwaTestCallback(StochasticWeightAveraging): update_parameters_calls: int = 0 transfer_weights_calls: int = 0 + # Record the first epoch, as if we are resuming from a checkpoint this may not be equal to 0 + first_epoch: Optional[int] = None def update_parameters(self, *args, **kwargs): self.update_parameters_calls += 1 @@ -77,6 +86,11 @@ def transfer_weights(self, *args, **kwargs): def on_train_epoch_start(self, trainer, *args): super().on_train_epoch_start(trainer, *args) + if self.first_epoch is None and not trainer.fit_loop.restarting: + # since the checkpoint loaded was saved `on_train_epoch_end`, the first `FitLoop` iteration will + # not update the model and just call the epoch-level hooks, for that reason, we check that we are not + # restarting before choosing the first epoch + self.first_epoch = trainer.current_epoch assert 
trainer.fit_loop._skip_backward == (trainer.current_epoch > self.swa_end) if self.swa_start <= trainer.current_epoch: assert isinstance(trainer.lr_scheduler_configs[0].scheduler, SWALR) @@ -88,6 +102,7 @@ def on_train_epoch_end(self, trainer, *args): if self.swa_start <= trainer.current_epoch <= self.swa_end: swa_epoch = trainer.current_epoch - self.swa_start assert self.n_averaged == swa_epoch + 1 + assert self._swa_scheduler is not None # Scheduler is stepped once on initialization and then at the end of each epoch assert self._swa_scheduler._step_count == swa_epoch + 2 elif trainer.current_epoch > self.swa_end: @@ -103,10 +118,13 @@ def on_train_end(self, trainer, pl_module): if not isinstance(trainer.strategy, DDPSpawnStrategy): # check backward call count. the batchnorm update epoch should not backward - assert trainer.strategy.backward.call_count == trainer.max_epochs * trainer.limit_train_batches + assert trainer.strategy.backward.call_count == ( + (trainer.max_epochs - self.first_epoch) * trainer.limit_train_batches + ) # check call counts - assert self.update_parameters_calls == trainer.max_epochs - (self._swa_epoch_start - 1) + first_swa_epoch = max(self.first_epoch, self.swa_start) + assert self.update_parameters_calls == trainer.max_epochs - first_swa_epoch assert self.transfer_weights_calls == 1 @@ -140,7 +158,7 @@ def train_with_swa( devices=devices, ) - with mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward): + with _backward_patch(trainer): trainer.fit(model) # check the model is the expected @@ -226,9 +244,10 @@ def test_swa_multiple_lrs(tmpdir): class TestModel(BoringModel): def __init__(self): - super(BoringModel, self).__init__() + super().__init__() self.layer1 = torch.nn.Linear(32, 32) self.layer2 = torch.nn.Linear(32, 2) + self.on_train_epoch_start_called = False def forward(self, x): x = self.layer1(x) @@ -255,3 +274,98 @@ def on_train_epoch_start(self): ) trainer.fit(model) assert model.on_train_epoch_start_called + + +def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False): + swa_start = 3 + trainer_kwargs = { + "default_root_dir": tmpdir, + "max_epochs": 5, + "accelerator": "cpu", + "strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None, + "devices": 2 if ddp else 1, + "limit_train_batches": 5, + "limit_val_batches": 0, + "accumulate_grad_batches": 2, + "enable_progress_bar": False, + } + trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs) + + with _backward_patch(trainer), pytest.raises(Exception, match="SWA crash test"): + trainer.fit(model) + + checkpoint_dir = Path(tmpdir) / "lightning_logs" / "version_0" / "checkpoints" + checkpoint_files = os.listdir(checkpoint_dir) + assert len(checkpoint_files) == 1 + ckpt_path = str(checkpoint_dir / checkpoint_files[0]) + + trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs) + + with _backward_patch(trainer): + trainer.fit(resume_model, ckpt_path=ckpt_path) + + +class CustomSchedulerModel(SwaTestModel): + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + + def lr_lambda(current_step: int): + return 0.1 + + scheduler = LambdaLR(optimizer, lr_lambda, -1) + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": scheduler, + "interval": self.interval, + }, + } + + +@pytest.mark.parametrize("crash_on_epoch", [1, 3]) +def test_swa_resume_training_from_checkpoint(tmpdir, crash_on_epoch): + model = 
SwaTestModel(crash_on_epoch=crash_on_epoch) + resume_model = SwaTestModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model) + + +@pytest.mark.parametrize("crash_on_epoch", [1, 3]) +def test_swa_resume_training_from_checkpoint_custom_scheduler(tmpdir, crash_on_epoch): + # Reproduces the bug reported in https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 + model = CustomSchedulerModel(crash_on_epoch=crash_on_epoch) + resume_model = CustomSchedulerModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model) + + +@RunIf(skip_windows=True) +def test_swa_resume_training_from_checkpoint_ddp(tmpdir): + model = SwaTestModel(crash_on_epoch=3) + resume_model = SwaTestModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=True) + + +@pytest.mark.parametrize( + "strategy", + [ + pytest.param("fsdp", marks=RunIf(fairscale_fully_sharded=True, min_cuda_gpus=1)), + pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)), + ], +) +def test_misconfiguration_error_with_sharded_model(tmpdir, strategy: str): + model = SwaTestModel() + swa_callback = SwaTestCallback(swa_epoch_start=2, swa_lrs=0.1) + trainer = Trainer( + default_root_dir=tmpdir, + enable_progress_bar=False, + max_epochs=5, + callbacks=[swa_callback], + strategy=strategy, + accelerator="gpu", + devices=1, + ) + with pytest.raises(MisconfigurationException, match="SWA does not currently support sharded models"): + trainer.fit(model) + + +def _backward_patch(trainer: Trainer) -> ContextManager: + return mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward) From 9f189ff88a3980f464c64e57fb0136af3741b1c9 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 17 Aug 2022 18:08:21 +0200 Subject: [PATCH 24/30] Use fsdp module to initialize precision scalar for fsdp native (#14092) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Laverne Henderson Co-authored-by: Rohit Gupta --- docs/source-pytorch/api_references.rst | 1 + docs/source-pytorch/extensions/plugins.rst | 1 + src/pytorch_lightning/CHANGELOG.md | 2 + src/pytorch_lightning/plugins/__init__.py | 2 + .../plugins/precision/__init__.py | 43 ++++++++---- .../precision/fsdp_native_native_amp.py | 65 +++++++++++++++++++ .../precision/fully_sharded_native_amp.py | 26 +------- .../strategies/fully_sharded_native.py | 4 +- .../connectors/accelerator_connector.py | 5 +- .../test_ddp_fully_sharded_native.py | 5 +- 10 files changed, 110 insertions(+), 44 deletions(-) create mode 100644 src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index db4fc1e2c4cf8..ce7723e418e77 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -173,6 +173,7 @@ precision DeepSpeedPrecisionPlugin DoublePrecisionPlugin FullyShardedNativeMixedPrecisionPlugin + FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin diff --git a/docs/source-pytorch/extensions/plugins.rst b/docs/source-pytorch/extensions/plugins.rst index a0dbefd141464..27aff0c11fdcb 100644 --- a/docs/source-pytorch/extensions/plugins.rst +++ b/docs/source-pytorch/extensions/plugins.rst @@ -56,6 +56,7 @@ The full list of built-in precision plugins is listed below. 
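A hedged usage sketch of what this patch changes for users (it assumes a multi-GPU environment; the device count is illustrative): selecting the native FSDP strategy with 16-bit precision now resolves to the new plugin, without FairScale needing to be installed.

```python
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import FullyShardedNativeNativeMixedPrecisionPlugin

# "fsdp_native" maps to DDPFullyShardedNativeStrategy; with precision=16 the
# accelerator connector now picks the native-FSDP precision plugin.
trainer = Trainer(accelerator="gpu", devices=2, strategy="fsdp_native", precision=16)
assert isinstance(trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin)
```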
DeepSpeedPrecisionPlugin DoublePrecisionPlugin FullyShardedNativeMixedPrecisionPlugin + FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 85d538d3e2b46..80f6f71a03515 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added `FullyShardedNativeNativeMixedPrecisionPlugin` to handle precision for `DDPFullyShardedNativeStrategy` ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) - Added profiling to these hooks: `on_before_batch_transfer`, `transfer_batch_to_device`, `on_after_batch_transfer`, `configure_gradient_clipping`, `clip_gradients` ([#14069](https://github.com/Lightning-AI/lightning/pull/14069)) ### Changed @@ -26,6 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) - Avoid raising the sampler warning if num_replicas=1 ([#14097](https://github.com/Lightning-AI/lightning/pull/14097)) - Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) +- Avoided requiring the FairScale package to use precision with the fsdp native strategy ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) ## [1.7.1] - 2022-08-09 diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index afd10c88c951d..50d83ee708cbe 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -10,6 +10,7 @@ from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin @@ -63,6 +64,7 @@ "FullyShardedNativeMixedPrecisionPlugin", "SingleDevicePlugin", "SingleTPUPlugin", + "FullyShardedNativeNativeMixedPrecisionPlugin", "TPUPrecisionPlugin", "TPUBf16PrecisionPlugin", "TPUSpawnPlugin", diff --git a/src/pytorch_lightning/plugins/precision/__init__.py b/src/pytorch_lightning/plugins/precision/__init__.py index 4bc29c1be1864..5206aed62c497 100644 --- a/src/pytorch_lightning/plugins/precision/__init__.py +++ b/src/pytorch_lightning/plugins/precision/__init__.py @@ -11,17 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import ( # noqa: F401 - FullyShardedNativeMixedPrecisionPlugin, -) -from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin # noqa: F401 +from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin +from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin +from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin +from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin +from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin + +__all__ = [ + "ApexMixedPrecisionPlugin", + "DeepSpeedPrecisionPlugin", + "DoublePrecisionPlugin", + "FullyShardedNativeNativeMixedPrecisionPlugin", + "FullyShardedNativeMixedPrecisionPlugin", + "HPUPrecisionPlugin", + "IPUPrecisionPlugin", + "MixedPrecisionPlugin", + "NativeMixedPrecisionPlugin", + "PrecisionPlugin", + "ShardedNativeMixedPrecisionPlugin", + "TPUPrecisionPlugin", + "TPUBf16PrecisionPlugin", +] diff --git a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py new file mode 100644 index 0000000000000..2201db94586a2 --- /dev/null +++ b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py @@ -0,0 +1,65 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Optional, Union + +import torch + +from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.utilities.enums import PrecisionType +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 + +if _TORCH_GREATER_EQUAL_1_12: + from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +else: + MixedPrecision = None # type: ignore[misc,assignment] + + +class FullyShardedNativeNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): + """Native AMP for Fully Sharded Native Training.""" + + def __init__( + self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None + ) -> None: + if not _TORCH_GREATER_EQUAL_1_12: + raise MisconfigurationException( + "`FullyShardedNativeNativeMixedPrecisionPlugin` is supported from PyTorch v1.12.0 onwards." + ) + super().__init__(precision, device, scaler=ShardedGradScaler() if scaler is None and precision == 16 else None) + + def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: + # see https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.clip_grad_norm_ + # section `Gradient Clipping`, using `torch.nn.utils.clip_grad_norm_` is incorrect + # for FSDP module. To overcome this, needs to call sharded_module.clip_grad_norm(clip_val) + # however we rely on LightningModule's configure_sharded_model to wrap FSDP, it would be hard to + # trace back the root FSDP. Now we only support clip by value. + raise MisconfigurationException( + f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" + ) + + @property + def mixed_precision_config(self) -> Optional[MixedPrecision]: + assert MixedPrecision is not None + if self.precision == PrecisionType.HALF: + dtype = torch.float16 + elif self.precision == PrecisionType.BFLOAT: + dtype = torch.bfloat16 + else: + raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") + return MixedPrecision( + param_dtype=dtype, + reduce_dtype=dtype, + buffer_dtype=dtype, + ) diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py index 8c693f2975bbd..870e658bfc9c3 100644 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py @@ -11,19 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
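The dtype mapping performed by the new plugin can be checked in isolation, mirroring the `test_precision_plugin_config` test further below (requires PyTorch 1.12+; no process group is needed just to construct the plugin):

```python
import torch
from pytorch_lightning.plugins.precision.fsdp_native_native_amp import (
    FullyShardedNativeNativeMixedPrecisionPlugin,
)

plugin = FullyShardedNativeNativeMixedPrecisionPlugin(precision="bf16", device="cuda")
config = plugin.mixed_precision_config

# Parameters, gradient reductions, and buffers all share the requested dtype.
assert config.param_dtype == config.reduce_dtype == config.buffer_dtype == torch.bfloat16
```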
-from typing import Any, Optional - -import torch +from typing import Any from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 - -if _TORCH_GREATER_EQUAL_1_12: - from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision -else: - MixedPrecision = None class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): @@ -38,18 +29,3 @@ def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: raise MisconfigurationException( f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" ) - - @property - def mixed_precision_config(self) -> Optional[MixedPrecision]: - assert MixedPrecision is not None - if self.precision == PrecisionType.HALF: - dtype = torch.float16 - elif self.precision == PrecisionType.BFLOAT: - dtype = torch.bfloat16 - else: - raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") - return MixedPrecision( - param_dtype=dtype, - reduce_dtype=dtype, - buffer_dtype=dtype, - ) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 4c351f26fa3b9..9b927aa757d17 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -23,7 +23,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast @@ -158,7 +158,7 @@ def mixed_precision_config(self) -> Optional[MixedPrecision]: if self.mixed_precision: return self.mixed_precision plugin = self.precision_plugin - if isinstance(plugin, FullyShardedNativeMixedPrecisionPlugin): + if isinstance(plugin, FullyShardedNativeNativeMixedPrecisionPlugin): return plugin.mixed_precision_config @property diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index bd879cf85ff7a..44c3b3ec7540a 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -53,6 +53,7 @@ TorchElasticEnvironment, ) from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies import ( DDP2Strategy, DDPFullyShardedNativeStrategy, @@ -727,7 +728,9 @@ def _check_and_init_precision(self) -> PrecisionPlugin: if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) - if isinstance(self.strategy, 
(DDPFullyShardedStrategy, DDPFullyShardedNativeStrategy)): + if isinstance(self.strategy, DDPFullyShardedNativeStrategy): + return FullyShardedNativeNativeMixedPrecisionPlugin(self._precision_flag, device) + if isinstance(self.strategy, DDPFullyShardedStrategy): return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) return NativeMixedPrecisionPlugin(self._precision_flag, device) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index 74f9534c47ce3..ede201da1f68f 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -7,7 +7,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 @@ -35,7 +35,7 @@ def test_invalid_on_cpu(tmpdir): @RunIf(min_torch="1.12", min_cuda_gpus=1) @pytest.mark.parametrize("precision, expected", [(16, torch.float16), ("bf16", torch.bfloat16)]) def test_precision_plugin_config(precision, expected): - plugin = FullyShardedNativeMixedPrecisionPlugin(precision=precision, device="cuda") + plugin = FullyShardedNativeNativeMixedPrecisionPlugin(precision=precision, device="cuda") config = plugin.mixed_precision_config assert config.param_dtype == expected assert config.buffer_dtype == expected @@ -96,6 +96,7 @@ def on_predict_batch_end(self, outputs: Optional[Any], batch: Any, batch_idx: in def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.layer, FullyShardedDataParallel) + assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) assert isinstance(self.layer.module[0], FullyShardedDataParallel) assert isinstance(self.layer.module[2], FullyShardedDataParallel) # root should not be resharding From e2c50eccba62a59e9659849db6d5649a62fc9d71 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 15 Aug 2022 20:06:29 +0200 Subject: [PATCH 25/30] add more issues types (#14174) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add more issues types * Update .github/ISSUE_TEMPLATE/config.yml Co-authored-by: Mansy * typo Co-authored-by: Adrian Wälchli Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Co-authored-by: Mansy Co-authored-by: Adrian Wälchli Co-authored-by: Laverne Henderson Co-authored-by: Akihiro Nitta --- .github/ISSUE_TEMPLATE/config.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 0fe790310f247..f71844e9664fe 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,14 @@ blank_issues_enabled: false contact_links: - - name: Ask a Question + - name: ❓ Ask a Question url: https://github.com/Lightning-AI/lightning/discussions/new - about: Ask and answer Lightning related questions - - name: 💬 Slack + about: Ask and answer Lightning related 
questions. + - name: 💬 Chat with us url: https://www.pytorchlightning.ai/community - about: Chat with our community + about: Live chat with experts, engineers, and users in our Slack community. + - name: 📖 Read the documentation + url: https://lightning.ai/lightning-docs/ + about: Please consult the documentation before opening any issues! + - name: 🙋 Contact us about professional services + url: https://lightning.ai + about: Contact the Lightning.ai sales team for paid support. From 9237eda7e2fdce220f63e71eb4101be47b263df6 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 17 Aug 2022 18:10:24 +0200 Subject: [PATCH 26/30] CI: clean building docs (#14216) * CI: clean building docs * group * . --- .github/checkgroup.yml | 2 ++ .github/workflows/docs-checks.yml | 7 ++++--- requirements/app/docs.txt | 17 ++++------------- requirements/docs.txt | 13 +++++++++++++ requirements/pytorch/docs.txt | 15 ++------------- 5 files changed, 25 insertions(+), 29 deletions(-) create mode 100644 requirements/docs.txt diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 0cb80d6e34bd8..a29deb705295e 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -64,6 +64,7 @@ subprojects: paths: - "docs/source-pytorch/**" - ".github/workflows/docs-*.yml" + - "requirements/docs.txt" - "requirements/pytorch/**" checks: - "doctest (pytorch)" @@ -133,6 +134,7 @@ subprojects: paths: - "docs/source-app/**" - ".github/workflows/docs-*.yml" + - "requirements/docs.txt" - "requirements/app/**" checks: - "doctest (app)" diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 977118b644ef3..5b5a9aec778be 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -42,13 +42,13 @@ jobs: - name: Install dependencies env: FREEZE_REQUIREMENTS: 1 + PACKAGE_NAME: ${{ matrix.pkg }} run: | sudo apt-get update sudo apt-get install -y cmake pandoc pip --version - pip install -q fire # python -m pip install --upgrade --user pip - pip install -e . --quiet -r requirements/${{ matrix.pkg }}/base.txt -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install -e . --quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html pip install -r requirements/${{ matrix.pkg }}/devel.txt pip list shell: bash @@ -91,11 +91,12 @@ jobs: - name: Install dependencies env: FREEZE_REQUIREMENTS: 1 + PACKAGE_NAME: ${{ matrix.pkg }} run: | sudo apt-get update sudo apt-get install -y cmake pandoc pip --version - pip install -e . --quiet -r requirements/${{ matrix.pkg }}/base.txt -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install -e . 
--quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux sudo apt-get update && sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures pip list diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt index 63ac1f289331f..c189d6034ab28 100644 --- a/requirements/app/docs.txt +++ b/requirements/app/docs.txt @@ -1,17 +1,8 @@ -sphinx>=4.0,<5.0 -myst-parser>=0.15,<0.17 -nbsphinx>=0.8.5, <=0.8.9 +-r ../docs.txt + ipython[notebook] ipython_genutils -pandoc>=1.0, <=2.2 -docutils>=0.16, <0.19 -sphinxcontrib-fulltoc>=1.0, <=1.2.0 -sphinxcontrib-mockautodoc +pytorch-lightning -https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31-rc1.zip -sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1, <=0.5.4 -sphinx-togglebutton>=0.2, <=0.3.2 -sphinx-copybutton>=0.3, <=0.5.0 sphinx-autobuild -jinja2>=3.0.0,<3.1.0 +https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 0000000000000..1b00471602c60 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,13 @@ +sphinx>=4.0, <5.0 +myst-parser>=0.15, <0.17 +nbsphinx>=0.8.5, <=0.8.9 +pandoc>=1.0, <=2.2 +docutils>=0.16, <0.19 +sphinxcontrib-fulltoc>=1.0, <=1.2.0 +sphinxcontrib-mockautodoc +sphinx-autodoc-typehints>=1.11, <1.15 # strict; v1.15 failing on master (#11405) +sphinx-paramlinks>=0.5.1, <=0.5.4 +sphinx-togglebutton>=0.2, <=0.3.2 +sphinx-copybutton>=0.3, <=0.5.0 +sphinx-multiproject +jinja2>=3.0.0,<3.1.0 diff --git a/requirements/pytorch/docs.txt b/requirements/pytorch/docs.txt index 50e7c2049f6f6..474620b1e74b8 100644 --- a/requirements/pytorch/docs.txt +++ b/requirements/pytorch/docs.txt @@ -1,17 +1,6 @@ -sphinx>=4.0,<5.0 -myst-parser>=0.15,<0.17 -nbsphinx>=0.8.5, <=0.8.9 +-r ../docs.txt + ipython[notebook] -pandoc>=1.0, <=2.2 -docutils>=0.16, <0.19 -sphinxcontrib-fulltoc>=1.0, <=1.2.0 -sphinxcontrib-mockautodoc pt-lightning-sphinx-theme @ https://github.com/Lightning-AI/lightning_sphinx_theme/archive/master.zip -sphinx-autodoc-typehints>=1.11,<1.15 # strict; v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1, <=0.5.4 -sphinx-togglebutton>=0.2, <=0.3.2 -sphinx-copybutton>=0.3, <=0.5.0 -typing-extensions # already in `requirements.txt` but the docs CI job does not install it -jinja2>=3.0.0,<3.1.0 -r ../../_notebooks/.actions/requirements.txt From 20984f5a39dc3a55d62782fefe3dbb98ab3a7520 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 17 Aug 2022 17:39:27 +0200 Subject: [PATCH 27/30] CI: docker focus on PL only (#14246) * CI: docker focus on PL only * group --- .github/checkgroup.yml | 7 +++++++ .../{cicd-pytorch-dockers.yml => ci-pytorch-dockers.yml} | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) rename .github/workflows/{cicd-pytorch-dockers.yml => ci-pytorch-dockers.yml} (99%) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index a29deb705295e..c2654eddd7ca1 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -73,6 +73,13 @@ subprojects: - id: "pytorch_lightning: Docker" paths: - "dockers/**" + - "!dockers/README.md" + - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" + - "environment.yml" + - ".github/workflows/*docker*.yml" + - "setup.py" checks: - "build-conda (3.8, 1.10)" - "build-conda (3.8, 1.9)" diff --git 
a/.github/workflows/cicd-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml similarity index 99% rename from .github/workflows/cicd-pytorch-dockers.yml rename to .github/workflows/ci-pytorch-dockers.yml index 84051cafd82d8..a05dbbb5bc8ef 100644 --- a/.github/workflows/cicd-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -8,8 +8,9 @@ on: paths: - "dockers/**" - "!dockers/README.md" - - "requirements/**" - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" - "environment.yml" - ".github/workflows/*docker*.yml" - "setup.py" From b4250e511eea1a9f286f64e56ae22b9f5042aeb7 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 17 Aug 2022 18:11:52 +0200 Subject: [PATCH 28/30] Allowed setting attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks (#14212) Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- src/pytorch_lightning/CHANGELOG.md | 1 + src/pytorch_lightning/lite/lite.py | 8 +- src/pytorch_lightning/strategies/ipu.py | 6 +- .../trainer/connectors/data_connector.py | 8 +- src/pytorch_lightning/utilities/data.py | 136 +++++++++++++----- tests/tests_pytorch/lite/test_lite.py | 2 +- tests/tests_pytorch/utilities/test_data.py | 136 +++++++++++++++--- 7 files changed, 227 insertions(+), 70 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 80f6f71a03515..8001b20924c2b 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -28,6 +28,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Avoid raising the sampler warning if num_replicas=1 ([#14097](https://github.com/Lightning-AI/lightning/pull/14097)) - Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) - Avoided requiring the FairScale package to use precision with the fsdp native strategy ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) +- Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) ## [1.7.1] - 2022-08-09 diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 981eed30635f6..ca45a4011fcdd 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -35,7 +35,7 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, - _replace_init_method, + _replace_dunder_methods, _update_dataloader, has_iterable_dataset, ) @@ -403,9 +403,9 @@ def _run_impl(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: def _run_with_strategy_setup(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: self._strategy.setup_environment() - with self._strategy.model_sharded_context(), _replace_init_method(DataLoader, "dataset"), _replace_init_method( - BatchSampler - ): + with self._strategy.model_sharded_context(), _replace_dunder_methods( + DataLoader, "dataset" + ), _replace_dunder_methods(BatchSampler): return run_method(*args, **kwargs) def _move_model_to_device(self, model: nn.Module, optimizers: List[Optimizer]) -> nn.Module: diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 3c630403dafce..7dec5ba4bffe0 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ 
b/src/pytorch_lightning/strategies/ipu.py @@ -30,7 +30,7 @@ from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.cloud_io import get_filesystem -from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs +from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs, _reinstantiate_wrapped_cls from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden @@ -239,7 +239,9 @@ def _convert_to_poptorch_loader( dataloader, sampler, mode, self.replication_factor > 1 ) opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts - dataloader = poptorch.DataLoader(opts, *dl_args, **dl_kwargs) + dataloader = _reinstantiate_wrapped_cls( + dataloader, opts, *dl_args, explicit_cls=poptorch.DataLoader, **dl_kwargs + ) return dataloader def _handle_gradient_accumulation_steps(self) -> None: diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index 6e592b9f6d310..e20eac2ffae57 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -31,7 +31,7 @@ from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, _is_dataloader_shuffled, - _replace_init_method, + _replace_dunder_methods, _update_dataloader, has_iterable_dataset, has_len_all_ranks, @@ -428,9 +428,11 @@ def _request_dataloader(self, stage: RunningStage) -> Union[DataLoader, List[Dat """ source = getattr(self, f"_{stage.dataloader_prefix}_dataloader_source") - with _replace_init_method(DataLoader, "dataset"), _replace_init_method(BatchSampler): + with _replace_dunder_methods(DataLoader, "dataset"), _replace_dunder_methods(BatchSampler): # under this context manager, the arguments passed to `DataLoader.__init__` will be captured and saved as - # attributes on the instance in case the dataloader needs to be re-instantiated later by Lightning + # attributes on the instance in case the dataloader needs to be re-instantiated later by Lightning. + # Also, it records all attribute setting and deletion using patched `__setattr__` and `__delattr__` + # methods so that the re-instantiated object is as close to the original as possible. 
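The two comment lines above summarize the heart of this patch. A short, hedged sketch of the user-visible effect (the hook body and the `tag` attribute are illustrative, not taken from the diff):

from torch.utils.data import DataLoader

def train_dataloader():
    loader = DataLoader(range(32), batch_size=4)
    loader.tag = "train"  # attribute set after construction; previously lost
    return loader

# When Lightning re-instantiates the returned loader (for example to inject a
# DistributedSampler), the recorded `__setattr__` call is replayed on the new
# instance, so the rebuilt loader still carries `tag == "train"`.
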
dataloader = source.dataloader() if isinstance(dataloader, tuple): dataloader = list(dataloader) diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index f2d3040125141..b4d9d4dec5817 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -37,7 +37,7 @@ from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.apply_func import _is_dataclass_instance from pytorch_lightning.utilities.auto_restart import CaptureIterableDataset, CaptureMapDataset, FastForwardSampler -from pytorch_lightning.utilities.enums import _FaultTolerantMode +from pytorch_lightning.utilities.enums import _FaultTolerantMode, LightningEnum from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.seed import pl_worker_init_function @@ -48,6 +48,18 @@ warning_cache = WarningCache() +class _WrapAttrTag(LightningEnum): + SET = "set" + DEL = "del" + + def __call__(self, *args): + if self == self.SET: + fn = setattr + else: + fn = delattr + return fn(*args) + + def _extract_batch_size(batch: BType) -> Generator[int, None, None]: if isinstance(batch, Tensor): if batch.ndim == 0: @@ -188,27 +200,7 @@ def _update_dataloader( dataloader: DataLoader, sampler: Union[Sampler, Iterable], mode: Optional[RunningStage] = None ) -> DataLoader: dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode) - dl_cls = type(dataloader) - try: - dataloader = dl_cls(*dl_args, **dl_kwargs) - except TypeError as e: - # improve exception message due to an incorrect implementation of the `DataLoader` where multiple subclass - # `__init__` arguments map to one `DataLoader.__init__` argument - import re - - match = re.match(r".*__init__\(\) got multiple values .* '(\w+)'", str(e)) - if not match: - # an unexpected `TypeError`, continue failure - raise - argument = match.groups()[0] - message = ( - f"The {dl_cls.__name__} `DataLoader` implementation has an error where more than one `__init__` argument" - f" can be passed to its parent's `{argument}=...` `__init__` argument. This is likely caused by allowing" - f" passing both a custom argument that will map to the `{argument}` argument as well as `**kwargs`." - f" `kwargs` should be filtered to make sure they don't contain the `{argument}` key." - " This argument was automatically passed to your DataLoader by PyTorch Lightning." - ) - raise MisconfigurationException(message) from e + dataloader = _reinstantiate_wrapped_cls(dataloader, *dl_args, **dl_kwargs) return dataloader @@ -374,7 +366,7 @@ def _dataloader_init_kwargs_resolve_sampler( "this, expose an argument `sampler` in the `__init__` method of your custom class." 
) - batch_sampler = batch_sampler_cls(*args, **kwargs) + batch_sampler = _reinstantiate_wrapped_cls(batch_sampler, *args, **kwargs) else: try: batch_sampler = batch_sampler_cls( @@ -449,6 +441,37 @@ def _auto_add_worker_init_fn(dataloader: DataLoader, rank: int) -> None: dataloader.worker_init_fn = partial(pl_worker_init_function, rank=rank) +def _reinstantiate_wrapped_cls(orig_object: Any, *args: Any, explicit_cls: Optional[Type] = None, **kwargs: Any) -> Any: + constructor = type(orig_object) if explicit_cls is None else explicit_cls + + try: + result = constructor(*args, **kwargs) + except TypeError as e: + # improve exception message due to an incorrect implementation of the `DataLoader` where multiple subclass + # `__init__` arguments map to one `DataLoader.__init__` argument + import re + + match = re.match(r".*__init__\(\) got multiple values .* '(\w+)'", str(e)) + if not match: + # an unexpected `TypeError`, continue failure + raise + argument = match.groups()[0] + message = ( + f"The {constructor.__name__} implementation has an error where more than one `__init__` argument" + f" can be passed to its parent's `{argument}=...` `__init__` argument. This is likely caused by allowing" + f" passing both a custom argument that will map to the `{argument}` argument as well as `**kwargs`." + f" `kwargs` should be filtered to make sure they don't contain the `{argument}` key." + " This argument was automatically passed to your object by PyTorch Lightning." + ) + raise MisconfigurationException(message) from e + + attrs_record = getattr(orig_object, "__pl_attrs_record", list()) + for args, fn in attrs_record: + fn(result, *args) + + return result + + def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) -> Callable: """Wraps the ``__init__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" @@ -457,6 +480,8 @@ def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: # We need to inspect `init`, as inspecting `obj.__init__` # can lead to inspecting the wrong function with multiple inheritance + old_inside_init = getattr(obj, "__pl_inside_init", False) + object.__setattr__(obj, "__pl_inside_init", True) params = inspect.signature(init).parameters parameters_defaults = OrderedDict( @@ -474,21 +499,49 @@ def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: } if not hasattr(obj, "__pl_saved_args"): - obj.__pl_saved_args = args - obj.__pl_saved_kwargs = kwargs - obj.__pl_saved_arg_names = param_names - obj.__pl_saved_default_kwargs = default_kwargs + object.__setattr__(obj, "__pl_saved_args", args) + object.__setattr__(obj, "__pl_saved_kwargs", kwargs) + object.__setattr__(obj, "__pl_saved_arg_names", param_names) + object.__setattr__(obj, "__pl_saved_default_kwargs", default_kwargs) # We want to use the latest possible value for explicit argument (i.e. ideally what gets passed to base class) # so that we can be sure, that it will not get changed anymore. 
# That is why we are setting this in every `__init__` if store_explicit_arg is not None: if store_explicit_arg in param_names: - setattr(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) + object.__setattr__(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) elif store_explicit_arg in kwargs: - setattr(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) + object.__setattr__(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) init(obj, *args, **kwargs) + object.__setattr__(obj, "__pl_inside_init", old_inside_init) + + return wrapper + + +def _wrap_attr_method(method: Callable, tag: _WrapAttrTag) -> Callable: + """Wraps the ``__setattr__`` or ``__delattr__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and + :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" + + @functools.wraps(method) + def wrapper(obj: Any, *args: Any): + # First, let's find out if we're the first in inheritance chain calling the patched method. + name, *_ = args + prev_call_name, prev_call_method = getattr(obj, "__pl_current_call", (None, "method")) + first_call = not (prev_call_name == name and prev_call_method == tag) + + # Then mark the current called method + object.__setattr__(obj, "__pl_current_call", (name, tag)) + + # call original method + method(obj, *args) + if first_call and not getattr(obj, "__pl_inside_init", True): + # and save the value it was called with to the internal list, + # if we're outside of __init__ and the original call did not fail and we're the first call + attrs_record = getattr(obj, "__pl_attrs_record", list()) + attrs_record.append((args, tag)) + object.__setattr__(obj, "__pl_attrs_record", attrs_record) + object.__setattr__(obj, "__pl_current_call", (prev_call_name, prev_call_method)) return wrapper @@ -508,25 +561,34 @@ def recurse(cl: Type[Any]) -> None: @contextmanager -def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: +def _replace_dunder_methods(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: """This context manager is used to add support for re-instantiation of custom (subclasses) of `base_cls`. - It patches the ``__init__`` method. + It patches the ``__init__``, ``__setattr__`` and ``__delattr__`` methods. """ classes = _get_all_subclasses(base_cls) | {base_cls} for cls in classes: # Check that __init__ belongs to the class # https://stackoverflow.com/a/5253424 if "__init__" in cls.__dict__: - cls._old_init = cls.__init__ + cls.__old__init__ = cls.__init__ cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) + + # we want at least one setattr/delattr in the chain to be patched and it can happen, that none of the subclasses + # implement `__setattr__`/`__delattr__`. 
Therefore, we are always patching the `base_cls` + for patch_fn_name, tag in (("__setattr__", _WrapAttrTag.SET), ("__delattr__", _WrapAttrTag.DEL)): + if patch_fn_name in cls.__dict__ or cls is base_cls: + saved_name = f"__old{patch_fn_name}" + setattr(cls, saved_name, getattr(cls, patch_fn_name)) + setattr(cls, patch_fn_name, _wrap_attr_method(getattr(cls, patch_fn_name), tag)) yield for cls in classes: - # Check that _old_init belongs to the class - # https://stackoverflow.com/a/5253424 - if "_old_init" in cls.__dict__: - cls.__init__ = cls._old_init - del cls._old_init + for patched_name in ("__setattr__", "__delattr__", "__init__"): + # Check that __old__{init,setattr,delattr} belongs to the class + # https://stackoverflow.com/a/5253424 + if f"__old{patched_name}" in cls.__dict__: + setattr(cls, patched_name, getattr(cls, f"__old{patched_name}")) + delattr(cls, f"__old{patched_name}") def _wrap_with_capture_dataset(dataset: Dataset) -> Dataset: diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 86a0a5a82195a..d45046f249d54 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -177,7 +177,7 @@ def test_setup_dataloaders_return_type(): assert lite_dataloader1.dataset is dataset1 -@mock.patch("pytorch_lightning.lite.lite._replace_init_method") +@mock.patch("pytorch_lightning.lite.lite._replace_dunder_methods") def test_setup_dataloaders_captures_dataloader_arguments(ctx_manager): """Test that Lite intercepts the DataLoader constructor arguments with a context manager in its run method.""" diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index 5b0087a245924..9b7abf0d90a88 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -13,9 +13,10 @@ from pytorch_lightning.utilities.data import ( _dataloader_init_kwargs_resolve_sampler, _get_dataloader_init_args_and_kwargs, - _replace_init_method, + _replace_dunder_methods, _replace_value_in_saved_args, _update_dataloader, + _WrapAttrTag, extract_batch_size, get_len, has_iterable_dataset, @@ -145,10 +146,10 @@ def __init__(self, foo, *args, **kwargs): super().__init__(foo, *args, **kwargs) dataloader = BadStandaloneGoodHookImpl([1, 2, 3]) - with pytest.raises(MisconfigurationException, match="`DataLoader` implementation has an error.*`dataset`"): + with pytest.raises(MisconfigurationException, match="implementation has an error.*`dataset`"): _update_dataloader(dataloader, dataloader.sampler) - with _replace_init_method(DataLoader, "dataset"): + with _replace_dunder_methods(DataLoader, "dataset"): dataloader = BadStandaloneGoodHookImpl([1, 2, 3]) new_dataloader = _update_dataloader(dataloader, dataloader.sampler) assert isinstance(new_dataloader, BadStandaloneGoodHookImpl) @@ -160,7 +161,7 @@ def __init__(self, randomize, *args, **kwargs): super().__init__(*args, shuffle=randomize, **kwargs) dataloader = BadImpl(False, []) - with pytest.raises(MisconfigurationException, match="`DataLoader` implementation has an error.*`shuffle`"): + with pytest.raises(MisconfigurationException, match="implementation has an error.*`shuffle`"): _update_dataloader(dataloader, dataloader.sampler) class GoodImpl(DataLoader): @@ -174,28 +175,33 @@ def __init__(self, randomize, *args, **kwargs): assert isinstance(new_dataloader, GoodImpl) -def test_replace_init_method_multiple_loaders_without_init(): +def test_replace_dunder_methods_multiple_loaders_without_init(): """In case of a 
class that inherits from a class that we are patching, but doesn't define its own `__init__`
-    method (the one we are wrapping), it can happen, that `hasattr(cls, "_old_init")` is True because of parent
+    method (the one we are wrapping), it can happen that `hasattr(cls, "__old__init__")` is True because of the parent
     class, but it is impossible to delete, because that method is owned by the parent class. Furthermore, the error
     occurred only sometimes because it depends on the order in which we are iterating over a set of classes we are
     patching.
 
     This test simulates the behavior by generating a sufficient number of dummy classes, which do not define `__init__`
-    and are children of `DataLoader`. We are testing that a) context manager `_replace_init_method` exits cleanly, and
-    b) the mechanism checking for presence of `_old_init` works as expected.
+    and are children of `DataLoader`. We are testing that a) the context manager `_replace_dunder_methods` exits cleanly,
+    and b) the mechanism checking for the presence of `__old__init__` works as expected.
     """
     classes = [DataLoader]
     for i in range(100):
         classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {}))
 
-    with _replace_init_method(DataLoader, "dataset"):
+    before = {cls: cls.__init__ for cls in classes}
+
+    with _replace_dunder_methods(DataLoader, "dataset"):
         for cls in classes[1:]:  # First one is `DataLoader`
-            assert "_old_init" not in cls.__dict__
-            assert hasattr(cls, "_old_init")
+            assert "__old__init__" not in cls.__dict__
+            assert hasattr(cls, "__old__init__")
+
+        assert "__old__init__" in DataLoader.__dict__
+        assert hasattr(DataLoader, "__old__init__")
 
-        assert "_old_init" in DataLoader.__dict__
-        assert hasattr(DataLoader, "_old_init")
+    for cls in classes:
+        assert before[cls] == cls.__init__
 
 
 class DataLoaderSubclass1(DataLoader):
@@ -323,8 +329,8 @@ def __init__(self, dataset, **kwargs):
         pytest.param(ChangingDataLoader, (range(5),), dict(), ("dataset",), list(range(10)), dict(), id="test9"),
     ],
 )
-def test_replace_init_method_dataloader(cls, args, kwargs, arg_names, dataset, checked_values):
-    with _replace_init_method(DataLoader, "dataset"):
+def test_replace_dunder_methods_dataloader(cls, args, kwargs, arg_names, dataset, checked_values):
+    with _replace_dunder_methods(DataLoader, "dataset"):
         dataloader = cls(*args, **kwargs)
 
     assert dataloader.__pl_saved_args == args
@@ -361,12 +367,12 @@ def test_replace_init_method_dataloader(cls, args, kwargs, arg_names, dataset, c
             assert dataloader_value == value
 
 
-def test_replace_init_method_extra_kwargs():
+def test_replace_dunder_methods_extra_kwargs():
     class LoaderSubclass(DataLoader):
         def __init__(self, dataset, *args, batch_size=10, **kwargs):
             super().__init__(dataset, *args, batch_size=batch_size, **kwargs)
 
-    with _replace_init_method(DataLoader, "dataset"):
+    with _replace_dunder_methods(DataLoader, "dataset"):
         dataloader = LoaderSubclass(range(10))
 
     assert dataloader.__pl_saved_args == (range(10),)
@@ -376,6 +382,90 @@ def __init__(self, dataset, *args, batch_size=10, **kwargs):
     assert dataloader.__dataset == range(10)
 
 
+def test_replace_dunder_methods_attrs():
+    """This test checks that all the calls from setting and deleting attributes within `_replace_dunder_methods`
+    are correctly preserved even after reinstantiation.
+
+    It also includes a custom `__setattr__`.
+    """
+
+    class Loader(DataLoader):
+        def __setattr__(self, attr, val):
+            if attr == "custom_arg":
+                val = val + 2
+            super().__setattr__(attr, val)
+
+    with _replace_dunder_methods(DataLoader, "dataset"):
+        dataloader = Loader(range(10))
+        dataloader.custom_arg = 5
+        dataloader.my_arg = 10
+        dataloader.another_arg = 100
+        del dataloader.dataset
+        try:
+            del dataloader.abc_arg
+        except AttributeError:
+            pass
+
+    assert dataloader.__pl_saved_args == (range(10),)
+    assert dataloader.__pl_saved_kwargs == {}
+    assert dataloader.__pl_saved_arg_names == ("dataset",)
+    assert dataloader.__dataset == range(10)
+    assert dataloader.custom_arg == 7
+    assert dataloader.my_arg == 10
+    assert dataloader.another_arg == 100
+    assert not hasattr(dataloader, "dataset")
+    assert dataloader.__pl_attrs_record == [
+        (("custom_arg", 5), _WrapAttrTag.SET),
+        (("my_arg", 10), _WrapAttrTag.SET),
+        (("another_arg", 100), _WrapAttrTag.SET),
+        (("dataset",), _WrapAttrTag.DEL),
+    ]
+
+    dataloader = _update_dataloader(dataloader, dataloader.sampler)
+    assert dataloader.custom_arg == 7
+    assert dataloader.my_arg == 10
+    assert dataloader.another_arg == 100
+    assert not hasattr(dataloader, "dataset")
+
+
+def test_replace_dunder_methods_restore_methods():
+    """This test checks whether all dunder methods are restored to their original versions."""
+
+    class Init(DataLoader):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+
+    class SetAttr(DataLoader):
+        def __setattr__(self, *args):
+            return super().__setattr__(*args)
+
+    class DelAttr(DataLoader):
+        def __delattr__(self, *args):
+            return super().__delattr__(*args)
+
+    class InitAndSetAttr(Init, SetAttr):
+        pass
+
+    class InitAndDelAttr(Init, DelAttr):
+        pass
+
+    class SetAttrAndDelAttr(SetAttr, DelAttr):
+        pass
+
+    class AllDunder(Init, SetAttr, DelAttr):
+        pass
+
+    before = dict()
+    for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder):
+        before[cls] = {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__}
+
+    with _replace_dunder_methods(DataLoader, "dataset"):
+        pass
+
+    for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder):
+        assert before[cls] == {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__}
+
+
 @pytest.mark.parametrize("predicting", [True, False])
 def test_custom_batch_sampler(predicting):
     """This test asserts that a custom `BatchSampler`, with all the arguments that are required in order to
@@ -392,8 +482,8 @@ def __init__(self, sampler, extra_arg, drop_last=True):
             super().__init__(sampler, 10, drop_last)
 
     sampler = RandomSampler(range(10))
-    with _replace_init_method(BatchSampler):
-        # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks
+    with _replace_dunder_methods(BatchSampler):
+        # instantiate within the `_replace_dunder_methods` context manager, simulating `*_dataloader` hooks
         batch_sampler =
MyBatchSampler(sampler, "random_str") dataloader = DataLoader(range(10), batch_sampler=batch_sampler) @@ -465,8 +555,8 @@ def __init__(self, extra_arg): self.extra_arg = extra_arg super().__init__(RandomSampler(range(10)), 10, False) - with _replace_init_method(BatchSampler): - # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + with _replace_dunder_methods(BatchSampler): + # instantiate within `_replace_dunder_method` context manager, simulating `*_dataloader` hooks batch_sampler = MyBatchSampler("random_str") dataloader = DataLoader(range(10), batch_sampler=batch_sampler) From 1a65a9ad7a5e36f31ddb92efcbb3655d0d1a299a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 17 Aug 2022 18:27:28 +0200 Subject: [PATCH 29/30] Revert "Remove skipping logic in favor of path filtering (#14170)" (#14244) --- .azure/gpu-tests.yml | 60 ++++++++++--------- .github/file-filters.yml | 9 +++ .github/workflows/ci-app-cloud-e2e-test.yml | 28 +++++++-- .github/workflows/ci-app-examples.yml | 7 --- .github/workflows/ci-app-tests.yml | 4 +- .github/workflows/ci-pytorch-test-conda.yml | 35 ++++++++--- .github/workflows/ci-pytorch-test-full.yml | 56 ++++++++++++----- .github/workflows/ci-pytorch-test-slow.yml | 40 +++++++++---- .../precision/fsdp_native_native_amp.py | 2 +- 9 files changed, 165 insertions(+), 76 deletions(-) create mode 100644 .github/file-filters.yml diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 8444468c0c58a..683212cd55d4b 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -12,31 +12,15 @@ trigger: - "master" - "release/*" - "refs/tags/*" - paths: - include: - - ".azure/**" - - "examples/run_ddp_examples.sh" - - "examples/convert_from_pt_to_pl/**" - - "examples/run_pl_examples.sh" - - "examples/pl_basics/backbone_image_classifier.py" - - "examples/pl_basics/autoencoder.py" - - "examples/pl_loops/mnist_lite.py" - - "examples/pl_fault_tolerant/automatic.py" - - "examples/test_pl_examples.py" - - "examples/pl_integrations/dali_image_classifier.py" - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" - - "pyproject.toml" - - ".github/workflows/ci-pytorch*.yml" - - ".github/workflows/docs-*.yml" - pr: - "master" - "release/*" +variables: + - name: continue + value: '1' + jobs: - job: testing strategy: @@ -57,6 +41,22 @@ jobs: clean: all steps: + + - bash: | + CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + echo $CHANGED_FILES > changed_files.txt + MATCHES=$(cat changed_files.txt | grep -E $FILTER) + echo $MATCHES + if [ -z "$MATCHES" ]; then + echo "Skip" + echo "##vso[task.setvariable variable=continue]0" + else + echo "Continue" + echo "##vso[task.setvariable variable=continue]1" + fi + displayName: Skipper + - bash: | lspci | egrep 'VGA|3D' whereis nvidia @@ -66,6 +66,7 @@ jobs: pip --version pip list displayName: 'Image info & NVIDIA' + condition: eq(variables['continue'], '1') - bash: | set -e @@ -81,6 +82,7 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' + condition: eq(variables['continue'], '1') - bash: | set -e @@ -89,13 +91,16 @@ jobs: python requirements/pytorch/check-avail-strategies.py python requirements/pytorch/check-avail-extras.py displayName: 'Env details' + condition: eq(variables['continue'], '1') - bash: bash .actions/pull_legacy_checkpoints.sh displayName: 'Get legacy checkpoints' + condition: eq(variables['continue'], '1') - bash: python -m coverage run --source pytorch_lightning -m pytest workingDirectory: src/pytorch_lightning displayName: 'Testing: PyTorch doctests' + condition: eq(variables['continue'], '1') - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 env: @@ -103,6 +108,7 @@ jobs: workingDirectory: tests/tests_pytorch displayName: 'Testing: PyTorch standard' timeoutInMinutes: "35" + condition: eq(variables['continue'], '1') - bash: bash run_standalone_tests.sh workingDirectory: tests/tests_pytorch @@ -111,14 +117,7 @@ jobs: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch standalone tests' timeoutInMinutes: "35" - - - bash: bash run_standalone_tasks.sh - workingDirectory: tests/tests_pytorch - env: - PL_USE_MOCKED_MNIST: "1" - PL_RUN_CUDA_TESTS: "1" - displayName: 'Testing: PyTorch standalone tasks' - timeoutInMinutes: "10" + condition: eq(variables['continue'], '1') - bash: | python -m coverage report @@ -128,13 +127,14 @@ jobs: ls -l workingDirectory: tests/tests_pytorch displayName: 'Statistics' + condition: eq(variables['continue'], '1') - task: PublishTestResults@2 displayName: 'Publish test results' inputs: testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: succeededOrFailed() + condition: and(succeededOrFailed(), eq(variables['continue'], '1')) - script: | set -e @@ -146,9 +146,11 @@ jobs: env: PL_USE_MOCKED_MNIST: "1" displayName: 'Testing: PyTorch examples' + condition: eq(variables['continue'], '1') - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0 workingDirectory: tests/tests_pytorch env: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch benchmarks' + condition: eq(variables['continue'], '1') diff --git a/.github/file-filters.yml b/.github/file-filters.yml new file mode 100644 index 0000000000000..e621cd83881e4 --- /dev/null +++ b/.github/file-filters.yml @@ -0,0 +1,9 @@ +# This file contains filters to be used in the CI to detect file changes and run the required CI jobs. 
app_examples:
+  - "src/lightning_app/**"
+  - "tests/tests_app_examples/**"
+  - "requirements/app/**"
+  - "examples/app_*"
+  - "setup.py"
+  - "src/pytorch_lightning/__version__.py"
diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml
index c50fee4caa285..81d5e70441771 100644
--- a/.github/workflows/ci-app-cloud-e2e-test.yml
+++ b/.github/workflows/ci-app-cloud-e2e-test.yml
@@ -7,19 +7,37 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
     branches: [master, "release/*"]
   pull_request:
     branches: [master, "release/*"]
-    paths:
-      - ".github/workflows/ci-app-cloud-e2e-test.yml"
-      - "requirements/app/**"
-      - "src/lightning_app/**"
-      - "examples/app_*"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/master' }}
 
 jobs:
+  # This job should run only once per PR to detect file changes, so that the required jobs can be triggered.
+  # See .github/file-filters.yml to define file filters and run the jobs based on the output of each filter.
+  # More info: https://github.com/marketplace/actions/paths-changes-filter
+
+  changes:
+    runs-on: ubuntu-latest
+    # Set job outputs to the values from the filter step
+    outputs:
+      app_examples: ${{ steps.filter.outputs.app_examples }}
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.8"
+
+      - uses: dorny/paths-filter@v2
+        id: filter
+        with:
+          filters: .github/file-filters.yml
+
   cloud-test:
     name: Cloud Test
+    needs: changes
+    if: ${{ needs.changes.outputs.app_examples == 'true' }}
     runs-on: ubuntu-20.04
     strategy:
       fail-fast: false
diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml
index 8114f59b01aaa..01570f59c2c77 100644
--- a/.github/workflows/ci-app-examples.yml
+++ b/.github/workflows/ci-app-examples.yml
@@ -6,13 +6,6 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
     branches: [master, "release/*"]
   pull_request:
     branches: [master, "release/*"]
-    paths:
-      - ".github/workflows/ci-app-examples.yml"
-      - "requirements/app/**"
-      - "src/lightning_app/**"
-      - "tests/tests_app_examples/**"
-      # the examples are used in the app CI
-      - "examples/app_*"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml
index fb2cdbda69079..fe3cc36dc16d3 100644
--- a/.github/workflows/ci-app-tests.yml
+++ b/.github/workflows/ci-app-tests.yml
@@ -6,10 +6,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran
     branches: [master, "release/*"]
   pull_request:
     paths:
-      - ".github/workflows/ci-app-tests.yml"
-      - "requirements/app/**"
       - "src/lightning_app/**"
       - "tests/tests_app/**"
+      - "requirements/app/**"
+      - "setup.py"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml
index d314a742bbdcb..3498f087ef0aa 100644
--- a/.github/workflows/ci-pytorch-test-conda.yml
+++ b/.github/workflows/ci-pytorch-test-conda.yml
@@ -6,12 +6,6 @@ on: # Trigger the workflow on push or pull request, but only for the master bra
     branches: [master, "release/*"]
   pull_request:
     branches: [master, "release/*"]
-    paths:
-      - "requirements/pytorch/**"
-      - "src/pytorch_lightning/**"
-      - "tests/tests_pytorch/**"
-      - "setup.cfg"  # includes pytest config
-      - 
".github/workflows/ci-pytorch-test-conda.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -41,7 +35,28 @@ jobs: - uses: actions/checkout@v2 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v23.1 + + - name: Decide if the test should be skipped + id: skip + shell: bash -l {0} + run: | + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt + MATCHES=$(cat changed_files.txt | grep -E $FILTER) + echo $MATCHES + if [ -z "$MATCHES" ]; then + echo "Skip" + echo "::set-output name=continue::0" + else + echo "Continue" + echo "::set-output name=continue::1" + fi + - name: Update base dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -55,10 +70,12 @@ jobs: run: pip install "Pillow<9.0" # It messes with torchvision - name: DocTests + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Update all dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -78,9 +95,11 @@ jobs: python requirements/pytorch/check-avail-extras.py - name: Pull legacy checkpoints + if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Testing PyTorch + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml @@ -92,7 +111,7 @@ jobs: if: failure() - name: Statistics - if: success() + if: ${{ success() && (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: | coverage report @@ -100,7 +119,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: success() + if: ${{ success() && (steps.skip.outputs.continue == '1') }} # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 386bb012b8cc6..173e2a44a61f4 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -7,12 +7,6 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] - paths: - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - - ".github/workflows/ci-pytorch-test-full.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -43,42 +37,67 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v23.1 + + - name: Decide if the test should be skipped + id: skip + shell: bash -l {0} + run: | + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt + MATCHES=$(cat changed_files.txt | grep -E $FILTER) + echo $MATCHES + if [ -z "$MATCHES" ]; then + echo "Skip" + echo "::set-output name=continue::0" + else + echo "Continue" + echo "::set-output 
name=continue::1" + fi + - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + if: ${{ (steps.skip.outputs.continue == '1') }} + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Reset caching + if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: basic setup + if: ${{ (steps.skip.outputs.continue == '1') }} run: | pip --version pip install -q fire # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS - if: ${{ (runner.os == 'macOS') }} + if: ${{ (runner.os == 'macOS') && (steps.skip.outputs.continue == '1') }} run: | brew install openmpi libuv # Horovod on macOS requires OpenMPI, Gloo not currently supported - name: Setup Windows - if: ${{ (runner.os == 'windows') }} + if: ${{ (runner.os == 'windows') && (steps.skip.outputs.continue == '1') }} run: | python .actions/assistant.py requirements_prune_pkgs horovod - name: Set min. dependencies - if: ${{ (matrix.requires == 'oldest') }} + if: ${{ (matrix.requires == 'oldest') && (steps.skip.outputs.continue == '1') }} run: | python .actions/assistant.py replace_oldest_ver # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Get pip cache dir + if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: echo "::set-output name=dir::$(pip cache dir)" - name: pip cache + if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -87,9 +106,11 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - name: Pull legacy checkpoints + if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -101,10 +122,12 @@ jobs: shell: bash - name: DocTests + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Install extra dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt @@ -113,7 +136,7 @@ jobs: shell: bash - name: Reinstall Horovod if necessary - if: ${{ (runner.os != 'windows') }} + if: ${{ (runner.os != 'windows') && (steps.skip.outputs.continue == '1') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -130,38 +153,43 @@ jobs: shell: bash - name: Cache datasets + if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: Datasets key: pl-dataset - name: Sanity check + if: ${{ (steps.skip.outputs.continue == '1') }} run: python requirements/pytorch/check-avail-extras.py - name: Testing PyTorch + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Upload pytest results - if: failure() + if: ${{ (failure()) 
&& (steps.skip.outputs.continue == '1') }} uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Prepare Examples + if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - name: Run Examples + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./examples run: python -m pytest test_pl_examples.py -v --durations=10 - name: Statistics - if: success() + if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: | coverage report @@ -169,7 +197,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: always() + if: ${{ (always()) && (steps.skip.outputs.continue == '1') }} # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 8e97ea90b2bc4..0bb9916ee302a 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -7,12 +7,6 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] - paths: - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - - ".github/workflows/ci-pytorch-test-slow.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -34,19 +28,43 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v23.1 + + - name: Decide if the test should be skipped + id: skip + shell: bash -l {0} + run: | + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt + MATCHES=$(cat changed_files.txt | grep -E $FILTER) + echo $MATCHES + if [ -z "$MATCHES" ]; then + echo "Skip" + echo "::set-output name=continue::0" + else + echo "Continue" + echo "::set-output name=continue::1" + fi + + - uses: actions/setup-python@v2 + if: ${{ (steps.skip.outputs.continue == '1') }} with: python-version: ${{ matrix.python-version }} - name: Reset caching + if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: Get pip cache + if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: | python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)" - name: Cache pip + if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -55,6 +73,7 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}- - name: Install dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ 
 -66,20 +85,21 @@ jobs:
       shell: bash
 
     - name: Testing PyTorch
+      if: ${{ (steps.skip.outputs.continue == '1') }}
       working-directory: tests/tests_pytorch
       run: coverage run --source pytorch_lightning -m pytest -v --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}.xml
       env:
         PL_RUN_SLOW_TESTS: 1
 
     - name: Upload pytest test results
-      if: failure()
+      if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }}
       uses: actions/upload-artifact@v3
       with:
         name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}
         path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}.xml
 
     - name: Statistics
-      if: success()
+      if: ${{ (success()) && (steps.skip.outputs.continue == '1') }}
       working-directory: tests/tests_pytorch
       run: |
         coverage report
@@ -87,7 +107,7 @@ jobs:
 
     - name: Upload coverage to Codecov
       uses: codecov/codecov-action@v3
-      if: success()
+      if: ${{ (success()) && (steps.skip.outputs.continue == '1') }}
       # see: https://github.com/actions/toolkit/issues/399
       continue-on-error: true
       with:
diff --git a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py
index 2201db94586a2..38ec381fe5485 100644
--- a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py
+++ b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py
@@ -24,7 +24,7 @@
     from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
     from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
 else:
-    MixedPrecision = None  # type: ignore[misc,assignment]
+    MixedPrecision = None
 
 
 class FullyShardedNativeNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin):

From a3f48c1e82222f3039d46e743dc00429e4868b47 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Wed, 17 Aug 2022 18:34:17 +0200
Subject: [PATCH 30/30] Update defaults for WandbLogger's run name and project name (#14145)

---
 src/pytorch_lightning/CHANGELOG.md        |  3 +++
 src/pytorch_lightning/loggers/wandb.py    | 13 +++++++------
 tests/tests_pytorch/loggers/test_all.py   |  2 +-
 tests/tests_pytorch/loggers/test_wandb.py | 20 ++++++++++++++------
 4 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 8001b20924c2b..92913fcdf760f 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -15,6 +15,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967))
 - Raised a `MisconfigurationException` if batch transfer hooks are overridden with `IPUAccelerator` ([13961](https://github.com/Lightning-AI/lightning/pull/13961))
+- The default project name in `WandbLogger` is now "lightning_logs" ([#14145](https://github.com/Lightning-AI/lightning/pull/14145))
+- The `WandbLogger.name` property no longer returns the name of the experiment, and instead returns the project's name ([#14145](https://github.com/Lightning-AI/lightning/pull/14145))
 
 ### Fixed
 
@@ -28,6 +30,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Avoid raising the sampler warning if num_replicas=1 ([#14097](https://github.com/Lightning-AI/lightning/pull/14097)) - Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) - Avoided requiring the FairScale package to use precision with the fsdp native strategy ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) +- Fixed an issue in which the default name for a run in `WandbLogger` would be set to the project name instead of a randomly generated string ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) - Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index 530fb58fabe5e..baf4bc9092774 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -260,7 +260,7 @@ def __init__( id: Optional[str] = None, anonymous: Optional[bool] = None, version: Optional[str] = None, - project: Optional[str] = None, + project: str = "lightning_logs", log_model: Union[str, bool] = False, experiment: Union[Run, RunDisabled, None] = None, prefix: str = "", @@ -297,7 +297,7 @@ def __init__( self._checkpoint_callback: Optional["ReferenceType[Checkpoint]"] = None # set wandb init arguments self._wandb_init: Dict[str, Any] = dict( - name=name or project, + name=name, project=project, id=version or id, dir=save_dir, @@ -306,6 +306,7 @@ def __init__( ) self._wandb_init.update(**kwargs) # extract parameters + self._project = self._wandb_init.get("project") self._save_dir = self._wandb_init.get("dir") self._name = self._wandb_init.get("name") self._id = self._wandb_init.get("id") @@ -450,13 +451,13 @@ def save_dir(self) -> Optional[str]: @property def name(self) -> Optional[str]: - """Gets the name of the experiment. + """The project name of this experiment. Returns: - The name of the experiment if the experiment exists else the name given to the constructor. + The name of the project the current experiment belongs to. This name is not the same as `wandb.Run`'s + name. To access wandb's internal experiment name, use ``logger.experiment.name`` instead. 
""" - # don't create an experiment if we don't have one - return self._experiment.name if self._experiment else self._name + return self._project @property def version(self) -> Optional[str]: diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py index d613296abccf5..612d7bf035c2f 100644 --- a/tests/tests_pytorch/loggers/test_all.py +++ b/tests/tests_pytorch/loggers/test_all.py @@ -300,7 +300,7 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): @pytest.mark.parametrize("logger_class", ALL_LOGGER_CLASSES_WO_NEPTUNE_WANDB) -@RunIf(skip_windows=True, skip_hanging_spawn=True) +@RunIf(skip_windows=True) def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class): """Test that loggers get replaced by dummy loggers on global rank > 0.""" _patch_comet_atexit(monkeypatch) diff --git a/tests/tests_pytorch/loggers/test_wandb.py b/tests/tests_pytorch/loggers/test_wandb.py index fbc1d5e189637..648e1a8f38ec8 100644 --- a/tests/tests_pytorch/loggers/test_wandb.py +++ b/tests/tests_pytorch/loggers/test_wandb.py @@ -25,6 +25,16 @@ from tests_pytorch.helpers.utils import no_warning_call +@mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) +@mock.patch("pytorch_lightning.loggers.wandb.wandb") +def test_wandb_project_name(*_): + logger = WandbLogger() + assert logger.name == "lightning_logs" + + logger = WandbLogger(project="project") + assert logger.name == "project" + + @mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) @mock.patch("pytorch_lightning.loggers.wandb.wandb") def test_wandb_logger_init(wandb, monkeypatch): @@ -48,7 +58,7 @@ def test_wandb_logger_init(wandb, monkeypatch): wandb.init.reset_mock() WandbLogger(project="test_project").experiment wandb.init.assert_called_once_with( - name="test_project", dir=None, id=None, project="test_project", resume="allow", anonymous=None + name=None, dir=None, id=None, project="test_project", resume="allow", anonymous=None ) # test wandb.init and setting logger experiment externally @@ -91,7 +101,6 @@ def test_wandb_logger_init(wandb, monkeypatch): logger.watch("model", "log", 10, False) wandb.init().watch.assert_called_once_with("model", log="log", log_freq=10, log_graph=False) - assert logger.name == wandb.init().name assert logger.version == wandb.init().id @@ -140,10 +149,9 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): """Test that the logger creates the folders and files in the right place.""" monkeypatch.setattr(pytorch_lightning.loggers.wandb, "_WANDB_GREATER_EQUAL_0_12_10", True) wandb.run = None - logger = WandbLogger(save_dir=str(tmpdir), offline=True) + logger = WandbLogger(project="project", save_dir=str(tmpdir), offline=True) # the logger get initialized assert logger.version == wandb.init().id - assert logger.name == wandb.init().name # mock return values of experiment wandb.run = None @@ -154,7 +162,7 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): _ = logger.experiment assert logger.version == "1" - assert logger.name == "run_name" + assert logger.name == "project" assert str(tmpdir) == logger.save_dir assert not os.listdir(tmpdir) @@ -164,7 +172,7 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): assert trainer.log_dir == logger.save_dir trainer.fit(model) - assert trainer.checkpoint_callback.dirpath == str(tmpdir / "run_name" / version / "checkpoints") + assert trainer.checkpoint_callback.dirpath == str(tmpdir / "project" / version / "checkpoints") assert 
set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=3.ckpt"} assert trainer.log_dir == logger.save_dir
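
A note on the final patch (PATCH 30/30): after #14145, `WandbLogger.name` always reports the project name (defaulting to "lightning_logs") rather than the wandb run's display name, and reading it no longer depends on whether an experiment has been created. The snippet below is an illustrative sketch, not part of the patch series; it assumes `wandb` is installed, and "my-project" is a made-up project name:

    # Minimal sketch of the WandbLogger naming behavior after #14145.
    # Assumes the `wandb` package is installed; "my-project" is illustrative.
    from pytorch_lightning.loggers import WandbLogger

    logger = WandbLogger()
    assert logger.name == "lightning_logs"  # `name` now reports the default project

    logger = WandbLogger(project="my-project", offline=True)
    assert logger.name == "my-project"  # the project name, not the run name

    # The run's own (auto-generated) display name is still reachable through
    # the underlying experiment object, which lazily initializes the wandb run:
    # run_name = logger.experiment.name

This mirrors the new `test_wandb_project_name` test above, and it is also why `test_wandb_logger_dirs_creation` now expects checkpoints under `tmpdir / "project" / version / "checkpoints"`: checkpoint directories are derived from `logger.name`, which is now the project.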