From b5f11572757929a81756ec7c73d208a9a39c4736 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 1 Apr 2022 10:06:11 +0200 Subject: [PATCH 1/5] replace serializable with pipeline test for prototype datasets --- .circleci/config.yml | 6 ++--- .circleci/config.yml.in | 6 ++--- test/test_prototype_builtin_datasets.py | 30 +++++++++++++++++++++---- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2a7c679e021..8597ed934fc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -168,9 +168,9 @@ commands: file_or_dir: type: string steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock + - pip_install: + args: pytest pytest-mock pytest-timeout + descr: Install test utilities - run: name: Run tests command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 6f013979771..19209caae66 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -168,9 +168,9 @@ commands: file_or_dir: type: string steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock + - pip_install: + args: pytest pytest-mock pytest-timeout + descr: Install test utilities - run: name: Run tests command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py index 8d51125f41c..0542b4a929f 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_builtin_datasets.py @@ -1,12 +1,13 @@ import functools import io -import pickle +import os from pathlib import Path import pytest import torch from builtin_dataset_mocks import parametrize_dataset_mocks, DATASET_MOCKS from torch.testing._comparison import assert_equal, TensorLikePair, ObjectPair +from torch.utils.data.dataloader_experimental import DataLoader2 from torch.utils.data.graph import traverse from torch.utils.data.graph_settings import get_all_graph_pipes from torchdata.datapipes.iter import IterDataPipe, Shuffler, ShardingFilter @@ -116,13 +117,34 @@ def test_transformable(self, test_home, dataset_mock, config): next(iter(dataset.map(transforms.Identity()))) + @pytest.mark.timeout(10) + @pytest.mark.parametrize("parallelism_mode", ["mp", "thread"]) @parametrize_dataset_mocks(DATASET_MOCKS) - def test_serializable(self, test_home, dataset_mock, config): + def test_pipeline(self, test_home, dataset_mock, config, parallelism_mode): dataset_mock.prepare(test_home, config) - dataset = datasets.load(dataset_mock.name, **config) - pickle.dumps(dataset) + transform = transforms.Compose(transforms.DecodeImage(), transforms.Resize([3, 3])) + + # TODO: add a .collate() here as soon as https://github.com/pytorch/vision/pull/5233 is resolved + dp = dataset.map(transform).batch(2, drop_last=parallelism_mode == "thread") + + # Maybe we can make this is a static method of the data_loader? + try: + num_workers = len(os.sched_getaffinity(0)) + except Exception: + num_workers = os.cpu_count() or 1 + + dl = DataLoader2( + dp, + batch_size=None, + shuffle=True, + num_workers=num_workers, + parallelism_mode=parallelism_mode, + ) + + for _ in dl: + pass # TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also # that the Shuffler comes before the ShardingFilter. Early commits in https://github.com/pytorch/vision/pull/5680 From 26b050ed9914dc48537f3fe190b25beff2e8d6bb Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 1 Apr 2022 10:30:37 +0200 Subject: [PATCH 2/5] exclude some datasets due to deadlock --- test/test_prototype_builtin_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py index 0542b4a929f..e1ebc9285ca 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_builtin_datasets.py @@ -119,7 +119,7 @@ def test_transformable(self, test_home, dataset_mock, config): @pytest.mark.timeout(10) @pytest.mark.parametrize("parallelism_mode", ["mp", "thread"]) - @parametrize_dataset_mocks(DATASET_MOCKS) + @parametrize_dataset_mocks({name: mock for name, mock in DATASET_MOCKS.items() if name not in {"qmnist", "voc"}}) def test_pipeline(self, test_home, dataset_mock, config, parallelism_mode): dataset_mock.prepare(test_home, config) dataset = datasets.load(dataset_mock.name, **config) From 40ba5a4503b5281084d903a7c1a2672addd800d9 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 1 Apr 2022 10:39:05 +0200 Subject: [PATCH 3/5] also check pipeline without parallelism --- test/test_prototype_builtin_datasets.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py index e1ebc9285ca..7e9c1c4fa51 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_builtin_datasets.py @@ -118,7 +118,7 @@ def test_transformable(self, test_home, dataset_mock, config): next(iter(dataset.map(transforms.Identity()))) @pytest.mark.timeout(10) - @pytest.mark.parametrize("parallelism_mode", ["mp", "thread"]) + @pytest.mark.parametrize("parallelism_mode", [None, "mp", "thread"]) @parametrize_dataset_mocks({name: mock for name, mock in DATASET_MOCKS.items() if name not in {"qmnist", "voc"}}) def test_pipeline(self, test_home, dataset_mock, config, parallelism_mode): dataset_mock.prepare(test_home, config) @@ -129,11 +129,14 @@ def test_pipeline(self, test_home, dataset_mock, config, parallelism_mode): # TODO: add a .collate() here as soon as https://github.com/pytorch/vision/pull/5233 is resolved dp = dataset.map(transform).batch(2, drop_last=parallelism_mode == "thread") - # Maybe we can make this is a static method of the data_loader? - try: - num_workers = len(os.sched_getaffinity(0)) - except Exception: - num_workers = os.cpu_count() or 1 + if parallelism_mode: + # Maybe we can make this is a static method of the data_loader? + try: + num_workers = len(os.sched_getaffinity(0)) + except Exception: + num_workers = os.cpu_count() or 1 + else: + num_workers = 0 dl = DataLoader2( dp, From 0786cb21569b43ce1f0e6eb1e8d92e19c6c57429 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 1 Apr 2022 10:58:02 +0200 Subject: [PATCH 4/5] use internal timeout --- .circleci/config.yml | 2 +- .circleci/config.yml.in | 2 +- test/test_prototype_builtin_datasets.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8597ed934fc..708ef5dbfdc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -169,7 +169,7 @@ commands: type: string steps: - pip_install: - args: pytest pytest-mock pytest-timeout + args: pytest pytest-mock descr: Install test utilities - run: name: Run tests diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 19209caae66..fbbed41b7f7 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -169,7 +169,7 @@ commands: type: string steps: - pip_install: - args: pytest pytest-mock pytest-timeout + args: pytest pytest-mock descr: Install test utilities - run: name: Run tests diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py index 7e9c1c4fa51..453f3fae15c 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_builtin_datasets.py @@ -117,7 +117,6 @@ def test_transformable(self, test_home, dataset_mock, config): next(iter(dataset.map(transforms.Identity()))) - @pytest.mark.timeout(10) @pytest.mark.parametrize("parallelism_mode", [None, "mp", "thread"]) @parametrize_dataset_mocks({name: mock for name, mock in DATASET_MOCKS.items() if name not in {"qmnist", "voc"}}) def test_pipeline(self, test_home, dataset_mock, config, parallelism_mode): @@ -144,6 +143,7 @@ def test_pipeline(self, test_home, dataset_mock, config, parallelism_mode): shuffle=True, num_workers=num_workers, parallelism_mode=parallelism_mode, + timeout=5, ) for _ in dl: From c5ace3e1da0460da9c09a53dc67ad6a3eb5c6412 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 1 Apr 2022 11:30:55 +0200 Subject: [PATCH 5/5] fix timeout for single process --- test/test_prototype_builtin_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py index 453f3fae15c..4ff5c7f70db 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_builtin_datasets.py @@ -143,7 +143,7 @@ def test_pipeline(self, test_home, dataset_mock, config, parallelism_mode): shuffle=True, num_workers=num_workers, parallelism_mode=parallelism_mode, - timeout=5, + timeout=5 if num_workers > 0 else 0, ) for _ in dl: