diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py
index cea0f297be5..0fe0cbd6dd7 100644
--- a/test/builtin_dataset_mocks.py
+++ b/test/builtin_dataset_mocks.py
@@ -10,19 +10,18 @@
 import pathlib
 import pickle
 import random
+import shutil
 import unittest.mock
 import warnings
 import xml.etree.ElementTree as ET
 from collections import defaultdict, Counter
 
 import numpy as np
-import PIL.Image
 import pytest
 import torch
 from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file, combinations_grid
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
-from torchvision._utils import sequence_to_str
 from torchvision.prototype import datasets
 
 make_tensor = functools.partial(_make_tensor, device="cpu")
@@ -62,27 +61,51 @@ def _parse_mock_info(self, mock_info):
 
         return mock_info
 
-    def prepare(self, config):
+    def load(self, config):
         # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
         # test/test_prototype_builtin_datasets.py
         root = pathlib.Path(datasets.home()) / self.name
-        root.mkdir(exist_ok=True)
+        # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
+        # this will only download **and** preprocess if the file is not present. In other words, if we already place
+        # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
+        # To avoid that, we first place the mock data in a temporary directory and patch the download logic to move
+        # it to `root` only when it is requested.
+        tmp_mock_data_folder = root / "__mock__"
+        tmp_mock_data_folder.mkdir(parents=True)
+
+        mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))
+
+        def patched_download(resource, root, **kwargs):
+            src = tmp_mock_data_folder / resource.file_name
+            if not src.exists():
+                raise pytest.UsageError(
+                    f"Dataset '{self.name}' requires the file {resource.file_name} for {config}, "
+                    f"but it was not created by the mock data function."
+                )
 
-        mock_info = self._parse_mock_info(self.mock_data_fn(root, config))
+            dst = root / resource.file_name
+            shutil.move(str(src), str(root))
 
-        with unittest.mock.patch.object(datasets.utils.Dataset, "__init__"):
-            required_file_names = {
-                resource.file_name for resource in datasets.load(self.name, root=root, **config)._resources()
-            }
-        available_file_names = {path.name for path in root.glob("*")}
-        missing_file_names = required_file_names - available_file_names
-        if missing_file_names:
+            return dst
+
+        with unittest.mock.patch(
+            "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
+        ):
+            dataset = datasets.load(self.name, **config)
+
+        extra_files = list(tmp_mock_data_folder.glob("**/*"))
+        if extra_files:
             raise pytest.UsageError(
-                f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} "
-                f"for {config}, but they were not created by the mock data function."
+                (
+                    f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
+                    f"but they were not loaded:\n\n"
+                )
+                + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
             )
 
-        return mock_info
+        tmp_mock_data_folder.rmdir()
+
+        return dataset, mock_info
 
 
 def config_id(name, config):
@@ -513,22 +536,6 @@ def imagenet(root, config):
 
 
 class CocoMockData:
-    @classmethod
-    def _make_images_archive(cls, root, name, *, num_samples):
-        image_paths = create_image_folder(
-            root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
-        )
-
-        images_meta = []
-        for path in image_paths:
-            with PIL.Image.open(path) as image:
-                width, height = image.size
-            images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))
-
-        make_zip(root, f"{name}.zip")
-
-        return images_meta
-
     @classmethod
     def _make_annotations_json(
         cls,
@@ -596,16 +603,38 @@ def generate(
         cls,
         root,
         *,
+        split,
         year,
         num_samples,
     ):
         annotations_dir = root / "annotations"
         annotations_dir.mkdir()
 
-        for split in ("train", "val"):
-            config_name = f"{split}{year}"
+        for split_ in ("train", "val"):
+            config_name = f"{split_}{year}"
+
+            images_meta = [
+                dict(
+                    file_name=f"{idx:012d}.jpg",
+                    id=idx,
+                    width=width,
+                    height=height,
+                )
+                for idx, (height, width) in enumerate(
+                    torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
+                )
+            ]
+
+            if split_ == split:
+                create_image_folder(
+                    root,
+                    config_name,
+                    file_name_fn=lambda idx: images_meta[idx]["file_name"],
+                    num_examples=num_samples,
+                    size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
+                )
+                make_zip(root, f"{config_name}.zip")
 
-            images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples)
             cls._make_annotations(
                 annotations_dir,
                 config_name,
@@ -625,7 +654,7 @@ def generate(
     )
 )
 def coco(root, config):
-    return CocoMockData.generate(root, year=config["year"], num_samples=5)
+    return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)
 
 
 class SBDMockData:
@@ -799,8 +828,11 @@ def add_bndbox(obj):
     def generate(cls, root, *, year, trainval):
         archive_folder = root
         if year == "2011":
-            archive_folder /= "TrainVal"
-        data_folder = archive_folder / "VOCdevkit" / f"VOC{year}"
+            archive_folder = root / "TrainVal"
+            data_folder = archive_folder / "VOCdevkit"
+        else:
+            archive_folder = data_folder = root / "VOCdevkit"
+        data_folder = data_folder / f"VOC{year}"
         data_folder.mkdir(parents=True, exist_ok=True)
 
         ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)
@@ -810,7 +842,7 @@ def generate(cls, root, *, year, trainval):
             (cls._make_detection_anns_folder, "Annotations", ".xml"),
         ]:
             make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))
-        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder)
+        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)
 
         return num_samples_map
@@ -1091,8 +1123,10 @@ def _make_ann_file(path, num_examples, class_idx):
                 }
             )
 
+    archive_folder = root / "GTSRB"
+
     if config["split"] == "train":
-        train_folder = root / "GTSRB" / "Training"
+        train_folder = archive_folder / "Training"
         train_folder.mkdir(parents=True)
 
         for class_idx in classes:
@@ -1107,9 +1141,9 @@ def _make_ann_file(path, num_examples, class_idx):
                 num_examples=num_examples_per_class,
                 class_idx=int(class_idx),
             )
-        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
+        make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
     else:
-        test_folder = root / "GTSRB" / "Final_Test"
+        test_folder = archive_folder / "Final_Test"
         test_folder.mkdir(parents=True)
 
         create_image_folder(
@@ -1119,7 +1153,7 @@ def _make_ann_file(path, num_examples, class_idx):
             num_examples=num_examples,
         )
 
-        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)
+        make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)
 
         _make_ann_file(
             path=root / "GT-final_test.csv",
@@ -1484,11 +1518,10 @@ def stanford_cars(root, config):
     num_samples = {"train": 5, "test": 7}[split]
     num_categories = 3
 
-    devkit = root / "devkit"
-    devkit.mkdir(parents=True)
-
     if split == "train":
         images_folder_name = "cars_train"
+        devkit = root / "devkit"
+        devkit.mkdir()
         annotations_mat_path = devkit / "cars_train_annos.mat"
     else:
         images_folder_name = "cars_test"
diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py
index 23190b25ddc..5a8c9e7eff8 100644
--- a/test/test_prototype_builtin_datasets.py
+++ b/test/test_prototype_builtin_datasets.py
@@ -56,18 +56,14 @@ def test_info(self, name):
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_smoke(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         if not isinstance(dataset, datasets.utils.Dataset):
             raise AssertionError(f"Loading the dataset should return an Dataset, but got {type(dataset)} instead.")
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_sample(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         try:
             sample = next(iter(dataset))
@@ -84,17 +80,13 @@ def test_sample(self, dataset_mock, config):
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_num_samples(self, dataset_mock, config):
-        mock_info = dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, mock_info = dataset_mock.load(config)
 
         assert len(list(dataset)) == mock_info["num_samples"]
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_no_vanilla_tensors(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor}
         if vanilla_tensors:
@@ -105,24 +97,20 @@ def test_no_vanilla_tensors(self, dataset_mock, config):
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_transformable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         next(iter(dataset.map(transforms.Identity())))
 
     @pytest.mark.parametrize("only_datapipe", [False, True])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_traversable(self, dataset_mock, config, only_datapipe):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         traverse(dataset, only_datapipe=only_datapipe)
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_serializable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         pickle.dumps(dataset)
 
@@ -135,8 +123,7 @@ def _collate_fn(self, batch):
     @pytest.mark.parametrize("num_workers", [0, 1])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_data_loader(self, dataset_mock, config, num_workers):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         dl = DataLoader(
             dataset,
@@ -153,17 +140,15 @@ def test_data_loader(self, dataset_mock, config, num_workers):
     @parametrize_dataset_mocks(DATASET_MOCKS)
     @pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter))
     def test_has_annotations(self, dataset_mock, config, annotation_dp_type):
-
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         if not any(isinstance(dp, annotation_dp_type) for dp in extract_datapipes(dataset)):
             raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.")
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_save_load(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
+
         sample = next(iter(dataset))
 
         with io.BytesIO() as buffer:
@@ -173,8 +158,7 @@ def test_infinite_buffer_size(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         for dp in extract_datapipes(dataset):
             if hasattr(dp, "buffer_size"):
@@ -184,8 +168,7 @@ def test_has_length(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         assert len(dataset) > 0
@@ -193,9 +176,7 @@
 @parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
 class TestQMNIST:
     def test_extra_label(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         sample = next(iter(dataset))
         for key, type in (
@@ -218,9 +199,7 @@ def test_label_matches_path(self, dataset_mock, config):
         if config["split"] != "train":
             return
 
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         for sample in dataset:
             label_from_path = int(Path(sample["path"]).parent.name)
@@ -230,9 +209,7 @@
 @parametrize_dataset_mocks(DATASET_MOCKS["usps"])
 class TestUSPS:
     def test_sample_content(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         for sample in dataset:
             assert "image" in sample
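
For reviewers who want to poke at the stage-and-patch idea behind the new `DatasetMock.load` outside the test harness, here is a minimal, self-contained sketch. `FakeResource`, `load_with_staged_mock`, and `make_mock_data` are hypothetical stand-ins invented for this illustration, not torchvision APIs; only the move-on-request pattern and the leftover-files check mirror the patch above.

import pathlib
import shutil
import tempfile
import unittest.mock


class FakeResource:
    """Hypothetical stand-in for OnlineResource: `load` only downloads
    (and would preprocess) when the file is absent from `root`."""

    def __init__(self, file_name):
        self.file_name = file_name

    def download(self, root, **kwargs):
        raise RuntimeError("tests must never hit the network")

    def load(self, root):
        path = pathlib.Path(root) / self.file_name
        if not path.exists():
            # In the real resource this branch also runs the preprocessing,
            # which is exactly what placing the files upfront would skip.
            path = self.download(root)
        return path


def load_with_staged_mock(resource, root, make_mock_data):
    """Stage mock files in `root/__mock__` and move each one into `root`
    only when the resource actually requests it."""
    root = pathlib.Path(root)
    staging = root / "__mock__"
    staging.mkdir(parents=True)
    make_mock_data(staging)  # the mock data function writes its archives here

    def patched_download(self, root, **kwargs):
        src = staging / self.file_name
        if not src.exists():
            raise FileNotFoundError(f"mock data function never created {self.file_name}")
        dst = pathlib.Path(root) / self.file_name
        shutil.move(str(src), str(dst))
        return dst

    with unittest.mock.patch.object(FakeResource, "download", patched_download):
        path = resource.load(root)

    leftovers = list(staging.glob("**/*"))
    if leftovers:  # mock files that were created but never requested
        raise RuntimeError(f"unused mock files: {leftovers}")
    staging.rmdir()
    return path


if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp:
        resource = FakeResource("archive.zip")
        path = load_with_staged_mock(
            resource, tmp, lambda staging: (staging / "archive.zip").write_bytes(b"PK")
        )
        print(path)  # <tmp>/archive.zip, moved out of __mock__ on demand

Because the patched download is the only way a file can leave the staging folder, forgetting to create a required file and creating a file no resource ever asks for both fail loudly, which is the same guarantee the diff's `patched_download` and `extra_files` check provide.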