diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index c5608377d97..251893cd81d 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -15,7 +15,6 @@ from collections import defaultdict, Counter import numpy as np -import PIL.Image import pytest import torch from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file @@ -72,6 +71,28 @@ def prepare(self, home, config): available_file_names = {path.name for path in root.glob("*")} required_file_names = {resource.file_name for resource in self.dataset.resources(config)} missing_file_names = required_file_names - available_file_names + extra_file_names = available_file_names - required_file_names + + # Some datasets need to cannot provide the original resources, for example if the preprocessing step includes + # downloads. In such a case the mock data function might also provide the corresponding result of the + # preprocessing. For example, if the dataset requires the `foo.tar` file, but the preprocessing step extracts it + # to the `foo` folder and performs some more operations, the mock data function can provide the `foo` folder + # directly. + # In such a case `foo.tar` will be picked up in `missing_file_names` at first and `foo` in `extra_file_names`. + # Since `foo.tar` is not actually missing, but already in a preprocessed state, we remove the corresponding + # entries from both sets. + if missing_file_names: + for missing in missing_file_names.copy(): + extra_candidate = missing.split(".")[0] + if extra_candidate in extra_file_names: + missing_file_names.remove(missing) + extra_file_names.remove(extra_candidate) + + if extra_file_names: + raise pytest.UsageError( + f"Dataset '{self.name}' created the files {sequence_to_str(sorted(extra_file_names))} " + f"for {config} in the mock data function, but they are not needed." + ) if missing_file_names: raise pytest.UsageError( f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} " @@ -489,20 +510,11 @@ def imagenet(info, root, config): class CocoMockData: @classmethod - def _make_images_archive(cls, root, name, *, num_samples): - image_paths = create_image_folder( - root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples - ) - - images_meta = [] - for path in image_paths: - with PIL.Image.open(path) as image: - width, height = image.size - images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height)) - - make_zip(root, f"{name}.zip") - - return images_meta + def _make_images_meta(cls, *, num_samples): + return [ + dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height) + for idx, (height, width) in enumerate(torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()) + ] @classmethod def _make_annotations_json( @@ -571,16 +583,27 @@ def generate( cls, root, *, + split, year, num_samples, ): annotations_dir = root / "annotations" annotations_dir.mkdir() - for split in ("train", "val"): - config_name = f"{split}{year}" + for split_ in ("train", "val"): + config_name = f"{split_}{year}" + + images_meta = cls._make_images_meta(num_samples=num_samples) + if split_ == split: + create_image_folder( + root, + config_name, + file_name_fn=lambda idx: images_meta[idx]["file_name"], + num_examples=num_samples, + size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]), + ) + make_zip(root, f"{config_name}.zip") - images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples) cls._make_annotations( annotations_dir, config_name, @@ -594,7 +617,7 @@ def generate( @register_mock def coco(info, root, config): - return CocoMockData.generate(root, year=config.year, num_samples=5) + return CocoMockData.generate(root, split=config.split, year=config.year, num_samples=5) class SBDMockData: @@ -766,10 +789,12 @@ def add_bndbox(obj): @classmethod def generate(cls, root, *, year, trainval): - archive_folder = root if year == "2011": - archive_folder /= "TrainVal" - data_folder = archive_folder / "VOCdevkit" / f"VOC{year}" + archive_folder = root / "TrainVal" + data_folder = archive_folder / "VOCdevkit" + else: + archive_folder = data_folder = root / "VOCdevkit" + data_folder = data_folder / f"VOC{year}" data_folder.mkdir(parents=True, exist_ok=True) ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval) @@ -779,7 +804,7 @@ def generate(cls, root, *, year, trainval): (cls._make_detection_anns_folder, "Annotations", ".xml"), ]: make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids)) - make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder) + make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder) return num_samples_map @@ -907,7 +932,7 @@ def country211(info, root, config): file_name_fn=lambda idx: f"{idx}.jpg", num_examples=num_examples, ) - make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") + # make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") return num_examples * len(classes) @@ -1051,8 +1076,10 @@ def _make_ann_file(path, num_examples, class_idx): } ) + archive_folder = root / "GTSRB" + if config["split"] == "train": - train_folder = root / "GTSRB" / "Training" + train_folder = archive_folder / "Training" train_folder.mkdir(parents=True) for class_idx in classes: @@ -1067,9 +1094,9 @@ def _make_ann_file(path, num_examples, class_idx): num_examples=num_examples_per_class, class_idx=int(class_idx), ) - make_zip(root, "GTSRB-Training_fixed.zip", train_folder) + make_zip(root, "GTSRB-Training_fixed.zip", archive_folder) else: - test_folder = root / "GTSRB" / "Final_Test" + test_folder = archive_folder / "Final_Test" test_folder.mkdir(parents=True) create_image_folder( @@ -1079,7 +1106,7 @@ def _make_ann_file(path, num_examples, class_idx): num_examples=num_examples, ) - make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder) + make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder) _make_ann_file( path=root / "GT-final_test.csv", @@ -1443,11 +1470,11 @@ def stanford_cars(info, root, config): num_samples = {"train": 5, "test": 7}[config["split"]] num_categories = 3 - devkit = root / "devkit" - devkit.mkdir(parents=True) - if config["split"] == "train": images_folder_name = "cars_train" + + devkit = root / "devkit" + devkit.mkdir() annotations_mat_path = devkit / "cars_train_annos.mat" else: images_folder_name = "cars_test"