From 1c899108c07d965a579741332c0fc1bc459dfebe Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 30 Mar 2022 14:18:48 +0200 Subject: [PATCH 1/7] allow preprocessed mock data in prototype datasets tests --- test/builtin_dataset_mocks.py | 65 +++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 768177b1c28..453d7987a68 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -10,6 +10,8 @@ import pathlib import pickle import random +import re +import shutil import warnings import xml.etree.ElementTree as ET from collections import defaultdict, Counter @@ -72,6 +74,25 @@ def prepare(self, home, config): available_file_names = {path.name for path in root.glob("*")} required_file_names = {resource.file_name for resource in self.dataset.resources(config)} missing_file_names = required_file_names - available_file_names + extra_file_names = available_file_names - required_file_names + + # Some datasets need to provide already preprocessed data, for example if the preprocessing includes downloads + if extra_file_names: + for extra in extra_file_names.copy(): + candidate_pattern = re.compile(fr"^{extra.split('.', 1)[0]}([.]\w+)*$") + try: + missing = next(missing for missing in missing_file_names if candidate_pattern.match(missing)) + except StopIteration: + continue + + extra_file_names.remove(extra) + missing_file_names.remove(missing) + + if extra_file_names: + raise pytest.UsageError( + f"Dataset '{self.name}' created the files {sequence_to_str(sorted(extra_file_names))} " + f"for {config} in the mock data function, but they are not needed." 
+ ) if missing_file_names: raise pytest.UsageError( f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} " @@ -489,7 +510,7 @@ def imagenet(info, root, config): class CocoMockData: @classmethod - def _make_images_archive(cls, root, name, *, num_samples): + def _make_images_folder(cls, root, name, *, num_samples): image_paths = create_image_folder( root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples ) @@ -500,8 +521,6 @@ def _make_images_archive(cls, root, name, *, num_samples): width, height = image.size images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height)) - make_zip(root, f"{name}.zip") - return images_meta @classmethod @@ -571,16 +590,22 @@ def generate( cls, root, *, + split, year, num_samples, ): annotations_dir = root / "annotations" annotations_dir.mkdir() - for split in ("train", "val"): - config_name = f"{split}{year}" + for split_ in ("train", "val"): + config_name = f"{split_}{year}" + + images_meta = cls._make_images_folder(root, config_name, num_samples=num_samples) + if split_ == split: + make_zip(root, f"{config_name}.zip") + else: + shutil.rmtree(root / config_name) - images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples) cls._make_annotations( annotations_dir, config_name, @@ -594,7 +619,7 @@ def generate( @register_mock def coco(info, root, config): - return CocoMockData.generate(root, year=config.year, num_samples=5) + return CocoMockData.generate(root, split=config.split, year=config.year, num_samples=5) class SBDMockData: @@ -766,10 +791,12 @@ def add_bndbox(obj): @classmethod def generate(cls, root, *, year, trainval): - archive_folder = root if year == "2011": - archive_folder /= "TrainVal" - data_folder = archive_folder / "VOCdevkit" / f"VOC{year}" + archive_folder = root / "TrainVal" + data_folder = archive_folder / "VOCdevkit" + else: + archive_folder = data_folder = root / "VOCdevkit" + data_folder = 
data_folder / f"VOC{year}" data_folder.mkdir(parents=True, exist_ok=True) ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval) @@ -779,7 +806,7 @@ def generate(cls, root, *, year, trainval): (cls._make_detection_anns_folder, "Annotations", ".xml"), ]: make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids)) - make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder) + make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder) return num_samples_map @@ -1013,8 +1040,10 @@ def _make_ann_file(path, num_examples, class_idx): } ) + archive_folder = root / "GTSRB" + if config["split"] == "train": - train_folder = root / "GTSRB" / "Training" + train_folder = archive_folder / "Training" train_folder.mkdir(parents=True) for class_idx in classes: @@ -1029,9 +1058,9 @@ def _make_ann_file(path, num_examples, class_idx): num_examples=num_examples_per_class, class_idx=int(class_idx), ) - make_zip(root, "GTSRB-Training_fixed.zip", train_folder) + make_zip(root, "GTSRB-Training_fixed.zip", archive_folder) else: - test_folder = root / "GTSRB" / "Final_Test" + test_folder = archive_folder / "Final_Test" test_folder.mkdir(parents=True) create_image_folder( @@ -1041,7 +1070,7 @@ def _make_ann_file(path, num_examples, class_idx): num_examples=num_examples, ) - make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder) + make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder) _make_ann_file( path=root / "GT-final_test.csv", @@ -1405,11 +1434,11 @@ def stanford_cars(info, root, config): num_samples = {"train": 5, "test": 7}[config["split"]] num_categories = 3 - devkit = root / "devkit" - devkit.mkdir(parents=True) - if config["split"] == "train": images_folder_name = "cars_train" + + devkit = root / "devkit" + devkit.mkdir() annotations_mat_path = devkit / "cars_train_annos.mat" else: images_folder_name = "cars_test" 
From 0163e1ac9c2cbad8865773a5e12b71011f6b4d9e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 4 Apr 2022 12:06:17 +0200 Subject: [PATCH 2/7] create images meta manually for COCO mock data --- test/builtin_dataset_mocks.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 453d7987a68..d6bb61585f5 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -11,13 +11,11 @@ import pickle import random import re -import shutil import warnings import xml.etree.ElementTree as ET from collections import defaultdict, Counter import numpy as np -import PIL.Image import pytest import torch from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file @@ -510,18 +508,11 @@ def imagenet(info, root, config): class CocoMockData: @classmethod - def _make_images_folder(cls, root, name, *, num_samples): - image_paths = create_image_folder( - root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples - ) - - images_meta = [] - for path in image_paths: - with PIL.Image.open(path) as image: - width, height = image.size - images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height)) - - return images_meta + def _make_images_meta(cls, *, num_samples): + return [ + dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height) + for idx, (height, width) in enumerate(torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()) + ] @classmethod def _make_annotations_json( @@ -600,11 +591,16 @@ def generate( for split_ in ("train", "val"): config_name = f"{split_}{year}" - images_meta = cls._make_images_folder(root, config_name, num_samples=num_samples) + images_meta = cls._make_images_meta(num_samples=num_samples) if split_ == split: + create_image_folder( + root, + config_name, + file_name_fn=lambda idx: images_meta[idx]["file_name"], + 
num_examples=num_samples, + size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]), + ) make_zip(root, f"{config_name}.zip") - else: - shutil.rmtree(root / config_name) cls._make_annotations( annotations_dir, From 95bafc453bf72493d22b8bbfbfc1a04da7c3270d Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 4 Apr 2022 12:09:26 +0200 Subject: [PATCH 3/7] expand explanation for extra / missing file matching --- test/builtin_dataset_mocks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index d6bb61585f5..1786547ba0d 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -74,7 +74,10 @@ def prepare(self, home, config): missing_file_names = required_file_names - available_file_names extra_file_names = available_file_names - required_file_names - # Some datasets need to provide already preprocessed data, for example if the preprocessing includes downloads + # Some datasets need to provide already preprocessed data, for example if the preprocessing includes downloads. + # Such data will be included in the `extra_file_names` while the raw data will be in the `missing_file_names`. + # This detects these cases and removes the corresponding entries from the sets since the files are neither extra + # nor missing. 
if extra_file_names: for extra in extra_file_names.copy(): candidate_pattern = re.compile(fr"^{extra.split('.', 1)[0]}([.]\w+)*$") From b20edac6cb79281fbb97b09fb26c2bd8e673cdc5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 4 Apr 2022 13:36:24 +0200 Subject: [PATCH 4/7] simplify detection logic --- test/builtin_dataset_mocks.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 1786547ba0d..125d8a5d692 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -10,7 +10,6 @@ import pathlib import pickle import random -import re import warnings import xml.etree.ElementTree as ET from collections import defaultdict, Counter @@ -78,16 +77,12 @@ def prepare(self, home, config): # Such data will be included in the `extra_file_names` while the raw data will be in the `missing_file_names`. # This detects these cases and removes the corresponding entries from the sets since the files are neither extra # nor missing. 
- if extra_file_names: - for extra in extra_file_names.copy(): - candidate_pattern = re.compile(fr"^{extra.split('.', 1)[0]}([.]\w+)*$") - try: - missing = next(missing for missing in missing_file_names if candidate_pattern.match(missing)) - except StopIteration: - continue - - extra_file_names.remove(extra) - missing_file_names.remove(missing) + if missing_file_names: + for missing in missing_file_names.copy(): + extra_candidate = missing.split(".", 1)[0] + if extra_candidate in extra_file_names: + missing_file_names.remove(missing) + extra_file_names.remove(extra_candidate) if extra_file_names: raise pytest.UsageError( @@ -933,7 +928,7 @@ def country211(info, root, config): file_name_fn=lambda idx: f"{idx}.jpg", num_examples=num_examples, ) - make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") + # make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") return num_examples * len(classes) From 40e232f77ca195dd371c31d4e5c1927368967322 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 4 Apr 2022 13:39:31 +0200 Subject: [PATCH 5/7] revert unrelated change --- test/builtin_dataset_mocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 3198dfeba6f..d48f4180777 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -928,7 +928,7 @@ def country211(info, root, config): file_name_fn=lambda idx: f"{idx}.jpg", num_examples=num_examples, ) - # make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") + make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") return num_examples * len(classes) From 2a6be388d40807fd1de2a647a1c5d370bf313b67 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 4 Apr 2022 14:09:04 +0200 Subject: [PATCH 6/7] remove partial string splitting --- test/builtin_dataset_mocks.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index d48f4180777..9ae24af34ad 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -79,7 +79,7 @@ def prepare(self, home, config): # nor missing. if missing_file_names: for missing in missing_file_names.copy(): - extra_candidate = missing.split(".", 1)[0] + extra_candidate = missing.split(".")[0] if extra_candidate in extra_file_names: missing_file_names.remove(missing) extra_file_names.remove(extra_candidate) @@ -928,7 +928,7 @@ def country211(info, root, config): file_name_fn=lambda idx: f"{idx}.jpg", num_examples=num_examples, ) - make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") + # make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") return num_examples * len(classes) From 9d54c1535812c82e0d4269b0711e636e3c743836 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 4 Apr 2022 14:14:54 +0200 Subject: [PATCH 7/7] rewrite explanation --- test/builtin_dataset_mocks.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 9ae24af34ad..251893cd81d 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -73,10 +73,14 @@ def prepare(self, home, config): missing_file_names = required_file_names - available_file_names extra_file_names = available_file_names - required_file_names - # Some datasets need to provide already preprocessed data, for example if the preprocessing includes downloads. - # Such data will be included in the `extra_file_names` while the raw data will be in the `missing_file_names`. - # This detects these cases and removes the corresponding entries from the sets since the files are neither extra - # nor missing. 
+ # Some datasets cannot provide the original resources, for example if the preprocessing step includes + # downloads. In such a case the mock data function might also provide the corresponding result of the + # preprocessing. For example, if the dataset requires the `foo.tar` file, but the preprocessing step extracts it + # to the `foo` folder and performs some more operations, the mock data function can provide the `foo` folder + # directly. + # In such a case `foo.tar` will be picked up in `missing_file_names` at first and `foo` in `extra_file_names`. + # Since `foo.tar` is not actually missing, but already in a preprocessed state, we remove the corresponding + # entries from both sets. if missing_file_names: for missing in missing_file_names.copy(): extra_candidate = missing.split(".")[0]