From c506a01f152bda8ca9bab837884ca455b1aef303 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 6 Apr 2022 08:48:39 +0200 Subject: [PATCH 1/2] migrate caltech prototype datasets --- test/builtin_dataset_mocks.py | 41 +++--- .../prototype/datasets/_builtin/caltech.py | 118 +++++++++++------- 2 files changed, 101 insertions(+), 58 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index ad979b6bd84..17e4ac62ca7 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -370,8 +370,8 @@ def cifar100(info, root, config): return len(train_files if config.split == "train" else test_files) -# @register_mock -def caltech101(info, root, config): +@register_mock(configs=[dict()]) +def caltech101(root, config): def create_ann_file(root, name): import scipy.io @@ -390,15 +390,17 @@ def create_ann_folder(root, name, file_name_fn, num_examples): images_root = root / "101_ObjectCategories" anns_root = root / "Annotations" - ann_category_map = { - "Faces_2": "Faces", - "Faces_3": "Faces_easy", - "Motorbikes_16": "Motorbikes", - "Airplanes_Side_2": "airplanes", + image_category_map = { + "Faces": "Faces_2", + "Faces_easy": "Faces_3", + "Motorbikes": "Motorbikes_16", + "airplanes": "Airplanes_Side_2", } + categories = ["Faces", "Faces_easy", "Motorbikes", "airplanes", "yin_yang"] + num_images_per_category = 2 - for category in info.categories: + for category in categories: create_image_folder( root=images_root, name=category, @@ -407,7 +409,7 @@ def create_ann_folder(root, name, file_name_fn, num_examples): ) create_ann_folder( root=anns_root, - name=ann_category_map.get(category, category), + name=image_category_map.get(category, category), file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", num_examples=num_images_per_category, ) @@ -417,19 +419,26 @@ def create_ann_folder(root, name, file_name_fn, num_examples): make_tar(root, f"{anns_root.name}.tar", anns_root) - return num_images_per_category * 
len(info.categories) + return num_images_per_category * len(categories) -# @register_mock -def caltech256(info, root, config): +@register_mock(configs=[dict()]) +def caltech256(root, config): dir = root / "256_ObjectCategories" num_images_per_category = 2 - for idx, category in enumerate(info.categories, 1): + categories = [ + (1, "ak47"), + (127, "laptop-101"), + (198, "spider"), + (257, "clutter"), + ] + + for category_idx, category in categories: files = create_image_folder( dir, - name=f"{idx:03d}.{category}", - file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg", + name=f"{category_idx:03d}.{category}", + file_name_fn=lambda image_idx: f"{category_idx:03d}_{image_idx + 1:04d}.jpg", num_examples=num_images_per_category, ) if category == "spider": @@ -437,7 +446,7 @@ def caltech256(info, root, config): make_tar(root, f"{dir.name}.tar", dir) - return num_images_per_category * len(info.categories) + return num_images_per_category * len(categories) @register_mock(configs=combinations_grid(split=("train", "val", "test"))) diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py index 4a409835b5e..e8c7f8a0bc7 100644 --- a/torchvision/prototype/datasets/_builtin/caltech.py +++ b/torchvision/prototype/datasets/_builtin/caltech.py @@ -1,6 +1,6 @@ import pathlib import re -from typing import Any, Dict, List, Tuple, BinaryIO +from typing import Any, Dict, List, Tuple, BinaryIO, Union import numpy as np from torchdata.datapipes.iter import ( @@ -9,26 +9,48 @@ Filter, IterKeyZipper, ) -from torchvision.prototype.datasets.utils import ( - Dataset, - DatasetConfig, - DatasetInfo, - HttpResource, - OnlineResource, +from torchvision.prototype.datasets.utils import Dataset2, DatasetInfo, HttpResource, OnlineResource +from torchvision.prototype.datasets.utils._internal import ( + INFINITE_BUFFER_SIZE, + read_mat, + hint_sharding, + hint_shuffling, + BUILTIN_DIR, ) -from 
torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE, read_mat, hint_sharding, hint_shuffling from torchvision.prototype.features import Label, BoundingBox, _Feature, EncodedImage +from .._api import register_dataset, register_info -class Caltech101(Dataset): - def _make_info(self) -> DatasetInfo: - return DatasetInfo( - "caltech101", - dependencies=("scipy",), - homepage="http://www.vision.caltech.edu/Image_Datasets/Caltech101", + +CALTECH101_CATEGORIES, *_ = zip(*DatasetInfo.read_categories_file(BUILTIN_DIR / "caltech101.categories")) + + +@register_info("caltech101") +def _caltech101_info() -> Dict[str, Any]: + return dict(categories=CALTECH101_CATEGORIES) + + +@register_dataset("caltech101") +class Caltech101(Dataset2): + """ + - **homepage**: http://www.vision.caltech.edu/Image_Datasets/Caltech101 + """ + + def __init__( + self, + root: Union[str, pathlib.Path], + skip_integrity_check: bool = False, + ) -> None: + self._categories = _caltech101_info()["categories"] + + super().__init__( + root, + # TODO: this will only be available after https://github.com/pytorch/vision/pull/5473 + # dependencies=("scipy",), + skip_integrity_check=skip_integrity_check, ) - def resources(self, config: DatasetConfig) -> List[OnlineResource]: + def _resources(self) -> List[OnlineResource]: images = HttpResource( "http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz", sha256="af6ece2f339791ca20f855943d8b55dd60892c0a25105fcd631ee3d6430f9926", @@ -88,7 +110,7 @@ def _prepare_sample( ann = read_mat(ann_buffer) return dict( - label=Label.from_category(category, categories=self.categories), + label=Label.from_category(category, categories=self._categories), image_path=image_path, image=image, ann_path=ann_path, @@ -98,12 +120,7 @@ def _prepare_sample( contour=_Feature(ann["obj_contour"].T), ) - def _make_datapipe( - self, - resource_dps: List[IterDataPipe], - *, - config: DatasetConfig, - ) -> IterDataPipe[Dict[str, Any]]: + def 
_datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: images_dp, anns_dp = resource_dps images_dp = Filter(images_dp, self._is_not_background_image) @@ -122,23 +139,42 @@ def _make_datapipe( ) return Mapper(dp, self._prepare_sample) - def _generate_categories(self, root: pathlib.Path) -> List[str]: - resources = self.resources(self.default_config) + def __len__(self) -> int: + return 8677 - dp = resources[0].load(root) + def _generate_categories(self) -> List[str]: + resources = self._resources() + + dp = resources[0].load(self._root) dp = Filter(dp, self._is_not_background_image) return sorted({pathlib.Path(path).parent.name for path, _ in dp}) -class Caltech256(Dataset): - def _make_info(self) -> DatasetInfo: - return DatasetInfo( - "caltech256", - homepage="http://www.vision.caltech.edu/Image_Datasets/Caltech256", - ) +CALTECH256_CATEGORIES, *_ = zip(*DatasetInfo.read_categories_file(BUILTIN_DIR / "caltech256.categories")) - def resources(self, config: DatasetConfig) -> List[OnlineResource]: + +@register_info("caltech256") +def _caltech256_info() -> Dict[str, Any]: + return dict(categories=CALTECH256_CATEGORIES) + + +@register_dataset("caltech256") +class Caltech256(Dataset2): + """ + - **homepage**: http://www.vision.caltech.edu/Image_Datasets/Caltech256 + """ + + def __init__( + self, + root: Union[str, pathlib.Path], + skip_integrity_check: bool = False, + ) -> None: + self._categories = _caltech256_info()["categories"] + + super().__init__(root, skip_integrity_check=skip_integrity_check) + + def _resources(self) -> List[OnlineResource]: return [ HttpResource( "http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar", @@ -156,25 +192,23 @@ def _prepare_sample(self, data: Tuple[str, BinaryIO]) -> Dict[str, Any]: return dict( path=path, image=EncodedImage.from_file(buffer), - label=Label(int(pathlib.Path(path).parent.name.split(".", 1)[0]) - 1, categories=self.categories), + 
label=Label(int(pathlib.Path(path).parent.name.split(".", 1)[0]) - 1, categories=self._categories), ) - def _make_datapipe( - self, - resource_dps: List[IterDataPipe], - *, - config: DatasetConfig, - ) -> IterDataPipe[Dict[str, Any]]: + def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: dp = resource_dps[0] dp = Filter(dp, self._is_not_rogue_file) dp = hint_shuffling(dp) dp = hint_sharding(dp) return Mapper(dp, self._prepare_sample) - def _generate_categories(self, root: pathlib.Path) -> List[str]: - resources = self.resources(self.default_config) + def __len__(self) -> int: + return 30607 + + def _generate_categories(self) -> List[str]: + resources = self._resources() - dp = resources[0].load(root) + dp = resources[0].load(self._root) dir_names = {pathlib.Path(path).parent.name for path, _ in dp} return [name.split(".")[1] for name in sorted(dir_names)] From 031ac5ccb2730773ec138e7f96f39badb04ee931 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 6 Apr 2022 15:06:22 +0200 Subject: [PATCH 2/2] resolve third party dependencies --- torchvision/prototype/datasets/_builtin/caltech.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py index e8c7f8a0bc7..3701063504f 100644 --- a/torchvision/prototype/datasets/_builtin/caltech.py +++ b/torchvision/prototype/datasets/_builtin/caltech.py @@ -34,6 +34,8 @@ def _caltech101_info() -> Dict[str, Any]: class Caltech101(Dataset2): """ - **homepage**: http://www.vision.caltech.edu/Image_Datasets/Caltech101 + - **dependencies**: + - <scipy `https://scipy.org/`>_ """ def __init__( @@ -45,8 +47,7 @@ def __init__( super().__init__( root, - # TODO: this will only be available after https://github.com/pytorch/vision/pull/5473 - # dependencies=("scipy",), + dependencies=("scipy",), skip_integrity_check=skip_integrity_check, )