From c506a01f152bda8ca9bab837884ca455b1aef303 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 6 Apr 2022 08:48:39 +0200
Subject: [PATCH 1/2] migrate caltech prototype datasets

---
 test/builtin_dataset_mocks.py                 |  41 +++---
 .../prototype/datasets/_builtin/caltech.py    | 118 +++++++++++-------
 2 files changed, 101 insertions(+), 58 deletions(-)

diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py
index ad979b6bd84..17e4ac62ca7 100644
--- a/test/builtin_dataset_mocks.py
+++ b/test/builtin_dataset_mocks.py
@@ -370,8 +370,8 @@ def cifar100(info, root, config):
     return len(train_files if config.split == "train" else test_files)
 
 
-# @register_mock
-def caltech101(info, root, config):
+@register_mock(configs=[dict()])
+def caltech101(root, config):
     def create_ann_file(root, name):
         import scipy.io
 
@@ -390,15 +390,17 @@ def create_ann_folder(root, name, file_name_fn, num_examples):
     images_root = root / "101_ObjectCategories"
     anns_root = root / "Annotations"
 
-    ann_category_map = {
-        "Faces_2": "Faces",
-        "Faces_3": "Faces_easy",
-        "Motorbikes_16": "Motorbikes",
-        "Airplanes_Side_2": "airplanes",
+    image_category_map = {
+        "Faces": "Faces_2",
+        "Faces_easy": "Faces_3",
+        "Motorbikes": "Motorbikes_16",
+        "airplanes": "Airplanes_Side_2",
     }
 
+    categories = ["Faces", "Faces_easy", "Motorbikes", "airplanes", "yin_yang"]
+
     num_images_per_category = 2
-    for category in info.categories:
+    for category in categories:
         create_image_folder(
             root=images_root,
             name=category,
@@ -407,7 +409,7 @@ def create_ann_folder(root, name, file_name_fn, num_examples):
         )
         create_ann_folder(
             root=anns_root,
-            name=ann_category_map.get(category, category),
+            name=image_category_map.get(category, category),
             file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat",
             num_examples=num_images_per_category,
         )
@@ -417,19 +419,26 @@ def create_ann_folder(root, name, file_name_fn, num_examples):
 
     make_tar(root, f"{anns_root.name}.tar", anns_root)
 
-    return num_images_per_category * len(info.categories)
+    return num_images_per_category * len(categories)
 
 
-# @register_mock
-def caltech256(info, root, config):
+@register_mock(configs=[dict()])
+def caltech256(root, config):
     dir = root / "256_ObjectCategories"
     num_images_per_category = 2
 
-    for idx, category in enumerate(info.categories, 1):
+    categories = [
+        (1, "ak47"),
+        (127, "laptop-101"),
+        (198, "spider"),
+        (257, "clutter"),
+    ]
+
+    for category_idx, category in categories:
         files = create_image_folder(
             dir,
-            name=f"{idx:03d}.{category}",
-            file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg",
+            name=f"{category_idx:03d}.{category}",
+            file_name_fn=lambda image_idx: f"{category_idx:03d}_{image_idx + 1:04d}.jpg",
             num_examples=num_images_per_category,
         )
         if category == "spider":
@@ -437,7 +446,7 @@ def caltech256(info, root, config):
 
     make_tar(root, f"{dir.name}.tar", dir)
 
-    return num_images_per_category * len(info.categories)
+    return num_images_per_category * len(categories)
 
 
 @register_mock(configs=combinations_grid(split=("train", "val", "test")))
diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py
index 4a409835b5e..e8c7f8a0bc7 100644
--- a/torchvision/prototype/datasets/_builtin/caltech.py
+++ b/torchvision/prototype/datasets/_builtin/caltech.py
@@ -1,6 +1,6 @@
 import pathlib
 import re
-from typing import Any, Dict, List, Tuple, BinaryIO
+from typing import Any, Dict, List, Tuple, BinaryIO, Union
 
 import numpy as np
 from torchdata.datapipes.iter import (
@@ -9,26 +9,48 @@
     Filter,
     IterKeyZipper,
 )
-from torchvision.prototype.datasets.utils import (
-    Dataset,
-    DatasetConfig,
-    DatasetInfo,
-    HttpResource,
-    OnlineResource,
+from torchvision.prototype.datasets.utils import Dataset2, DatasetInfo, HttpResource, OnlineResource
+from torchvision.prototype.datasets.utils._internal import (
+    INFINITE_BUFFER_SIZE,
+    read_mat,
+    hint_sharding,
+    hint_shuffling,
+    BUILTIN_DIR,
 )
-from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE, read_mat, hint_sharding, hint_shuffling
 from torchvision.prototype.features import Label, BoundingBox, _Feature, EncodedImage
 
+from .._api import register_dataset, register_info
 
-class Caltech101(Dataset):
-    def _make_info(self) -> DatasetInfo:
-        return DatasetInfo(
-            "caltech101",
-            dependencies=("scipy",),
-            homepage="http://www.vision.caltech.edu/Image_Datasets/Caltech101",
+
+CALTECH101_CATEGORIES, *_ = zip(*DatasetInfo.read_categories_file(BUILTIN_DIR / "caltech101.categories"))
+
+
+@register_info("caltech101")
+def _caltech101_info() -> Dict[str, Any]:
+    return dict(categories=CALTECH101_CATEGORIES)
+
+
+@register_dataset("caltech101")
+class Caltech101(Dataset2):
+    """
+    - **homepage**: http://www.vision.caltech.edu/Image_Datasets/Caltech101
+    """
+
+    def __init__(
+        self,
+        root: Union[str, pathlib.Path],
+        skip_integrity_check: bool = False,
+    ) -> None:
+        self._categories = _caltech101_info()["categories"]
+
+        super().__init__(
+            root,
+            # TODO: this will only be available after https://github.com/pytorch/vision/pull/5473
+            # dependencies=("scipy",),
+            skip_integrity_check=skip_integrity_check,
         )
 
-    def resources(self, config: DatasetConfig) -> List[OnlineResource]:
+    def _resources(self) -> List[OnlineResource]:
         images = HttpResource(
             "http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz",
             sha256="af6ece2f339791ca20f855943d8b55dd60892c0a25105fcd631ee3d6430f9926",
@@ -88,7 +110,7 @@ def _prepare_sample(
         ann = read_mat(ann_buffer)
 
         return dict(
-            label=Label.from_category(category, categories=self.categories),
+            label=Label.from_category(category, categories=self._categories),
             image_path=image_path,
             image=image,
             ann_path=ann_path,
@@ -98,12 +120,7 @@ def _prepare_sample(
             contour=_Feature(ann["obj_contour"].T),
         )
 
-    def _make_datapipe(
-        self,
-        resource_dps: List[IterDataPipe],
-        *,
-        config: DatasetConfig,
-    ) -> IterDataPipe[Dict[str, Any]]:
+    def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
         images_dp, anns_dp = resource_dps
 
         images_dp = Filter(images_dp, self._is_not_background_image)
@@ -122,23 +139,42 @@ def _make_datapipe(
         )
         return Mapper(dp, self._prepare_sample)
 
-    def _generate_categories(self, root: pathlib.Path) -> List[str]:
-        resources = self.resources(self.default_config)
+    def __len__(self) -> int:
+        return 8677
 
-        dp = resources[0].load(root)
+    def _generate_categories(self) -> List[str]:
+        resources = self._resources()
+
+        dp = resources[0].load(self._root)
         dp = Filter(dp, self._is_not_background_image)
 
         return sorted({pathlib.Path(path).parent.name for path, _ in dp})
 
 
-class Caltech256(Dataset):
-    def _make_info(self) -> DatasetInfo:
-        return DatasetInfo(
-            "caltech256",
-            homepage="http://www.vision.caltech.edu/Image_Datasets/Caltech256",
-        )
+CALTECH256_CATEGORIES, *_ = zip(*DatasetInfo.read_categories_file(BUILTIN_DIR / "caltech256.categories"))
 
-    def resources(self, config: DatasetConfig) -> List[OnlineResource]:
+
+@register_info("caltech256")
+def _caltech256_info() -> Dict[str, Any]:
+    return dict(categories=CALTECH256_CATEGORIES)
+
+
+@register_dataset("caltech256")
+class Caltech256(Dataset2):
+    """
+    - **homepage**: http://www.vision.caltech.edu/Image_Datasets/Caltech256
+    """
+
+    def __init__(
+        self,
+        root: Union[str, pathlib.Path],
+        skip_integrity_check: bool = False,
+    ) -> None:
+        self._categories = _caltech256_info()["categories"]
+
+        super().__init__(root, skip_integrity_check=skip_integrity_check)
+
+    def _resources(self) -> List[OnlineResource]:
         return [
             HttpResource(
                 "http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar",
@@ -156,25 +192,23 @@ def _prepare_sample(self, data: Tuple[str, BinaryIO]) -> Dict[str, Any]:
         return dict(
             path=path,
             image=EncodedImage.from_file(buffer),
-            label=Label(int(pathlib.Path(path).parent.name.split(".", 1)[0]) - 1, categories=self.categories),
+            label=Label(int(pathlib.Path(path).parent.name.split(".", 1)[0]) - 1, categories=self._categories),
         )
 
-    def _make_datapipe(
-        self,
-        resource_dps: List[IterDataPipe],
-        *,
-        config: DatasetConfig,
-    ) -> IterDataPipe[Dict[str, Any]]:
+    def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
         dp = resource_dps[0]
         dp = Filter(dp, self._is_not_rogue_file)
         dp = hint_shuffling(dp)
         dp = hint_sharding(dp)
         return Mapper(dp, self._prepare_sample)
 
-    def _generate_categories(self, root: pathlib.Path) -> List[str]:
-        resources = self.resources(self.default_config)
+    def __len__(self) -> int:
+        return 30607
+
+    def _generate_categories(self) -> List[str]:
+        resources = self._resources()
 
-        dp = resources[0].load(root)
+        dp = resources[0].load(self._root)
         dir_names = {pathlib.Path(path).parent.name for path, _ in dp}
 
         return [name.split(".")[1] for name in sorted(dir_names)]

From 031ac5ccb2730773ec138e7f96f39badb04ee931 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 6 Apr 2022 15:06:22 +0200
Subject: [PATCH 2/2] resolve third party dependencies

---
 torchvision/prototype/datasets/_builtin/caltech.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py
index e8c7f8a0bc7..3701063504f 100644
--- a/torchvision/prototype/datasets/_builtin/caltech.py
+++ b/torchvision/prototype/datasets/_builtin/caltech.py
@@ -34,6 +34,8 @@ def _caltech101_info() -> Dict[str, Any]:
 class Caltech101(Dataset2):
     """
     - **homepage**: http://www.vision.caltech.edu/Image_Datasets/Caltech101
+    - **dependencies**:
+        - `scipy <https://scipy.org/>`_
     """
 
     def __init__(
@@ -45,8 +47,7 @@ def __init__(
 
         super().__init__(
             root,
-            # TODO: this will only be available after https://github.com/pytorch/vision/pull/5473
-            # dependencies=("scipy",),
+            dependencies=("scipy",),
             skip_integrity_check=skip_integrity_check,
         )