From 237a707373471cd74131b8c4ec4b5e0ab4df946d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 5 Jan 2022 15:02:30 +0000 Subject: [PATCH 01/36] Change default of download for Food101 and DTD --- torchvision/datasets/dtd.py | 4 ++-- torchvision/datasets/food101.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/torchvision/datasets/dtd.py b/torchvision/datasets/dtd.py index ceacc64eedb..1970cd6dce4 100644 --- a/torchvision/datasets/dtd.py +++ b/torchvision/datasets/dtd.py @@ -23,7 +23,7 @@ class DTD(VisionDataset): download (bool, optional): If True, downloads the dataset from the internet and puts it in root directory. If dataset is already downloaded, it is not - downloaded again. + downloaded again. Default is False. transform (callable, optional): A function/transform that takes in a PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. @@ -37,7 +37,7 @@ def __init__( root: str, split: str = "train", partition: int = 1, - download: bool = True, + download: bool = False, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, ) -> None: diff --git a/torchvision/datasets/food101.py b/torchvision/datasets/food101.py index cffe0c50a06..fa194d56468 100644 --- a/torchvision/datasets/food101.py +++ b/torchvision/datasets/food101.py @@ -21,6 +21,9 @@ class Food101(VisionDataset): Args: root (string): Root directory of the dataset. split (string, optional): The dataset split, supports ``"train"`` (default) and ``"test"``. + download (bool, optional): If True, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. Default is False. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. 
@@ -33,7 +36,7 @@ def __init__( self, root: str, split: str = "train", - download: bool = True, + download: bool = False, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, ) -> None: From bc3be4eef377bfde3f796b78a4faa3c31725b128 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 7 Jan 2022 16:49:35 +0000 Subject: [PATCH 02/36] WIP --- .../prototype/datasets/_builtin/gtsrb.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 torchvision/prototype/datasets/_builtin/gtsrb.py diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py new file mode 100644 index 00000000000..dc1959bc24d --- /dev/null +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -0,0 +1,59 @@ +import functools +import io +from typing import Any, Callable, Dict, List, Optional, Union, cast + +import torch +from torchdata.datapipes.iter import IterDataPipe, Mapper, CSVDictParser +from torchvision.prototype.datasets.decoder import raw +from torchvision.prototype.datasets.utils import ( + Dataset, + DatasetConfig, + DatasetInfo, + OnlineResource, + DatasetType, + HttpResource, +) +from torchvision.prototype.datasets.utils._internal import ( + hint_sharding, + hint_shuffling, + image_buffer_from_array, +) +from torchvision.prototype.features import Label, Image + + +class GTSRB(Dataset): + def _make_info(self) -> DatasetInfo: + return DatasetInfo( + "GTSRB", + type=DatasetType.RAW, + homepage="https://benchmark.ini.rub.de", + categories=( + "TO", + "DO", + ), # TODO + valid_options=dict(split=("train", "test")), + ) + + _URLS = { + "train": "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB-Training_fixed.zip", + "test": "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB_Final_Test_Images.zip", + "test_gt": "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB_Final_Test_GT.zip", + } + _CHECKSUMS = { + "train": "df4144942083645bd60b594de348aa6930126c3e0e5de09e39611630abf8455a", + "test": "48ba6fab7e877eb64eaf8de99035b0aaecfbc279bee23e35deca4ac1d0a837fa", + "test_gt": "f94e5a7614d75845c74c04ddb26b8796b9e483f43541dd95dd5b726504e16d6d", + } + + def resources(self, config: DatasetConfig) -> List[OnlineResource]: + rsrcs = [HttpResource(self._URLS[config.split], sha256=self._CHECKSUMS[config.split])] + + if config.split == "test": + rsrcs.append( + HttpResource( + self._URLS["test_gt"], + sha256=self._CHECKSUMS["test_gt"], + ) + ) + + return rsrcs From 87695d4114b92113a1997edc7b6d7289743fe775 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jan 2022 10:58:15 +0000 Subject: [PATCH 03/36] Set download default to False and put it at the end --- torchvision/datasets/clevr.py | 2 +- torchvision/datasets/country211.py | 2 +- torchvision/datasets/dtd.py | 8 ++++---- torchvision/datasets/eurosat.py | 13 +++++++------ torchvision/datasets/fgvc_aircraft.py | 8 ++++---- torchvision/datasets/flowers102.py | 8 ++++---- torchvision/datasets/food101.py | 8 ++++---- torchvision/datasets/oxford_iiit_pet.py | 2 +- torchvision/datasets/pcam.py | 2 +- torchvision/datasets/sun397.py | 8 ++++---- 10 files changed, 31 insertions(+), 30 deletions(-) diff --git a/torchvision/datasets/clevr.py b/torchvision/datasets/clevr.py index 7ba5ca6cc47..112765a6b5d 100644 --- a/torchvision/datasets/clevr.py +++ b/torchvision/datasets/clevr.py @@ -34,7 +34,7 @@ def __init__( split: str = "train", transform: Optional[Callable] = None, target_transform: Optional[Callable] 
= None, - download: bool = True, + download: bool = False, ) -> None: self._split = verify_str_arg(split, "split", ("train", "val", "test")) super().__init__(root, transform=transform, target_transform=target_transform) diff --git a/torchvision/datasets/country211.py b/torchvision/datasets/country211.py index 20b69bc729e..b5c650cb276 100644 --- a/torchvision/datasets/country211.py +++ b/torchvision/datasets/country211.py @@ -32,7 +32,7 @@ def __init__( split: str = "train", transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, - download: bool = True, + download: bool = False, ) -> None: self._split = verify_str_arg(split, "split", ("train", "valid", "test")) diff --git a/torchvision/datasets/dtd.py b/torchvision/datasets/dtd.py index 1970cd6dce4..deb27312573 100644 --- a/torchvision/datasets/dtd.py +++ b/torchvision/datasets/dtd.py @@ -21,12 +21,12 @@ class DTD(VisionDataset): The partition only changes which split each image belongs to. Thus, regardless of the selected partition, combining all splits will result in all images. - download (bool, optional): If True, downloads the dataset from the internet and - puts it in root directory. If dataset is already downloaded, it is not - downloaded again. Default is False. transform (callable, optional): A function/transform that takes in a PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. + download (bool, optional): If True, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. Default is False. """ _URL = "https://www.robots.ox.ac.uk/~vgg/data/dtd/download/dtd-r1.0.1.tar.gz" @@ -37,9 +37,9 @@ def __init__( root: str, split: str = "train", partition: int = 1, - download: bool = False, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, + download: bool = False, ) -> None: self._split = verify_str_arg(split, "split", ("train", "val", "test")) if not isinstance(partition, int) and not (1 <= partition <= 10): diff --git a/torchvision/datasets/eurosat.py b/torchvision/datasets/eurosat.py index d7876b7afd5..4096d0e2c66 100644 --- a/torchvision/datasets/eurosat.py +++ b/torchvision/datasets/eurosat.py @@ -1,5 +1,5 @@ import os -from typing import Any +from typing import Callable, Optional from .folder import ImageFolder from .utils import download_and_extract_archive @@ -10,13 +10,13 @@ class EuroSAT(ImageFolder): Args: root (string): Root directory of dataset where ``root/eurosat`` exists. - download (bool, optional): If True, downloads the dataset from the internet and - puts it in root directory. If dataset is already downloaded, it is not - downloaded again. Default is False. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop`` target_transform (callable, optional): A function/transform that takes in the target and transforms it. + download (bool, optional): If True, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. Default is False. 
""" url = "https://madm.dfki.de/files/sentinel/EuroSAT.zip" @@ -25,8 +25,9 @@ class EuroSAT(ImageFolder): def __init__( self, root: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, download: bool = False, - **kwargs: Any, ) -> None: self.root = os.path.expanduser(root) self._base_folder = os.path.join(self.root, "eurosat") @@ -38,7 +39,7 @@ def __init__( if not self._check_exists(): raise RuntimeError("Dataset not found. You can use download=True to download it") - super().__init__(self._data_folder, **kwargs) + super().__init__(self._data_folder, transform=transform, target_transform=target_transform) self.root = os.path.expanduser(root) def __len__(self) -> int: diff --git a/torchvision/datasets/fgvc_aircraft.py b/torchvision/datasets/fgvc_aircraft.py index 687d44fb7f0..d0bbf586639 100644 --- a/torchvision/datasets/fgvc_aircraft.py +++ b/torchvision/datasets/fgvc_aircraft.py @@ -26,15 +26,15 @@ class FGVCAircraft(VisionDataset): root (string): Root directory of the FGVC Aircraft dataset. split (string, optional): The dataset split, supports ``train``, ``val``, ``trainval`` and ``test``. - download (bool, optional): If True, downloads the dataset from the internet and - puts it in root directory. If dataset is already downloaded, it is not - downloaded again. annotation_level (str, optional): The annotation level, supports ``variant``, ``family`` and ``manufacturer``. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop`` target_transform (callable, optional): A function/transform that takes in the target and transforms it. + download (bool, optional): If True, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. """ _URL = "https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz" @@ -43,10 +43,10 @@ def __init__( self, root: str, split: str = "trainval", - download: bool = False, annotation_level: str = "variant", transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, + download: bool = False, ) -> None: super().__init__(root, transform=transform, target_transform=target_transform) self._split = verify_str_arg(split, "split", ("train", "val", "trainval", "test")) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 55347ffa550..8f4810e62e1 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -24,12 +24,12 @@ class Flowers102(VisionDataset): Args: root (string): Root directory of the dataset. split (string, optional): The dataset split, supports ``"train"`` (default), ``"val"``, or ``"test"``. - download (bool, optional): If true, downloads the dataset from the internet and - puts it in root directory. If dataset is already downloaded, it is not - downloaded again. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. + download (bool, optional): If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. 
""" _download_url_prefix = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/" @@ -44,9 +44,9 @@ def __init__( self, root: str, split: str = "train", - download: bool = True, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, + download: bool = False, ) -> None: super().__init__(root, transform=transform, target_transform=target_transform) self._split = verify_str_arg(split, "split", ("train", "val", "test")) diff --git a/torchvision/datasets/food101.py b/torchvision/datasets/food101.py index fa194d56468..1bb4d8094d5 100644 --- a/torchvision/datasets/food101.py +++ b/torchvision/datasets/food101.py @@ -21,12 +21,12 @@ class Food101(VisionDataset): Args: root (string): Root directory of the dataset. split (string, optional): The dataset split, supports ``"train"`` (default) and ``"test"``. - download (bool, optional): If True, downloads the dataset from the internet and - puts it in root directory. If dataset is already downloaded, it is not - downloaded again. Default is False. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. + download (bool, optional): If True, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. Default is False. """ _URL = "http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz" @@ -36,9 +36,9 @@ def __init__( self, root: str, split: str = "train", - download: bool = False, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, + download: bool = False, ) -> None: super().__init__(root, transform=transform, target_transform=target_transform) self._split = verify_str_arg(split, "split", ("train", "test")) diff --git a/torchvision/datasets/oxford_iiit_pet.py b/torchvision/datasets/oxford_iiit_pet.py index f7f77b997c2..733aa78256b 100644 --- a/torchvision/datasets/oxford_iiit_pet.py +++ b/torchvision/datasets/oxford_iiit_pet.py @@ -45,7 +45,7 @@ def __init__( transforms: Optional[Callable] = None, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, - download: bool = True, + download: bool = False, ): self._split = verify_str_arg(split, "split", ("trainval", "test")) if isinstance(target_types, str): diff --git a/torchvision/datasets/pcam.py b/torchvision/datasets/pcam.py index f9b9b6817bf..7238931d1f3 100644 --- a/torchvision/datasets/pcam.py +++ b/torchvision/datasets/pcam.py @@ -72,7 +72,7 @@ def __init__( split: str = "train", transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, - download: bool = True, + download: bool = False, ): try: import h5py # type: ignore[import] diff --git a/torchvision/datasets/sun397.py b/torchvision/datasets/sun397.py index da34351771f..2814ca80232 100644 --- a/torchvision/datasets/sun397.py +++ b/torchvision/datasets/sun397.py @@ -19,12 +19,12 @@ class SUN397(VisionDataset): split (string, optional): The dataset split, supports ``"train"`` (default) and ``"test"``. partition (int, optional): A valid partition can be an integer from 1 to 10 or None, for the entire dataset. - download (bool, optional): If true, downloads the dataset from the internet and - puts it in root directory. If dataset is already downloaded, it is not - downloaded again. 
transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. + download (bool, optional): If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. """ _DATASET_URL = "http://vision.princeton.edu/projects/2010/SUN/SUN397.tar.gz" @@ -37,9 +37,9 @@ def __init__( root: str, split: str = "train", partition: Optional[int] = 1, - download: bool = True, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, + download: bool = False, ) -> None: super().__init__(root, transform=transform, target_transform=target_transform) self.split = verify_str_arg(split, "split", ("train", "test")) From 1e6e37d860b38eebd89ca74de9b2ff104e401e61 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jan 2022 11:02:41 +0000 Subject: [PATCH 04/36] Keep stuff private --- torchvision/datasets/eurosat.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/torchvision/datasets/eurosat.py b/torchvision/datasets/eurosat.py index 4096d0e2c66..bec6df5312d 100644 --- a/torchvision/datasets/eurosat.py +++ b/torchvision/datasets/eurosat.py @@ -19,9 +19,6 @@ class EuroSAT(ImageFolder): downloaded again. Default is False. """ - url = "https://madm.dfki.de/files/sentinel/EuroSAT.zip" - md5 = "c8fa014336c82ac7804f0398fcb19387" - def __init__( self, root: str, @@ -54,4 +51,8 @@ def download(self) -> None: return os.makedirs(self._base_folder, exist_ok=True) - download_and_extract_archive(self.url, download_root=self._base_folder, md5=self.md5) + download_and_extract_archive( + "https://madm.dfki.de/files/sentinel/EuroSAT.zip", + download_root=self._base_folder, + md5="c8fa014336c82ac7804f0398fcb19387", + ) From 474546fe8e82ebcc5b7387893244766e1205bfe1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jan 2022 11:23:44 +0000 Subject: [PATCH 05/36] GTSRB: train -> split. 
Also use pathlib --- test/test_datasets.py | 10 +++--- torchvision/datasets/gtsrb.py | 61 ++++++++++++++++------------------- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index e306930aaf2..02876364651 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2397,17 +2397,17 @@ class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.GTSRB FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir: str, config): - root_folder = os.path.join(tmpdir, "GTSRB") + root_folder = os.path.join(tmpdir, "gtsrb") os.makedirs(root_folder, exist_ok=True) # Train data - train_folder = os.path.join(root_folder, "Training") + train_folder = os.path.join(root_folder, "GTSRB", "Training") os.makedirs(train_folder, exist_ok=True) - num_examples = 3 + num_examples = 3 if config["split"] == "train" else 4 classes = ("00000", "00042", "00012") for class_idx in classes: datasets_utils.create_image_folder( @@ -2419,7 +2419,7 @@ def inject_fake_data(self, tmpdir: str, config): total_number_of_examples = num_examples * len(classes) # Test data - test_folder = os.path.join(root_folder, "Final_Test", "Images") + test_folder = os.path.join(root_folder, "GTSRB", "Final_Test", "Images") os.makedirs(test_folder, exist_ok=True) with open(os.path.join(root_folder, "GT-final_test.csv"), "w") as csv_file: diff --git a/torchvision/datasets/gtsrb.py b/torchvision/datasets/gtsrb.py index d970a0b472d..8a8ee3bdd77 100644 --- a/torchvision/datasets/gtsrb.py +++ b/torchvision/datasets/gtsrb.py @@ -1,11 +1,11 @@ import csv -import os +import pathlib from typing import Any, Callable, Optional, Tuple import PIL from .folder import make_dataset -from .utils import download_and_extract_archive +from .utils import download_and_extract_archive, verify_str_arg from .vision import VisionDataset @@ -14,8 +14,7 @@ class GTSRB(VisionDataset): Args: root (string): Root directory of the dataset. - train (bool, optional): If True, creates dataset from training set, otherwise - creates from test set. + split (string, optional): The dataset split, supports ``"train"`` (default), or ``"test"``. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. @@ -24,23 +23,10 @@ class GTSRB(VisionDataset): downloaded again. 
""" - # Ground Truth for the test set - _gt_url = "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB_Final_Test_GT.zip" - _gt_csv = "GT-final_test.csv" - _gt_md5 = "fe31e9c9270bbcd7b84b7f21a9d9d9e5" - - # URLs for the test and train set - _urls = ( - "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB_Final_Test_Images.zip", - "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB-Training_fixed.zip", - ) - - _md5s = ("c7e4e6327067d32654124b0fe9e82185", "513f3c79a4c5141765e10e952eaa2478") - def __init__( self, root: str, - train: bool = True, + split: str = "train", transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, @@ -48,12 +34,11 @@ def __init__( super().__init__(root, transform=transform, target_transform=target_transform) - self.root = os.path.expanduser(root) - - self.train = train - - self._base_folder = os.path.join(self.root, type(self).__name__) - self._target_folder = os.path.join(self._base_folder, "Training" if self.train else "Final_Test/Images") + self._split = verify_str_arg(split, "split", ("train", "test")) + self._base_folder = pathlib.Path(root) / "gtsrb" + self._target_folder = ( + self._base_folder / "GTSRB" / ("Training" if self._split == "train" else "Final_Test/Images") + ) if download: self.download() @@ -61,12 +46,12 @@ def __init__( if not self._check_exists(): raise RuntimeError("Dataset not found. You can use download=True to download it") - if train: + if self._split == "train": samples = make_dataset(self._target_folder, extensions=(".ppm",)) else: - with open(os.path.join(self._base_folder, self._gt_csv)) as csv_file: + with open(self._base_folder / "GT-final_test.csv") as csv_file: samples = [ - (os.path.join(self._target_folder, row["Filename"]), int(row["ClassId"])) + (self._target_folder / row["Filename"], int(row["ClassId"])) for row in csv.DictReader(csv_file, delimiter=";", skipinitialspace=True) ] @@ -91,16 +76,26 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: return sample, target def _check_exists(self) -> bool: - return os.path.exists(self._target_folder) and os.path.isdir(self._target_folder) + return self._target_folder.is_dir() def download(self) -> None: if self._check_exists(): return - download_and_extract_archive(self._urls[self.train], download_root=self.root, md5=self._md5s[self.train]) - - if not self.train: - # Download Ground Truth for the test set + if self._split == "train": + download_and_extract_archive( + "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB-Training_fixed.zip", + download_root=str(self._base_folder), + md5="513f3c79a4c5141765e10e952eaa2478", + ) + else: + download_and_extract_archive( + "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB_Final_Test_Images.zip", + download_root=str(self._base_folder), + md5="c7e4e6327067d32654124b0fe9e82185", + ) download_and_extract_archive( - self._gt_url, download_root=self.root, extract_root=self._base_folder, md5=self._gt_md5 + "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB_Final_Test_GT.zip", + download_root=str(self._base_folder), + md5="fe31e9c9270bbcd7b84b7f21a9d9d9e5", ) From a38a18b8c15486f8ae01960bf6bb7cd5f1901a5b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jan 2022 11:42:47 +0000 Subject: [PATCH 06/36] mypy --- mypy.ini | 4 ++++ torchvision/datasets/pcam.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/mypy.ini b/mypy.ini 
index a6000f8a9d5..931665240f3 100644 --- a/mypy.ini +++ b/mypy.ini @@ -117,3 +117,7 @@ ignore_missing_imports = True [mypy-torchdata.*] ignore_missing_imports = True + +[mypy-h5py.*] + +ignore_missing_imports = True diff --git a/torchvision/datasets/pcam.py b/torchvision/datasets/pcam.py index 7238931d1f3..4f124674961 100644 --- a/torchvision/datasets/pcam.py +++ b/torchvision/datasets/pcam.py @@ -75,7 +75,7 @@ def __init__( download: bool = False, ): try: - import h5py # type: ignore[import] + import h5py self.h5py = h5py except ImportError: From d58ef16d9ac4050955628b1db22f67113c8d824b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jan 2022 14:12:05 +0000 Subject: [PATCH 07/36] Remove split and partition for SUN397 --- test/test_datasets.py | 18 +----------------- torchvision/datasets/sun397.py | 30 ++++-------------------------- 2 files changed, 5 insertions(+), 43 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 02876364651..ca1579429be 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2281,11 +2281,6 @@ def inject_fake_data(self, tmpdir: str, config): class SUN397TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SUN397 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("train", "test"), - partition=(1, 10, None), - ) - def inject_fake_data(self, tmpdir: str, config): data_dir = pathlib.Path(tmpdir) / "SUN397" data_dir.mkdir() @@ -2308,18 +2303,7 @@ def inject_fake_data(self, tmpdir: str, config): with open(data_dir / "ClassName.txt", "w") as file: file.writelines("\n".join(f"/{cls[0]}/{cls}" for cls in sampled_classes)) - if config["partition"] is not None: - num_samples = max(len(im_paths) // (2 if config["split"] == "train" else 3), 1) - - with open(data_dir / f"{config['split'].title()}ing_{config['partition']:02d}.txt", "w") as file: - file.writelines( - "\n".join( - f"/{f_path.relative_to(data_dir).as_posix()}" - for f_path in random.choices(im_paths, k=num_samples) - ) - ) - else: - num_samples = len(im_paths) + num_samples = len(im_paths) return num_samples diff --git a/torchvision/datasets/sun397.py b/torchvision/datasets/sun397.py index 2814ca80232..cc3457fb16f 100644 --- a/torchvision/datasets/sun397.py +++ b/torchvision/datasets/sun397.py @@ -3,7 +3,7 @@ import PIL.Image -from .utils import verify_str_arg, download_and_extract_archive +from .utils import download_and_extract_archive from .vision import VisionDataset @@ -11,14 +11,10 @@ class SUN397(VisionDataset): """`The SUN397 Data Set `_. The SUN397 or Scene UNderstanding (SUN) is a dataset for scene recognition consisting of - 397 categories with 108'754 images. The dataset also provides 10 partitions for training - and testing, with each partition consisting of 50 images per class. + 397 categories with 108'754 images. Args: root (string): Root directory of the dataset. - split (string, optional): The dataset split, supports ``"train"`` (default) and ``"test"``. - partition (int, optional): A valid partition can be an integer from 1 to 10 or None, - for the entire dataset. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. 
@@ -29,27 +25,17 @@ class SUN397(VisionDataset): _DATASET_URL = "http://vision.princeton.edu/projects/2010/SUN/SUN397.tar.gz" _DATASET_MD5 = "8ca2778205c41d23104230ba66911c7a" - _PARTITIONS_URL = "https://vision.princeton.edu/projects/2010/SUN/download/Partitions.zip" - _PARTITIONS_MD5 = "29a205c0a0129d21f36cbecfefe81881" def __init__( self, root: str, - split: str = "train", - partition: Optional[int] = 1, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, ) -> None: super().__init__(root, transform=transform, target_transform=target_transform) - self.split = verify_str_arg(split, "split", ("train", "test")) - self.partition = partition self._data_dir = Path(self.root) / "SUN397" - if self.partition is not None: - if self.partition < 0 or self.partition > 10: - raise RuntimeError(f"The partition parameter should be an int in [1, 10] or None, got {partition}.") - if download: self._download() @@ -60,11 +46,7 @@ def __init__( self.classes = [c[3:].strip() for c in f] self.class_to_idx = dict(zip(self.classes, range(len(self.classes)))) - if self.partition is not None: - with open(self._data_dir / f"{self.split.title()}ing_{self.partition:02d}.txt", "r") as f: - self._image_files = [self._data_dir.joinpath(*line.strip()[1:].split("/")) for line in f] - else: - self._image_files = list(self._data_dir.rglob("sun_*.jpg")) + self._image_files = list(self._data_dir.rglob("sun_*.jpg")) self._labels = [ self.class_to_idx["/".join(path.relative_to(self._data_dir).parts[1:-1])] for path in self._image_files @@ -86,13 +68,9 @@ def __getitem__(self, idx) -> Tuple[Any, Any]: return image, label def _check_exists(self) -> bool: - return self._data_dir.exists() and self._data_dir.is_dir() - - def extra_repr(self) -> str: - return "Split: {split}".format(**self.__dict__) + return self._data_dir.is_dir() def _download(self) -> None: if self._check_exists(): return download_and_extract_archive(self._DATASET_URL, download_root=self.root, md5=self._DATASET_MD5) - download_and_extract_archive(self._PARTITIONS_URL, download_root=str(self._data_dir), md5=self._PARTITIONS_MD5) From 5061141aedad6930f9d2eb269924180508047784 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jan 2022 14:25:24 +0000 Subject: [PATCH 08/36] mypy --- torchvision/datasets/gtsrb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/datasets/gtsrb.py b/torchvision/datasets/gtsrb.py index 8a8ee3bdd77..52a84ac7bca 100644 --- a/torchvision/datasets/gtsrb.py +++ b/torchvision/datasets/gtsrb.py @@ -47,7 +47,7 @@ def __init__( raise RuntimeError("Dataset not found. 
You can use download=True to download it") if self._split == "train": - samples = make_dataset(self._target_folder, extensions=(".ppm",)) + samples = make_dataset(str(self._target_folder), extensions=(".ppm",)) else: with open(self._base_folder / "GT-final_test.csv") as csv_file: samples = [ From 6c02cff8ff277f92cf686ddc6c3eb89b5684f825 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jan 2022 15:08:24 +0000 Subject: [PATCH 09/36] mypy --- torchvision/datasets/gtsrb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/datasets/gtsrb.py b/torchvision/datasets/gtsrb.py index 52a84ac7bca..9a6dd934aa5 100644 --- a/torchvision/datasets/gtsrb.py +++ b/torchvision/datasets/gtsrb.py @@ -51,7 +51,7 @@ def __init__( else: with open(self._base_folder / "GT-final_test.csv") as csv_file: samples = [ - (self._target_folder / row["Filename"], int(row["ClassId"])) + (str(self._target_folder / row["Filename"]), int(row["ClassId"])) for row in csv.DictReader(csv_file, delimiter=";", skipinitialspace=True) ] From 521b75c8d973ca8f3eabc296e6ad25bc05e0140c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jan 2022 18:40:57 +0000 Subject: [PATCH 10/36] WIP --- .../prototype/datasets/_builtin/README.md | 150 +++++++++++++----- .../prototype/datasets/_builtin/__init__.py | 1 + .../prototype/datasets/_builtin/gtsrb.py | 29 +++- 3 files changed, 140 insertions(+), 40 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/README.md b/torchvision/prototype/datasets/_builtin/README.md index 20bcd7b89bb..4b69011bb57 100644 --- a/torchvision/prototype/datasets/_builtin/README.md +++ b/torchvision/prototype/datasets/_builtin/README.md @@ -1,14 +1,22 @@ # How to add new built-in prototype datasets -As the name implies, the datasets are still in a prototype state and thus subject to rapid change. This in turn means that this document will also change a lot. +As the name implies, the datasets are still in a prototype state and thus +subject to rapid change. This in turn means that this document will also change +a lot. -If you hit a blocker while adding a dataset, please have a look at another similar dataset to see how it is implemented there. If you can't resolve it yourself, feel free to send a draft PR in order for us to help you out. +If you hit a blocker while adding a dataset, please have a look at another +similar dataset to see how it is implemented there. If you can't resolve it +yourself, feel free to send a draft PR in order for us to help you out. Finally, `from torchvision.prototype import datasets` is implied below. ## Implementation -Before we start with the actual implementation, you should create a module in `torchvision/prototype/datasets/_builtin` that hints at the dataset you are going to add. For example `caltech.py` for `caltech101` and `caltech256`. In that module create a class that inherits from `datasets.utils.Dataset` and overwrites at minimum three methods that will be discussed in detail below: +Before we start with the actual implementation, you should create a module in +`torchvision/prototype/datasets/_builtin` that hints at the dataset you are +going to add. For example `caltech.py` for `caltech101` and `caltech256`. In +that module create a class that inherits from `datasets.utils.Dataset` and +overwrites at minimum three methods that will be discussed in detail below: ```python import io @@ -37,27 +45,54 @@ class MyDataset(Dataset): ### `_make_info(self)` -The `DatasetInfo` carries static information about the dataset. 
There are two required fields:
- `name`: Name of the dataset. This will be used to load the dataset with `datasets.load(name)`. Should only contain lower characters.
- `type`: Field of the `datasets.utils.DatasetType` enum. This is used to select the default decoder in case the user doesn't pass one. There are currently only two options: `IMAGE` and `RAW` ([see below](what-is-the-datasettyperaw-and-when-do-i-use-it) for details).
+The `DatasetInfo` carries static information about the dataset. There are two
+required fields:
+- `name`: Name of the dataset. This will be used to load the dataset with
+  `datasets.load(name)`. Should only contain lowercase characters.
+- `type`: Field of the `datasets.utils.DatasetType` enum. This is used to select
+  the default decoder in case the user doesn't pass one. There are currently
+  only two options: `IMAGE` and `RAW` ([see
+  below](#what-is-the-datasettyperaw-and-when-do-i-use-it) for details).

There are more optional parameters that can be passed:
- `dependencies`: Collection of third-party dependencies that are needed to load the dataset, e.g. `("scipy",)`. Their availability will be automatically checked if a user tries to load the dataset. Within the implementation, import these packages lazily to avoid missing dependencies at import time.
- `categories`: Sequence of human-readable category names for each label. The index of each category has to match the corresponding label returned in the dataset samples. [See below](#how-do-i-handle-a-dataset-that-defines-many-categories) how to handle cases with many categories.
- `valid_options`: Configures valid options that can be passed to the dataset. It should be `Dict[str, Sequence[str]]`. The options are accessible through the `config` namespace in the other two functions. First value of the sequence is taken as default if the user passes no option to `torchvision.prototype.datasets.load()`.
+- `dependencies`: Collection of third-party dependencies that are needed to load
+  the dataset, e.g. `("scipy",)`. Their availability will be automatically
+  checked if a user tries to load the dataset. Within the implementation, import
+  these packages lazily to avoid missing dependencies at import time.
+- `categories`: Sequence of human-readable category names for each label. The
+  index of each category has to match the corresponding label returned in the
+  dataset samples. [See
+  below](#how-do-i-handle-a-dataset-that-defines-many-categories) how to handle
+  cases with many categories.
+- `valid_options`: Configures valid options that can be passed to the dataset.
+  It should be `Dict[str, Sequence[Any]]`. The options are accessible through
+  the `config` namespace in the other two functions. First value of the sequence
+  is taken as default if the user passes no option to
+  `torchvision.prototype.datasets.load()`.

### `resources(self, config)`

-Returns `List[datasets.utils.OnlineResource]` of all the files that need to be present locally before the dataset with a specific `config` can be build. The download will happen automatically.
+Returns `List[datasets.utils.OnlineResource]` of all the files that need to be
+present locally before the dataset with a specific `config` can be built. The
+download will happen automatically.

Currently, the following `OnlineResource`'s are supported:
- `HttpResource`: Used for files that are directly exposed through HTTP(s) and only requires the URL.
- `GDriveResource`: Used for files that are hosted on GDrive and requires the GDrive ID as well as the `file_name`.
- `ManualDownloadResource`: Used files are not publicly accessible and requires instructions how to download them manually. If the file does not exist, an error will be raised with the supplied instructions.
-
-Although optional in general, all resources used in the built-in datasets should comprise [SHA256](https://en.wikipedia.org/wiki/SHA-2) checksum for security. It will be automatically checked after the download. You can compute the checksum with system utilities or this snippet:
+- `HttpResource`: Used for files that are directly exposed through HTTP(s) and
+  only requires the URL.
+- `GDriveResource`: Used for files that are hosted on GDrive and requires the
+  GDrive ID as well as the `file_name`.
+- `ManualDownloadResource`: Used for files that are not publicly accessible and
+  require instructions on how to download them manually. If the file does not
+  exist, an error will be raised with the supplied instructions.
+- `KaggleDownloadResource`: Used for files that are available on Kaggle. This
+  inherits from `ManualDownloadResource`.
+
+Although optional in general, all resources used in the built-in datasets should
+comprise a [SHA256](https://en.wikipedia.org/wiki/SHA-2) checksum for security.
+It will be automatically checked after the download. You can compute the
+checksum with system utilities, e.g. `sha256sum`, or this snippet:

```python
import hashlib

def sha256sum(path, chunk_size=1024 * 1024):
    checksum = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            checksum.update(chunk)
    return checksum.hexdigest()
```

### `_make_datapipe(resource_dps, *, config, decoder)`

-This method is the heart of the dataset that need to transform the raw data into a usable form. A major difference compared to the current stable datasets is that everything is performed through `IterDataPipe`'s. From the perspective of someone that is working with them rather than on them, `IterDataPipe`'s behave just as generators, i.e. you can't do anything with them besides iterating.
+This method is the heart of the dataset, where we transform the raw data into
+a usable form. A major difference compared to the current stable datasets is
+that everything is performed through `IterDataPipe`'s. From the perspective of
+someone that is working with them rather than on them, `IterDataPipe`'s behave
+just as generators, i.e. you can't do anything with them besides iterating.

-Of course, there are some common building blocks that should suffice in 95% of the cases. The most used
+Of course, there are some common building blocks that should suffice in 95% of
+the cases. The most used
are:
- `Mapper`: Apply a callable to every item in the datapipe.
- `Filter`: Keep only items that satisfy a condition.
- `Demultiplexer`: Split a datapipe into multiple ones.
- `IterKeyZipper`: Merge two datapipes into one.

-All of them can be imported `from torchdata.datapipes.iter`. In addition, use `functools.partial` in case a callable needs extra arguments. If the provided `IterDataPipe`'s are not sufficient for the use case, it is also not complicated to add one. See the MNIST or CelebA datasets for example.

-`make_datapipe()` receives `resource_dps`, which is a list of datapipes that has a 1-to-1 correspondence with the return value of `resources()`. In case of archives with regular suffixes (`.tar`, `.zip`, ...), the datapipe will contain tuples comprised of the path and the handle for every file in the archive. Otherwise the datapipe will only contain one of such tuples for the file specified by the resource.

-Since the datapipes are iterable in nature, some datapipes feature an in-memory buffer, e.g.
`IterKeyZipper` and `Grouper`. There are two issues with that: 1. If not used carefully, this can easily overflow the host memory, since most datasets will not fit in completely. 2. This can lead to unnecessarily long warm-up times when data is buffered that is only needed at runtime.

-Thus, all buffered datapipes should be used as early as possible, e.g. zipping two datapipes of file handles rather than trying to zip already loaded images.

-There are two special datapipes that are not used through their class, but through the functions `hint_sharding` and `hint_shuffling`. As the name implies they only hint part in the datapipe graph where sharding and shuffling should take place, but are no-ops by default. They can be imported from `torchvision.prototype.datasets.utils._internal` and are required in each dataset.

-Finally, each item in the final datapipe should be a dictionary with `str` keys. There is no standardization of the names (yet!).

+All of them can be imported `from torchdata.datapipes.iter`. In addition, use
+`functools.partial` in case a callable needs extra arguments. If the provided
+`IterDataPipe`'s are not sufficient for the use case, it is also not complicated
+to add one. See the MNIST or CelebA datasets for example.
+
+`make_datapipe()` receives `resource_dps`, which is a list of datapipes that has
+a 1-to-1 correspondence with the return value of `resources()`. In case of
+archives with regular suffixes (`.tar`, `.zip`, ...), the datapipe will contain
+tuples comprised of the path and the handle for every file in the archive.
+Otherwise the datapipe will only contain one of such tuples for the file
+specified by the resource.
+
+Since the datapipes are iterable in nature, some datapipes feature an in-memory
+buffer, e.g. `IterKeyZipper` and `Grouper`. There are two issues with that:
+1. If not used carefully, this can easily overflow the host memory, since most
+   datasets will not fit in completely.
+2. This can lead to unnecessarily long warm-up times when data is buffered that
+   is only needed at runtime.
+
+Thus, all buffered datapipes should be used as early as possible, e.g. zipping
+two datapipes of file handles rather than trying to zip already loaded images.
+
+There are two special datapipes that are not used through their class, but
+through the functions `hint_sharding` and `hint_shuffling`. As the name implies
+they only hint at the part of the datapipe graph where sharding and shuffling
+should take place, but are no-ops by default. They can be imported from
+`torchvision.prototype.datasets.utils._internal` and are required in each
+dataset.
+
+Finally, each item in the final datapipe should be a dictionary with `str` keys.
+There is no standardization of the names (yet!).
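To make the flow above concrete, a minimal sketch of how these building blocks compose — this is not code from this patch; the helper names (`_is_image`, `_collate`, `make_datapipe_sketch`) and the `.ppm` suffix are purely illustrative, loosely modeled on the GTSRB datapipe added later in this series:

```python
import functools
import pathlib

from torchdata.datapipes.iter import Filter, Mapper


def _is_image(data, *, suffix):
    # Each item coming out of an archive datapipe is a (path, handle) tuple.
    path, _ = data
    return pathlib.Path(path).suffix == suffix


def _collate(data):
    # The final items should be dictionaries with str keys.
    path, handle = data
    return {"image_path": path, "image": handle}


def make_datapipe_sketch(resource_dp):
    # Keep only the image files, then collate every item into a dict.
    dp = Filter(resource_dp, functools.partial(_is_image, suffix=".ppm"))
    return Mapper(dp, _collate)
```

In a real dataset this is also where the `hint_sharding` and `hint_shuffling` hints would be applied before the final collation step.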
## FAQ

### What is the `DatasetType.RAW` and when do I use it?

-`DatasetType.RAW` marks dataset that provides decoded, i.e. raw pixel values, rather than encoded image files such as
-`.jpg` or `.png`. This is usually only the case for small datasets, since it requires a lot more disk space. The default decoder `datasets.decoder.raw` is only a sentinel and should not be called directly.
+`DatasetType.RAW` marks datasets that provide decoded, i.e. raw pixel values,
+rather than encoded image files such as `.jpg` or `.png`. This is usually only
+the case for small datasets, since it requires a lot more disk space. The
+default decoder `datasets.decoder.raw` is only a sentinel and should not be
+called directly.

The decoding should look something like

```python
from torchvision.prototype.datasets.decoder import raw
...
```

For examples, have a look at the MNIST, CIFAR, or SEMEION datasets.

### How do I handle a dataset that defines many categories?

-As a rule of thumb, `datasets.utils.DatasetInfo(..., categories=)` should only be set directly for ten categories or fewer. If more categories are needed, you can add a `$NAME.categories` file to the `_builtin` folder in which each line specifies a category. If `$NAME` matches the name of the dataset (which it definitively should!) it will be automatically loaded if `categories=` is not set.

-In case the categories can be generated from the dataset files, e.g. the dataset follow an image folder approach where each folder denotes the name of the category, the dataset can overwrite the `_generate_categories` method. It gets passed the `root` path to the resources, but they have to be manually loaded, e.g. `self.resources(config)[0].load(root)`. The method should return a sequence of strings representing the category names. To generate the `$NAME.categories` file, run `python -m torchvision.prototype.datasets.generate_category_files $NAME`.
+As a rule of thumb, `datasets.utils.DatasetInfo(..., categories=)` should only
+be set directly for ten categories or fewer. If more categories are needed, you
+can add a `$NAME.categories` file to the `_builtin` folder in which each line
+specifies a category. If `$NAME` matches the name of the dataset (which it
+definitely should!) it will be automatically loaded if `categories=` is not
+set.
+
+In case the categories can be generated from the dataset files, e.g. the dataset
+follows an image folder approach where each folder denotes the name of the
+category, the dataset can overwrite the `_generate_categories` method. It gets
+passed the `root` path to the resources, but they have to be manually loaded,
+e.g. `self.resources(config)[0].load(root)`. The method should return a sequence
+of strings representing the category names. To generate the `$NAME.categories`
+file, run `python -m torchvision.prototype.datasets.generate_category_files
+$NAME`.

### What if a resource file forms an I/O bottleneck?

-In general, we are ok with small performance hits of iterating archives rather than their extracted content. However, if the performance hit becomes significant, the archives can still be decompressed or extracted. To do this, the `decompress: bool` and `extract: bool` flags can be used for every `OnlineResource` individually. For more complex cases, each resource also accepts a `preprocess` callable that gets passed a `pathlib.Path` of the raw file and should return `pathlib.Path` of the preprocessed file or folder.
+In general, we are ok with small performance hits of iterating archives rather
+than their extracted content. However, if the performance hit becomes
+significant, the archives can still be decompressed or extracted. To do this,
+the `decompress: bool` and `extract: bool` flags can be used for every
+`OnlineResource` individually. For more complex cases, each resource also
+accepts a `preprocess` callable that gets passed a `pathlib.Path` of the raw
+file and should return `pathlib.Path` of the preprocessed file or folder.
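To illustrate the `preprocess` hook described in the last answer, a minimal sketch rather than this patch's code — the URL and checksum are placeholders, the pass-through preprocessing step is hypothetical, and the keyword name follows the description above:

```python
import pathlib

from torchvision.prototype.datasets.utils import HttpResource


def _preprocess(path: pathlib.Path) -> pathlib.Path:
    # Receives the raw downloaded file; whatever path is returned here is
    # what the datapipes will see. A real implementation might repack or
    # re-encode the archive; this skeleton passes the file through unchanged.
    return path


resource = HttpResource(
    "https://example.com/data.tar.gz",  # placeholder URL
    sha256="0" * 64,  # placeholder checksum
    preprocess=_preprocess,
)
```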
diff --git a/torchvision/prototype/datasets/_builtin/__init__.py b/torchvision/prototype/datasets/_builtin/__init__.py index a8be77351e7..465ecbbbf70 100644 --- a/torchvision/prototype/datasets/_builtin/__init__.py +++ b/torchvision/prototype/datasets/_builtin/__init__.py @@ -6,6 +6,7 @@ from .cub200 import CUB200 from .dtd import DTD from .fer2013 import FER2013 +from .gtsrb import GTSRB from .imagenet import ImageNet from .mnist import MNIST, FashionMNIST, KMNIST, EMNIST, QMNIST from .oxford_iiit_pet import OxfordIITPet diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index dc1959bc24d..9a7dc3b015f 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -1,9 +1,12 @@ -import functools import io -from typing import Any, Callable, Dict, List, Optional, Union, cast +import pathlib +import resource +from functools import partial +from importlib import resources +from typing import Any, Callable, Dict, List, Optional, Union, cast, Tuple import torch -from torchdata.datapipes.iter import IterDataPipe, Mapper, CSVDictParser +from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, IterKeyZipper, Demultiplexer, CSVDictParser from torchvision.prototype.datasets.decoder import raw from torchvision.prototype.datasets.utils import ( Dataset, @@ -24,7 +27,7 @@ class GTSRB(Dataset): def _make_info(self) -> DatasetInfo: return DatasetInfo( - "GTSRB", + "gtsrb", type=DatasetType.RAW, homepage="https://benchmark.ini.rub.de", categories=( @@ -57,3 +60,21 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]: ) return rsrcs + + def _filter_images(self, data: Tuple[str, Any]) -> bool: + return pathlib.Path(data[0]).suffix == ".ppm" + + def _collate(self, data: Tuple[str, Any]): + return {"image_path": data[0], "image": "LMAO YOU WISH", "label": pathlib.Path(data[0]).parent.stem} + + def _make_datapipe( + self, + resource_dps: List[IterDataPipe], + *, + config: DatasetConfig, + decoder: Optional[Callable[[io.IOBase], torch.Tensor]], + ) -> IterDataPipe[Dict[str, Any]]: + dp = resource_dps[0] + dp = Filter(dp, self._filter_images) + dp = Mapper(dp, self._collate) + return dp From 1c1ceb0df5ff661f53807aa364f4f8f559930a92 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 11:33:59 +0000 Subject: [PATCH 11/36] WIP --- .../datasets/_builtin/gtsrb.categories | 43 ++++++++++++ .../prototype/datasets/_builtin/gtsrb.py | 65 ++++++++++++++----- 2 files changed, 92 insertions(+), 16 deletions(-) create mode 100644 torchvision/prototype/datasets/_builtin/gtsrb.categories diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.categories b/torchvision/prototype/datasets/_builtin/gtsrb.categories new file mode 100644 index 00000000000..a9ea9ee8f08 --- /dev/null +++ b/torchvision/prototype/datasets/_builtin/gtsrb.categories @@ -0,0 +1,43 @@ +00000 +00001 +00002 +00003 +00004 +00005 +00006 +00007 +00008 +00009 +00010 +00011 +00012 +00013 +00014 +00015 +00016 +00017 +00018 +00019 +00020 +00021 +00022 +00023 +00024 +00025 +00026 +00027 +00028 +00029 +00030 +00031 +00032 +00033 +00034 +00035 +00036 +00037 +00038 +00039 +00040 +00041 +00042 diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 9a7dc3b015f..f62daa5d2ed 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -1,13 +1,10 @@ import io import pathlib -import resource 
from functools import partial -from importlib import resources -from typing import Any, Callable, Dict, List, Optional, Union, cast, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import torch -from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, IterKeyZipper, Demultiplexer, CSVDictParser -from torchvision.prototype.datasets.decoder import raw +from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, IterKeyZipper from torchvision.prototype.datasets.utils import ( Dataset, DatasetConfig, @@ -17,18 +14,18 @@ HttpResource, ) from torchvision.prototype.datasets.utils._internal import ( - hint_sharding, - hint_shuffling, - image_buffer_from_array, + INFINITE_BUFFER_SIZE, + path_accessor, + getitem, ) -from torchvision.prototype.features import Label, Image +from torchvision.prototype.features import Label class GTSRB(Dataset): def _make_info(self) -> DatasetInfo: return DatasetInfo( "gtsrb", - type=DatasetType.RAW, + type=DatasetType.IMAGE, homepage="https://benchmark.ini.rub.de", categories=( "TO", @@ -58,14 +55,23 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]: sha256=self._CHECKSUMS["test_gt"], ) ) - return rsrcs def _filter_images(self, data: Tuple[str, Any]) -> bool: return pathlib.Path(data[0]).suffix == ".ppm" - def _collate(self, data: Tuple[str, Any]): - return {"image_path": data[0], "image": "LMAO YOU WISH", "label": pathlib.Path(data[0]).parent.stem} + def _append_label_train(self, path_and_handle: Tuple[str, Any]): + path = path_and_handle[0] + label = int(pathlib.Path(path).parent.stem) + return *path_and_handle, label + + def _append_label_test(self, path_and_handle, csv_info): + label = int(csv_info["ClassId"]) + return *path_and_handle, label + + def _collate(self, data, decoder): + image_path, image_buffer, label = data + return {"image_path": image_path, "image": decoder(image_buffer), "label": Label(label)} def _make_datapipe( self, @@ -74,7 +80,34 @@ def _make_datapipe( config: DatasetConfig, decoder: Optional[Callable[[io.IOBase], torch.Tensor]], ) -> IterDataPipe[Dict[str, Any]]: - dp = resource_dps[0] - dp = Filter(dp, self._filter_images) - dp = Mapper(dp, self._collate) + + if config["split"] == "train": + images_dp = resource_dps[0] + images_dp = Filter(images_dp, self._filter_images) + dp = Mapper(images_dp, self._append_label_train) # path, handle, label + else: + images_dp, gt_dp = resource_dps + dp = Filter(images_dp, self._filter_images) + + gt_dp = CSVDictParser(gt_dp, fieldnames=("Filename", "ClassId"), delimiter=";") + + dp = IterKeyZipper( + images_dp, + gt_dp, + key_fn=path_accessor("name"), + ref_key_fn=getitem("Filename"), + buffer_size=INFINITE_BUFFER_SIZE, + merge_fn=self._append_label_test, + ) # path, handle, label + + dp = Mapper(dp, partial(self._collate, decoder=decoder)) return dp + + def _generate_categories(self, root: pathlib.Path) -> List[str]: + config = self.default_config + + images_dp = self.resources(config)[0].load(root) + images_dp = Filter(images_dp, self._filter_images) + + labels = sorted(set(pathlib.Path(path).parent.stem for path, _ in images_dp)) + return labels From 4fdb976705b138378f4c818a553761f2233327f1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 11:34:56 +0000 Subject: [PATCH 12/36] WIP --- torchvision/prototype/datasets/_builtin/gtsrb.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 
f62daa5d2ed..e38c0c24cea 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -4,7 +4,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch -from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, IterKeyZipper +from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, IterKeyZipper, CSVDictParser from torchvision.prototype.datasets.utils import ( Dataset, DatasetConfig, @@ -27,10 +27,6 @@ def _make_info(self) -> DatasetInfo: "gtsrb", type=DatasetType.IMAGE, homepage="https://benchmark.ini.rub.de", - categories=( - "TO", - "DO", - ), # TODO valid_options=dict(split=("train", "test")), ) From a6ae4c4d6abe42a77a3af3f274cd52f51010aeaf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 13:05:00 +0000 Subject: [PATCH 13/36] Add tests --- test/builtin_dataset_mocks.py | 51 +++++++++++++++++++ test/test_prototype_builtin_datasets.py | 2 +- .../prototype/datasets/_builtin/gtsrb.py | 5 ++ 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index fc980326307..1133d1b2c21 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -1021,6 +1021,57 @@ def fer2013(info, root, config): return num_samples +@DATASET_MOCKS.set_from_named_callable +def gtsrb(info, root, config): + num_examples_per_class = 5 if config.split == "train" else 3 + classes = ("00000", "00042", "00012") + num_examples = num_examples_per_class * len(classes) + + if config["split"] == "train": + train_folder = root / "GTSRB" / "Training" + train_folder.mkdir(parents=True) + + for class_idx in classes: + create_image_folder( + train_folder, + name=class_idx, + file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", + num_examples=num_examples_per_class, + ) + + make_zip(root, "GTSRB-Training_fixed.zip", train_folder) + else: + test_folder = root / "GTSRB" / "Final_Test" + test_folder.mkdir(parents=True) + + create_image_folder( + test_folder, + name="Images", + file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm", + num_examples=num_examples, + ) + + make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder) + + with open(root / "GT-final_test.csv", "w") as csv_file: + csv_file.write("Filename;Width;Height;Roi.X1;Roi.Y1;Roi.X2;Roi.Y2;ClassId\n") + for image_idx in range(num_examples): + row = [ + f"{image_idx:05d}.ppm", + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, len(classes) + 1, size=()).item(), + ] + csv_file.write(";".join(map(str, row)) + "\n") + make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv") + + return num_examples + + @DATASET_MOCKS.set_from_named_callable def clevr(info, root, config): data_folder = root / "CLEVR_v1.0" diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py index bebeaccaadd..3ae2057f47b 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_builtin_datasets.py @@ -114,7 +114,7 @@ def scan(graph): if type(dp) is annotation_dp_type: break else: - raise AssertionError(f"The dataset doesn't comprise a {annotation_dp_type.__name__}() datapipe.") + raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.") 
@parametrize_dataset_mocks(DATASET_MOCKS["qmnist"]) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index e38c0c24cea..b3a87641c40 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -14,6 +14,8 @@ HttpResource, ) from torchvision.prototype.datasets.utils._internal import ( + hint_sharding, + hint_shuffling, INFINITE_BUFFER_SIZE, path_accessor, getitem, @@ -96,6 +98,9 @@ def _make_datapipe( merge_fn=self._append_label_test, ) # path, handle, label + dp = hint_sharding(dp) + dp = hint_shuffling(dp) + dp = Mapper(dp, partial(self._collate, decoder=decoder)) return dp From 761e5d71108213de79d2c73a9781fa15e78ad529 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 13:14:10 +0000 Subject: [PATCH 14/36] Add some types --- torchvision/prototype/datasets/_builtin/gtsrb.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index b3a87641c40..15557c989ff 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -58,18 +58,24 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]: def _filter_images(self, data: Tuple[str, Any]) -> bool: return pathlib.Path(data[0]).suffix == ".ppm" - def _append_label_train(self, path_and_handle: Tuple[str, Any]): + def _append_label_train(self, path_and_handle: Tuple[str, Any]) -> Tuple[str, Any, int]: path = path_and_handle[0] label = int(pathlib.Path(path).parent.stem) return *path_and_handle, label - def _append_label_test(self, path_and_handle, csv_info): + def _append_label_test(self, path_and_handle: Tuple[str, Any], csv_info: Dict[str, Any]) -> Tuple[str, Any, int]: label = int(csv_info["ClassId"]) return *path_and_handle, label - def _collate(self, data, decoder): + def _collate( + self, data: Tuple[str, Any, int], decoder: Optional[Callable[[io.IOBase], torch.Tensor]] + ) -> Dict[str, Any]: image_path, image_buffer, label = data - return {"image_path": image_path, "image": decoder(image_buffer), "label": Label(label)} + return { + "image_path": image_path, + "image": decoder(image_buffer) if decoder else image_buffer, + "label": Label(label), + } def _make_datapipe( self, From 1dd6efe97d70a990eae6b476c7e70d2acbd583e1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 13:24:25 +0000 Subject: [PATCH 15/36] lmao mypy you funny lad --- torchvision/prototype/datasets/_builtin/gtsrb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 15557c989ff..0b4aff523fd 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -44,7 +44,7 @@ def _make_info(self) -> DatasetInfo: } def resources(self, config: DatasetConfig) -> List[OnlineResource]: - rsrcs = [HttpResource(self._URLS[config.split], sha256=self._CHECKSUMS[config.split])] + rsrcs: List[OnlineResource] = [HttpResource(self._URLS[config.split], sha256=self._CHECKSUMS[config.split])] if config.split == "test": rsrcs.append( @@ -53,6 +53,7 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]: sha256=self._CHECKSUMS["test_gt"], ) ) + return rsrcs def _filter_images(self, data: Tuple[str, Any]) -> bool: From a32ab88114c6ebe28f31f85fdf31ae98f2eb9298 Mon 
Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 13:35:23 +0000 Subject: [PATCH 16/36] fix unpacking --- torchvision/prototype/datasets/_builtin/gtsrb.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 0b4aff523fd..559095b8020 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -60,13 +60,14 @@ def _filter_images(self, data: Tuple[str, Any]) -> bool: return pathlib.Path(data[0]).suffix == ".ppm" def _append_label_train(self, path_and_handle: Tuple[str, Any]) -> Tuple[str, Any, int]: - path = path_and_handle[0] + path, handle = path_and_handle label = int(pathlib.Path(path).parent.stem) - return *path_and_handle, label + return path, handle, label def _append_label_test(self, path_and_handle: Tuple[str, Any], csv_info: Dict[str, Any]) -> Tuple[str, Any, int]: + path, handle = path_and_handle label = int(csv_info["ClassId"]) - return *path_and_handle, label + return path, handle, label def _collate( self, data: Tuple[str, Any, int], decoder: Optional[Callable[[io.IOBase], torch.Tensor]] From e487828367093997680d0e1057a5416561e21eaa Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 15:22:38 +0000 Subject: [PATCH 17/36] Use DictWriter --- test/builtin_dataset_mocks.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 1133d1b2c21..d7e593f4473 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -1054,19 +1054,23 @@ def gtsrb(info, root, config): make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder) with open(root / "GT-final_test.csv", "w") as csv_file: - csv_file.write("Filename;Width;Height;Roi.X1;Roi.Y1;Roi.X2;Roi.Y2;ClassId\n") + csv_file.write("") + columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] + writer = csv.DictWriter(csv_file, fieldnames=columns, delimiter=";") + writer.writeheader() for image_idx in range(num_examples): - row = [ - f"{image_idx:05d}.ppm", - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, len(classes) + 1, size=()).item(), - ] - csv_file.write(";".join(map(str, row)) + "\n") + writer.writerow( + { + "Filename": f"{image_idx:05d}.ppm", + "Width": torch.randint(1, 100, size=()).item(), + "Height": torch.randint(1, 100, size=()).item(), + "Roi.X1": torch.randint(1, 100, size=()).item(), + "Roi.Y1": torch.randint(1, 100, size=()).item(), + "Roi.X2": torch.randint(1, 100, size=()).item(), + "Roi.Y2": torch.randint(1, 100, size=()).item(), + "ClassId": torch.randint(1, len(classes) + 1, size=()).item(), + } + ) make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv") return num_examples From 8f15cc3a3657a6ad5b9ab4920185613f6350af26 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 15:26:27 +0000 Subject: [PATCH 18/36] Hardcode categories since they are just ints in [0, 42] --- .../datasets/_builtin/gtsrb.categories | 43 ------------------- .../prototype/datasets/_builtin/gtsrb.py | 10 +---- 2 files changed, 1 insertion(+), 52 deletions(-) delete mode 100644 torchvision/prototype/datasets/_builtin/gtsrb.categories diff --git 
a/torchvision/prototype/datasets/_builtin/gtsrb.categories b/torchvision/prototype/datasets/_builtin/gtsrb.categories deleted file mode 100644 index a9ea9ee8f08..00000000000 --- a/torchvision/prototype/datasets/_builtin/gtsrb.categories +++ /dev/null @@ -1,43 +0,0 @@ -00000 -00001 -00002 -00003 -00004 -00005 -00006 -00007 -00008 -00009 -00010 -00011 -00012 -00013 -00014 -00015 -00016 -00017 -00018 -00019 -00020 -00021 -00022 -00023 -00024 -00025 -00026 -00027 -00028 -00029 -00030 -00031 -00032 -00033 -00034 -00035 -00036 -00037 -00038 -00039 -00040 -00041 -00042 diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 559095b8020..3402260fa40 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -29,6 +29,7 @@ def _make_info(self) -> DatasetInfo: "gtsrb", type=DatasetType.IMAGE, homepage="https://benchmark.ini.rub.de", + categories=[f"{label:05d}" for label in range(43)], valid_options=dict(split=("train", "test")), ) @@ -111,12 +112,3 @@ def _make_datapipe( dp = Mapper(dp, partial(self._collate, decoder=decoder)) return dp - - def _generate_categories(self, root: pathlib.Path) -> List[str]: - config = self.default_config - - images_dp = self.resources(config)[0].load(root) - images_dp = Filter(images_dp, self._filter_images) - - labels = sorted(set(pathlib.Path(path).parent.stem for path, _ in images_dp)) - return labels From 9ac22d33bcd2ffd712a2c48047881836efa3ac57 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 15:28:10 +0000 Subject: [PATCH 19/36] Split URL root --- torchvision/prototype/datasets/_builtin/gtsrb.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 3402260fa40..07c8adf9bd5 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -33,10 +33,11 @@ def _make_info(self) -> DatasetInfo: valid_options=dict(split=("train", "test")), ) + _URL_ROOT = "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/" _URLS = { - "train": "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB-Training_fixed.zip", - "test": "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB_Final_Test_Images.zip", - "test_gt": "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/GTSRB_Final_Test_GT.zip", + "train": f"{_URL_ROOT}GTSRB-Training_fixed.zip", + "test": f"{_URL_ROOT}GTSRB_Final_Test_Images.zip", + "test_gt": f"{_URL_ROOT}GTSRB_Final_Test_GT.zip", } _CHECKSUMS = { "train": "df4144942083645bd60b594de348aa6930126c3e0e5de09e39611630abf8455a", From 1f1fa3571cbdd6a76a9f9e1f08a2483f95f166f5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 15:28:27 +0000 Subject: [PATCH 20/36] Use name instead of stem --- torchvision/prototype/datasets/_builtin/gtsrb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 07c8adf9bd5..edbaa531575 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -63,7 +63,7 @@ def _filter_images(self, data: Tuple[str, Any]) -> bool: def _append_label_train(self, path_and_handle: Tuple[str, Any]) -> Tuple[str, Any, int]: path, handle = path_and_handle - label = 
int(pathlib.Path(path).parent.stem) + label = int(pathlib.Path(path).parent.name) return path, handle, label def _append_label_test(self, path_and_handle: Tuple[str, Any], csv_info: Dict[str, Any]) -> Tuple[str, Any, int]: From f25a83a1c1cce801f044049f031ae27be4862abd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 15:33:57 +0000 Subject: [PATCH 21/36] Add category to labels, and fix dict reading --- torchvision/prototype/datasets/_builtin/gtsrb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index edbaa531575..50992c22cd0 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -78,7 +78,7 @@ def _collate( return { "image_path": image_path, "image": decoder(image_buffer) if decoder else image_buffer, - "label": Label(label), + "label": Label(label, category=self.categories[label]), } def _make_datapipe( @@ -97,7 +97,8 @@ def _make_datapipe( images_dp, gt_dp = resource_dps dp = Filter(images_dp, self._filter_images) - gt_dp = CSVDictParser(gt_dp, fieldnames=("Filename", "ClassId"), delimiter=";") + fieldnames = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] + gt_dp = CSVDictParser(gt_dp, fieldnames=fieldnames, delimiter=";") dp = IterKeyZipper( images_dp, From 52ec648a1f1536676af8266fbe30de9dbcce1aa0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 17:31:46 +0000 Subject: [PATCH 22/36] Use path_comparator --- torchvision/prototype/datasets/_builtin/gtsrb.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 50992c22cd0..e72177c7f25 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -14,6 +14,7 @@ HttpResource, ) from torchvision.prototype.datasets.utils._internal import ( + path_comparator, hint_sharding, hint_shuffling, INFINITE_BUFFER_SIZE, @@ -58,9 +59,6 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]: return rsrcs - def _filter_images(self, data: Tuple[str, Any]) -> bool: - return pathlib.Path(data[0]).suffix == ".ppm" - def _append_label_train(self, path_and_handle: Tuple[str, Any]) -> Tuple[str, Any, int]: path, handle = path_and_handle label = int(pathlib.Path(path).parent.name) @@ -89,13 +87,13 @@ def _make_datapipe( decoder: Optional[Callable[[io.IOBase], torch.Tensor]], ) -> IterDataPipe[Dict[str, Any]]: + images_dp = resource_dps[0] + images_dp = Filter(images_dp, path_comparator("suffix", ".ppm")) + if config["split"] == "train": - images_dp = resource_dps[0] - images_dp = Filter(images_dp, self._filter_images) dp = Mapper(images_dp, self._append_label_train) # path, handle, label else: - images_dp, gt_dp = resource_dps - dp = Filter(images_dp, self._filter_images) + gt_dp = resource_dps[1] fieldnames = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] gt_dp = CSVDictParser(gt_dp, fieldnames=fieldnames, delimiter=";") From 379876fd0ea7eb71ed632cd587f5d8de2c951cc7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Jan 2022 17:56:07 +0000 Subject: [PATCH 23/36] Use buffer_size=1 --- test/datasets_utils.py | 2 +- torchvision/prototype/datasets/_builtin/gtsrb.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/test/datasets_utils.py 
b/test/datasets_utils.py index b87d50ca3db..9de45bccf16 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -877,7 +877,7 @@ def _make_archive(root, name, *files_or_dirs, opener, adder, remove=True): files, dirs = _split_files_or_dirs(root, *files_or_dirs) with opener(archive) as fh: - for file in files: + for file in sorted(files): adder(fh, file, file.relative_to(root)) if remove: diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index e72177c7f25..251062f284b 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -17,7 +17,6 @@ path_comparator, hint_sharding, hint_shuffling, - INFINITE_BUFFER_SIZE, path_accessor, getitem, ) @@ -103,7 +102,7 @@ def _make_datapipe( gt_dp, key_fn=path_accessor("name"), ref_key_fn=getitem("Filename"), - buffer_size=INFINITE_BUFFER_SIZE, + buffer_size=1, merge_fn=self._append_label_test, ) # path, handle, label From e26b4564d3f4a47e35494a73d1bf79ba721946e0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 20 Jan 2022 15:34:54 +0000 Subject: [PATCH 24/36] Use Zipper instead of IterKeyZipper --- .../prototype/datasets/_builtin/gtsrb.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 251062f284b..2b2df919ef9 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -4,7 +4,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch -from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, IterKeyZipper, CSVDictParser +from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, CSVDictParser, Zipper from torchvision.prototype.datasets.utils import ( Dataset, DatasetConfig, @@ -17,8 +17,6 @@ path_comparator, hint_sharding, hint_shuffling, - path_accessor, - getitem, ) from torchvision.prototype.features import Label @@ -63,8 +61,8 @@ def _append_label_train(self, path_and_handle: Tuple[str, Any]) -> Tuple[str, An label = int(pathlib.Path(path).parent.name) return path, handle, label - def _append_label_test(self, path_and_handle: Tuple[str, Any], csv_info: Dict[str, Any]) -> Tuple[str, Any, int]: - path, handle = path_and_handle + def _append_label_test(self, data: Tuple[str, Any, Dict[str, Any]]) -> Tuple[str, Any, int]: + (path, handle), csv_info = data label = int(csv_info["ClassId"]) return path, handle, label @@ -95,16 +93,10 @@ def _make_datapipe( gt_dp = resource_dps[1] fieldnames = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] - gt_dp = CSVDictParser(gt_dp, fieldnames=fieldnames, delimiter=";") - - dp = IterKeyZipper( - images_dp, - gt_dp, - key_fn=path_accessor("name"), - ref_key_fn=getitem("Filename"), - buffer_size=1, - merge_fn=self._append_label_test, - ) # path, handle, label + gt_dp = CSVDictParser(gt_dp, fieldnames=fieldnames, delimiter=";", skip_lines=1) + + dp = Zipper(images_dp, gt_dp) + dp = Mapper(dp, self._append_label_test) # path, handle, label dp = hint_sharding(dp) dp = hint_shuffling(dp) From b958b6b582f658cecac07810963fa077a440d74f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 20 Jan 2022 15:41:14 +0000 Subject: [PATCH 25/36] mypy --- torchvision/prototype/datasets/_builtin/gtsrb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py 
b/torchvision/prototype/datasets/_builtin/gtsrb.py index 2b2df919ef9..621e0f3e0ea 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -61,7 +61,7 @@ def _append_label_train(self, path_and_handle: Tuple[str, Any]) -> Tuple[str, An label = int(pathlib.Path(path).parent.name) return path, handle, label - def _append_label_test(self, data: Tuple[str, Any, Dict[str, Any]]) -> Tuple[str, Any, int]: + def _append_label_test(self, data: Tuple[Tuple[str, Any], Dict[str, Any]]) -> Tuple[str, Any, int]: (path, handle), csv_info = data label = int(csv_info["ClassId"]) return path, handle, label From 06c090435c7874572f2ee88b2b995b63ae3ac68c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 20 Jan 2022 17:01:17 +0000 Subject: [PATCH 26/36] Some more instructions --- .../prototype/datasets/_builtin/README.md | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/README.md b/torchvision/prototype/datasets/_builtin/README.md index 4b69011bb57..97a7dfadbaf 100644 --- a/torchvision/prototype/datasets/_builtin/README.md +++ b/torchvision/prototype/datasets/_builtin/README.md @@ -41,7 +41,6 @@ class MyDataset(Dataset): decoder: Optional[Callable[[io.IOBase], torch.Tensor]], ) -> IterDataPipe[Dict[str, Any]]: ... -``` ### `_make_info(self)` @@ -154,6 +153,27 @@ There is no standardization of the names (yet!). ## FAQ +### How do I start? + +Get the skeleton of your dataset class ready with all 3 methods. For +`_make_datapipe()`, you can just do `return resources_dp[0]` to get started. +Then import the dataset class in +`torchvision/prototype/datasets/_builtin/__init__.py`: this will automatically +register the dataset and it will be instantiable via +`datasets.load("mydataset")`. On a separate script, try something like + +```py +d = datasets.load("mydataset") +for e in d: + print(e) # this is the content of the datapipe returned by _make_datapipe() +# Or you can also inspect e in a debugger +``` + +This will give you an idea of what the first datapipe in `resources_dp` +contains. You can also do that with `resources_dp[1]` or `resources_dp[2]` +(etc.) if they exist. Then follow the instructions above to manipulate these +datapipes and return the appropriate dict format. + ### What is the `DatasetType.RAW` and when do I use it? `DatasetType.RAW` marks dataset that provides decoded, i.e. raw pixel values, From 18b87e293e4c98a9ca4aa6425117af47a084866d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 20 Jan 2022 17:06:21 +0000 Subject: [PATCH 27/36] forgot backquotes --- torchvision/prototype/datasets/_builtin/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/torchvision/prototype/datasets/_builtin/README.md b/torchvision/prototype/datasets/_builtin/README.md index 97a7dfadbaf..b04591f3587 100644 --- a/torchvision/prototype/datasets/_builtin/README.md +++ b/torchvision/prototype/datasets/_builtin/README.md @@ -41,6 +41,7 @@ class MyDataset(Dataset): decoder: Optional[Callable[[io.IOBase], torch.Tensor]], ) -> IterDataPipe[Dict[str, Any]]: ... 
+``` ### `_make_info(self)` From 44bb8f16a77fc40747312c808e8c6e8129b4aaba Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Jan 2022 11:15:44 +0000 Subject: [PATCH 28/36] Apply suggestions from code review Co-authored-by: Philip Meier --- torchvision/prototype/datasets/_builtin/README.md | 12 ++++++++---- torchvision/prototype/datasets/_builtin/gtsrb.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/README.md b/torchvision/prototype/datasets/_builtin/README.md index b04591f3587..f9eb6b0e735 100644 --- a/torchvision/prototype/datasets/_builtin/README.md +++ b/torchvision/prototype/datasets/_builtin/README.md @@ -164,16 +164,20 @@ register the dataset and it will be instantiable via `datasets.load("mydataset")`. On a separate script, try something like ```py -d = datasets.load("mydataset") -for e in d: - print(e) # this is the content of the datapipe returned by _make_datapipe() +from torchvision.prototype import datasets + +dataset = datasets.load("mydataset") +for sample in dataset: + # this is the content of an item in datapipe returned by _make_datapipe() + print(sample) + break # Or you can also inspect e in a debugger ``` This will give you an idea of what the first datapipe in `resources_dp` contains. You can also do that with `resources_dp[1]` or `resources_dp[2]` (etc.) if they exist. Then follow the instructions above to manipulate these -datapipes and return the appropriate dict format. +datapipes and return the appropriate dictionary format. ### What is the `DatasetType.RAW` and when do I use it? diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 621e0f3e0ea..2248faa346b 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -88,9 +88,9 @@ def _make_datapipe( images_dp = Filter(images_dp, path_comparator("suffix", ".ppm")) if config["split"] == "train": - dp = Mapper(images_dp, self._append_label_train) # path, handle, label + dp = Mapper(images_dp, self._append_label_train) else: - gt_dp = resource_dps[1] + ground_truth_dp = resource_dps[1] fieldnames = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] gt_dp = CSVDictParser(gt_dp, fieldnames=fieldnames, delimiter=";", skip_lines=1) From c1ec16df42dff12eb4881e51769428684f0bfe1f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Jan 2022 11:20:00 +0000 Subject: [PATCH 29/36] gt -> ground_truth --- test/datasets_utils.py | 2 +- torchvision/prototype/datasets/_builtin/gtsrb.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 88eb4e17823..5cb43680cda 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -881,7 +881,7 @@ def _make_archive(root, name, *files_or_dirs, opener, adder, remove=True): files, dirs = _split_files_or_dirs(root, *files_or_dirs) with opener(archive) as fh: - for file in sorted(files): + for file in files: adder(fh, file, file.relative_to(root)) if remove: diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 2248faa346b..a696de58f22 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -35,12 +35,12 @@ def _make_info(self) -> DatasetInfo: _URLS = { "train": f"{_URL_ROOT}GTSRB-Training_fixed.zip", "test": f"{_URL_ROOT}GTSRB_Final_Test_Images.zip", 
- "test_gt": f"{_URL_ROOT}GTSRB_Final_Test_GT.zip", + "test_ground_truth": f"{_URL_ROOT}GTSRB_Final_Test_GT.zip", } _CHECKSUMS = { "train": "df4144942083645bd60b594de348aa6930126c3e0e5de09e39611630abf8455a", "test": "48ba6fab7e877eb64eaf8de99035b0aaecfbc279bee23e35deca4ac1d0a837fa", - "test_gt": "f94e5a7614d75845c74c04ddb26b8796b9e483f43541dd95dd5b726504e16d6d", + "test_ground_truth": "f94e5a7614d75845c74c04ddb26b8796b9e483f43541dd95dd5b726504e16d6d", } def resources(self, config: DatasetConfig) -> List[OnlineResource]: @@ -49,8 +49,8 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]: if config.split == "test": rsrcs.append( HttpResource( - self._URLS["test_gt"], - sha256=self._CHECKSUMS["test_gt"], + self._URLS["test_ground_truth"], + sha256=self._CHECKSUMS["test_ground_truth"], ) ) @@ -93,10 +93,10 @@ def _make_datapipe( ground_truth_dp = resource_dps[1] fieldnames = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] - gt_dp = CSVDictParser(gt_dp, fieldnames=fieldnames, delimiter=";", skip_lines=1) + ground_truth_dp = CSVDictParser(ground_truth_dp, fieldnames=fieldnames, delimiter=";", skip_lines=1) - dp = Zipper(images_dp, gt_dp) - dp = Mapper(dp, self._append_label_test) # path, handle, label + dp = Zipper(images_dp, ground_truth_dp) + dp = Mapper(dp, self._append_label_test) dp = hint_sharding(dp) dp = hint_shuffling(dp) From ff78c70ed9359e28d25e2d4bb77361e77d0f337a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Jan 2022 11:26:55 +0000 Subject: [PATCH 30/36] e -> sample --- torchvision/prototype/datasets/_builtin/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/README.md b/torchvision/prototype/datasets/_builtin/README.md index f9eb6b0e735..8ee6e8e5a66 100644 --- a/torchvision/prototype/datasets/_builtin/README.md +++ b/torchvision/prototype/datasets/_builtin/README.md @@ -168,10 +168,9 @@ from torchvision.prototype import datasets dataset = datasets.load("mydataset") for sample in dataset: - # this is the content of an item in datapipe returned by _make_datapipe() - print(sample) + print(sample) # this is the content of an item in datapipe returned by _make_datapipe() break -# Or you can also inspect e in a debugger +# Or you can also inspect the sample in a debugger ``` This will give you an idea of what the first datapipe in `resources_dp` From cd38e250deb9ec4e8a6008972f5cf3f05e87b38d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Jan 2022 14:54:58 +0000 Subject: [PATCH 31/36] Add support for bboxes --- test/builtin_dataset_mocks.py | 53 ++++++++++++------- .../prototype/datasets/_builtin/gtsrb.py | 51 ++++++++++-------- 2 files changed, 64 insertions(+), 40 deletions(-) diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 3f7590feafb..caf36c20941 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -1023,6 +1023,29 @@ def gtsrb(info, root, config): classes = ("00000", "00042", "00012") num_examples = num_examples_per_class * len(classes) + csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] + + def _make_ann_file(path, num_examples, class_idx): + if class_idx == "random": + class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item() + + with open(path, "w") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";") + writer.writeheader() + for image_idx in range(num_examples): + writer.writerow( + { + 
"Filename": f"{image_idx:05d}.ppm", + "Width": torch.randint(1, 100, size=()).item(), + "Height": torch.randint(1, 100, size=()).item(), + "Roi.X1": torch.randint(1, 100, size=()).item(), + "Roi.Y1": torch.randint(1, 100, size=()).item(), + "Roi.X2": torch.randint(1, 100, size=()).item(), + "Roi.Y2": torch.randint(1, 100, size=()).item(), + "ClassId": class_idx, + } + ) + if config["split"] == "train": train_folder = root / "GTSRB" / "Training" train_folder.mkdir(parents=True) @@ -1034,7 +1057,11 @@ def gtsrb(info, root, config): file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", num_examples=num_examples_per_class, ) - + _make_ann_file( + path=train_folder / class_idx / f"GT-{class_idx}.csv", + num_examples=num_examples_per_class, + class_idx=int(class_idx), + ) make_zip(root, "GTSRB-Training_fixed.zip", train_folder) else: test_folder = root / "GTSRB" / "Final_Test" @@ -1049,24 +1076,12 @@ def gtsrb(info, root, config): make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder) - with open(root / "GT-final_test.csv", "w") as csv_file: - csv_file.write("") - columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] - writer = csv.DictWriter(csv_file, fieldnames=columns, delimiter=";") - writer.writeheader() - for image_idx in range(num_examples): - writer.writerow( - { - "Filename": f"{image_idx:05d}.ppm", - "Width": torch.randint(1, 100, size=()).item(), - "Height": torch.randint(1, 100, size=()).item(), - "Roi.X1": torch.randint(1, 100, size=()).item(), - "Roi.Y1": torch.randint(1, 100, size=()).item(), - "Roi.X2": torch.randint(1, 100, size=()).item(), - "Roi.Y2": torch.randint(1, 100, size=()).item(), - "ClassId": torch.randint(1, len(classes) + 1, size=()).item(), - } - ) + _make_ann_file( + path=root / "GT-final_test.csv", + num_examples=num_examples, + class_idx="random", + ) + make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv") return num_examples diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index a696de58f22..3e8c357e8fc 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -4,7 +4,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch -from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, CSVDictParser, Zipper +from torchdata.datapipes.iter import IterDataPipe, Mapper, Filter, CSVDictParser, Zipper, Demultiplexer from torchvision.prototype.datasets.utils import ( Dataset, DatasetConfig, @@ -17,8 +17,9 @@ path_comparator, hint_sharding, hint_shuffling, + INFINITE_BUFFER_SIZE, ) -from torchvision.prototype.features import Label +from torchvision.prototype.features import Label, BoundingBox class GTSRB(Dataset): @@ -61,21 +62,32 @@ def _append_label_train(self, path_and_handle: Tuple[str, Any]) -> Tuple[str, An label = int(pathlib.Path(path).parent.name) return path, handle, label - def _append_label_test(self, data: Tuple[Tuple[str, Any], Dict[str, Any]]) -> Tuple[str, Any, int]: - (path, handle), csv_info = data + def _collate_and_decode( + self, data: Tuple[Tuple[str, Any], Dict[str, Any]], decoder: Optional[Callable[[io.IOBase], torch.Tensor]] + ) -> Dict[str, Any]: + (image_path, image_buffer), csv_info = data label = int(csv_info["ClassId"]) - return path, handle, label - def _collate( - self, data: Tuple[str, Any, int], decoder: Optional[Callable[[io.IOBase], torch.Tensor]] - ) -> Dict[str, Any]: - image_path, image_buffer, label = data + bbox = 
BoundingBox( + torch.tensor([int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")]), format="xyxy" + ) + return { "image_path": image_path, "image": decoder(image_buffer) if decoder else image_buffer, "label": Label(label, category=self.categories[label]), + "bbox": bbox, } + def _classify_train_archive(self, data: Tuple[str, Any]) -> Optional[int]: + path = pathlib.Path(data[0]) + if path.suffix == ".ppm": + return 0 + elif path.suffix == ".csv": + return 1 + else: + return None + def _make_datapipe( self, resource_dps: List[IterDataPipe], @@ -84,22 +96,19 @@ def _make_datapipe( decoder: Optional[Callable[[io.IOBase], torch.Tensor]], ) -> IterDataPipe[Dict[str, Any]]: - images_dp = resource_dps[0] - images_dp = Filter(images_dp, path_comparator("suffix", ".ppm")) - - if config["split"] == "train": - dp = Mapper(images_dp, self._append_label_train) + if config.split == "train": + images_dp, ann_dp = Demultiplexer( + resource_dps[0], 2, self._classify_train_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE + ) else: - ground_truth_dp = resource_dps[1] - - fieldnames = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] - ground_truth_dp = CSVDictParser(ground_truth_dp, fieldnames=fieldnames, delimiter=";", skip_lines=1) + images_dp, ann_dp = resource_dps + images_dp = Filter(images_dp, path_comparator("suffix", ".ppm")) - dp = Zipper(images_dp, ground_truth_dp) - dp = Mapper(dp, self._append_label_test) + ann_dp = CSVDictParser(ann_dp, delimiter=";") + dp = Zipper(images_dp, ann_dp) dp = hint_sharding(dp) dp = hint_shuffling(dp) - dp = Mapper(dp, partial(self._collate, decoder=decoder)) + dp = Mapper(dp, partial(self._collate_and_decode, decoder=decoder)) return dp From 1e8aea6b512095d92f10bd1bf6b266cc39903c59 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Jan 2022 15:44:26 +0000 Subject: [PATCH 32/36] Update torchvision/prototype/datasets/_builtin/gtsrb.py Co-authored-by: Philip Meier --- torchvision/prototype/datasets/_builtin/gtsrb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 3e8c357e8fc..c8f9c058fee 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -69,7 +69,7 @@ def _collate_and_decode( label = int(csv_info["ClassId"]) bbox = BoundingBox( - torch.tensor([int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")]), format="xyxy" + torch.tensor([int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")]), format="xyxy", image_size=(int(csv_info["Height"]), int(csv_info["Width"])) ) return { From 8e9a617f9e48138858682ab8aff43710028278f4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Jan 2022 15:45:00 +0000 Subject: [PATCH 33/36] format --- torchvision/prototype/datasets/_builtin/gtsrb.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index c8f9c058fee..632256ad29e 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -69,7 +69,9 @@ def _collate_and_decode( label = int(csv_info["ClassId"]) bbox = BoundingBox( - torch.tensor([int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")]), format="xyxy", image_size=(int(csv_info["Height"]), int(csv_info["Width"])) + torch.tensor([int(csv_info[k]) for k in ("Roi.X1", 
"Roi.Y1", "Roi.X2", "Roi.Y2")]), + format="xyxy", + image_size=(int(csv_info["Height"]), int(csv_info["Width"])), ) return { From 6703710f008915b4229bd424a497dc8e84138c1a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Jan 2022 15:45:44 +0000 Subject: [PATCH 34/36] Remove unused method --- .../prototype/datasets/_builtin/gtsrb.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 632256ad29e..2ac30a389ed 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -57,10 +57,14 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]: return rsrcs - def _append_label_train(self, path_and_handle: Tuple[str, Any]) -> Tuple[str, Any, int]: - path, handle = path_and_handle - label = int(pathlib.Path(path).parent.name) - return path, handle, label + def _classify_train_archive(self, data: Tuple[str, Any]) -> Optional[int]: + path = pathlib.Path(data[0]) + if path.suffix == ".ppm": + return 0 + elif path.suffix == ".csv": + return 1 + else: + return None def _collate_and_decode( self, data: Tuple[Tuple[str, Any], Dict[str, Any]], decoder: Optional[Callable[[io.IOBase], torch.Tensor]] @@ -81,15 +85,6 @@ def _collate_and_decode( "bbox": bbox, } - def _classify_train_archive(self, data: Tuple[str, Any]) -> Optional[int]: - path = pathlib.Path(data[0]) - if path.suffix == ".ppm": - return 0 - elif path.suffix == ".csv": - return 1 - else: - return None - def _make_datapipe( self, resource_dps: List[IterDataPipe], From 6b67ce740f044fb4b38b5942147504db34348db7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Jan 2022 16:07:44 +0000 Subject: [PATCH 35/36] Add test for label matching --- test/datasets_utils.py | 2 +- test/test_prototype_builtin_datasets.py | 17 +++++++++++++++++ .../prototype/datasets/_builtin/gtsrb.py | 2 ++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 5cb43680cda..88eb4e17823 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -881,7 +881,7 @@ def _make_archive(root, name, *files_or_dirs, opener, adder, remove=True): files, dirs = _split_files_or_dirs(root, *files_or_dirs) with opener(archive) as fh: - for file in files: + for file in sorted(files): adder(fh, file, file.relative_to(root)) if remove: diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py index 9ceedc4a95d..f7ea4b3f6b2 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_builtin_datasets.py @@ -1,4 +1,5 @@ import io +from pathlib import Path import pytest import torch @@ -143,3 +144,19 @@ def test_extra_label(self, dataset_mock, config): ("unused", bool), ): assert key in sample and isinstance(sample[key], type) + + +@parametrize_dataset_mocks(DATASET_MOCKS["gtsrb"]) +class TestGTSRB: + def test_label_matches_path(self, dataset_mock, config): + # We read the labels from the csv files instead. But for the trainset, the labels are also part of the path. 
+        # This test makes sure that they're both the same
+        if config["split"] != "train":
+            return
+
+        with dataset_mock.prepare(config):
+            dataset = datasets.load(dataset_mock.name, **config)
+
+        for sample in dataset:
+            label_from_path = int(Path(sample["image_path"]).parent.name)
+            assert sample["label"] == label_from_path
diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py
index 2ac30a389ed..08855b3a2bd 100644
--- a/torchvision/prototype/datasets/_builtin/gtsrb.py
+++ b/torchvision/prototype/datasets/_builtin/gtsrb.py
@@ -101,6 +101,8 @@ def _make_datapipe(
             images_dp, ann_dp = resource_dps
             images_dp = Filter(images_dp, path_comparator("suffix", ".ppm"))

+        # The order of the image files in the .zip archives perfectly matches the order of the entries in
+        # the (possibly concatenated) .csv files. So we're able to use Zipper here instead of an IterKeyZipper.
         ann_dp = CSVDictParser(ann_dp, delimiter=";")
         dp = Zipper(images_dp, ann_dp)

From 1ef84e04e095c8b1e6d96c86da3f353363b91db1 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Mon, 24 Jan 2022 09:48:41 +0000
Subject: [PATCH 36/36] Update test/test_prototype_builtin_datasets.py

---
 test/test_prototype_builtin_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py
index f7ea4b3f6b2..fbebcbfd5cc 100644
--- a/test/test_prototype_builtin_datasets.py
+++ b/test/test_prototype_builtin_datasets.py
@@ -151,7 +151,7 @@ class TestGTSRB:
     def test_label_matches_path(self, dataset_mock, config):
         # We read the labels from the csv files instead. But for the trainset, the labels are also part of the path.
         # This test makes sure that they're both the same
-        if config["split"] != "train":
+        if config.split != "train":
             return

         with dataset_mock.prepare(config):
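
The mock annotation files in PATCH 17 and PATCH 31 are written with `csv.DictWriter` using `;` as the delimiter, matching the GTSRB ground-truth format. A standalone, stdlib-only sketch of that pattern; the output location and the three rows below are placeholders, not values from the patches:

```py
import csv
import pathlib
import random
import tempfile

columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"]
path = pathlib.Path(tempfile.mkdtemp()) / "GT-final_test.csv"

with open(path, "w", newline="") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=columns, delimiter=";")
    writer.writeheader()
    for image_idx in range(3):  # three placeholder rows
        row = {k: random.randint(1, 99) for k in columns[1:-1]}  # fake sizes / ROIs
        row["Filename"] = f"{image_idx:05d}.ppm"
        row["ClassId"] = random.randint(0, 42)  # GTSRB has 43 classes, 0..42
        writer.writerow(row)

print(path.read_text())
```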
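
PATCH 31 routes every entry of the training archive through `Demultiplexer` with `_classify_train_archive`, so the `.ppm` images and the per-class `GT-*.csv` files end up in separate datapipes. A minimal sketch of that routing under the same suffix rule; `IterableWrapper` and the example paths are only for illustration:

```py
import pathlib

from torchdata.datapipes.iter import Demultiplexer, IterableWrapper

def classify(data):
    # 0 -> images, 1 -> annotations, None -> dropped (because of drop_none=True below)
    return {".ppm": 0, ".csv": 1}.get(pathlib.Path(data[0]).suffix)

entries = IterableWrapper(
    [
        ("GTSRB/Training/00000/00000_00000.ppm", None),
        ("GTSRB/Training/00000/GT-00000.csv", None),
        ("GTSRB/Training/Readme.txt", None),  # classified as None, hence dropped
    ]
)
images_dp, ann_dp = Demultiplexer(entries, 2, classify, drop_none=True)
print([path for path, _ in images_dp])  # only the .ppm entry
print([path for path, _ in ann_dp])  # only the .csv entry
```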
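
In PATCH 31/32, `_collate_and_decode()` turns the four `Roi.*` columns of a CSV row into an `xyxy` box and records `image_size` as `(Height, Width)`. A plain-tensor sketch of just that parsing step; the row below is invented, and the prototype `BoundingBox` wrapper is left out:

```py
import torch

csv_info = {"Width": "53", "Height": "54", "Roi.X1": "6", "Roi.Y1": "5", "Roi.X2": "48", "Roi.Y2": "49"}

bbox = torch.tensor([int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")])  # xyxy order
image_size = (int(csv_info["Height"]), int(csv_info["Width"]))  # (height, width), as in the patch

print(bbox, image_size)  # tensor([ 6,  5, 48, 49]) (54, 53)
```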
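
The comment added in PATCH 35 states the invariant that makes `Zipper` safe: images and CSV rows arrive in the same order, so positional pairing can replace the key-based `IterKeyZipper` join. A minimal sketch of that invariant; `IterableWrapper`, the file names, and the class ids are made up:

```py
from torchdata.datapipes.iter import IterableWrapper, Zipper

images = IterableWrapper([("00000.ppm", "<handle>"), ("00001.ppm", "<handle>")])
annotations = IterableWrapper(
    [
        {"Filename": "00000.ppm", "ClassId": "16"},
        {"Filename": "00001.ppm", "ClassId": "1"},
    ]
)

for (path, handle), csv_info in Zipper(images, annotations):
    # Positional pairing is only correct because both streams share one order.
    assert path == csv_info["Filename"]
    print(path, int(csv_info["ClassId"]))
```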
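
Finally, the `TestGTSRB` check in PATCH 35/36 relies on the training layout, where each image sits in a folder named after its zero-padded class id. A tiny sketch of that extraction; the path is invented:

```py
import pathlib

image_path = "root/GTSRB/Training/00021/00021_00003.ppm"
label_from_path = int(pathlib.Path(image_path).parent.name)  # "00021" -> 21
assert label_from_path == 21
```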