pytorch
diff --git a/‎test/builtin_dataset_mocks.py‎
Lines changed: 70 additions & 0 deletions b/‎test/builtin_dataset_mocks.py‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎test/datasets_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎test/datasets_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎test/test_prototype_builtin_datasets.py‎
Lines changed: 18 additions & 1 deletion b/‎test/test_prototype_builtin_datasets.py‎
Lines changed: 18 additions & 1 deletion
@@ -1017,6 +1017,76 @@ def fer2013(info, root, config):
     return num_samples
 
 
+@DATASET_MOCKS.set_from_named_callable
+def gtsrb(info, root, config):
+    """Create mock GTSRB data for the split requested in ``config`` below ``root``.
+
+    For the ``"train"`` split, one image folder and one ``GT-<class>.csv`` annotation file are generated per class
+    under ``GTSRB/Training`` and handed to ``make_zip`` as ``GTSRB-Training_fixed.zip``. For every other split, all
+    images are generated in ``GTSRB/Final_Test/Images`` (``GTSRB_Final_Test_Images.zip``) and a single
+    ``GT-final_test.csv`` annotation file is written next to it (``GTSRB_Final_Test_GT.zip``).
+
+    Returns:
+        The total number of examples generated for the split.
+    """
+    # Evaluate the split once so that every branch below agrees on it.
+    is_train = config.split == "train"
+    num_examples_per_class = 5 if is_train else 3
+    classes = ("00000", "00042", "00012")
+    num_examples = num_examples_per_class * len(classes)
+
+    csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"]
+
+    def _make_ann_file(path, num_examples, class_idx):
+        # Writes a ";"-delimited annotation file with `num_examples` rows. Every row carries the same class id;
+        # passing "random" draws that id once for the whole file.
+        if class_idx == "random":
+            class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item()
+
+        # The csv module emits its own line terminators, so the file has to be opened with newline="". Without it,
+        # platforms that translate "\n" on write would produce "\r\r\n" row endings.
+        with open(path, "w", newline="") as csv_file:
+            writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";")
+            writer.writeheader()
+            for image_idx in range(num_examples):
+                writer.writerow(
+                    {
+                        "Filename": f"{image_idx:05d}.ppm",
+                        "Width": torch.randint(1, 100, size=()).item(),
+                        "Height": torch.randint(1, 100, size=()).item(),
+                        "Roi.X1": torch.randint(1, 100, size=()).item(),
+                        "Roi.Y1": torch.randint(1, 100, size=()).item(),
+                        "Roi.X2": torch.randint(1, 100, size=()).item(),
+                        "Roi.Y2": torch.randint(1, 100, size=()).item(),
+                        "ClassId": class_idx,
+                    }
+                )
+
+    if is_train:
+        train_folder = root / "GTSRB" / "Training"
+        train_folder.mkdir(parents=True)
+
+        for class_idx in classes:
+            # The lambda reads `class_idx` from the enclosing loop; this relies on create_image_folder() calling it
+            # before the next iteration starts.
+            create_image_folder(
+                train_folder,
+                name=class_idx,
+                file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm",
+                num_examples=num_examples_per_class,
+            )
+            # The annotation file sits inside the class folder and stores the class id as an integer.
+            _make_ann_file(
+                path=train_folder / class_idx / f"GT-{class_idx}.csv",
+                num_examples=num_examples_per_class,
+                class_idx=int(class_idx),
+            )
+        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
+    else:
+        test_folder = root / "GTSRB" / "Final_Test"
+        test_folder.mkdir(parents=True)
+
+        create_image_folder(
+            test_folder,
+            name="Images",
+            file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm",
+            num_examples=num_examples,
+        )
+
+        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)
+
+        # The annotations of this split are archived separately from the images.
+        _make_ann_file(
+            path=root / "GT-final_test.csv",
+            num_examples=num_examples,
+            class_idx="random",
+        )
+
+        make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv")
+
+    return num_examples
+
+
 @DATASET_MOCKS.set_from_named_callable
 def clevr(info, root, config):
     data_folder = root / "CLEVR_v1.0"
 
@@ -881,7 +881,7 @@ def _make_archive(root, name, *files_or_dirs, opener, adder, remove=True):
     files, dirs = _split_files_or_dirs(root, *files_or_dirs)
 
     with opener(archive) as fh:
-        for file in files:
+        for file in sorted(files):
             adder(fh, file, file.relative_to(root))
 
     if remove:
 
@@ -1,4 +1,5 @@
 import io
+from pathlib import Path
 
 import pytest
 import torch
@@ -123,7 +124,7 @@ def scan(graph):
             if type(dp) is annotation_dp_type:
                 break
         else:
-            raise AssertionError(f"The dataset doesn't comprise a {annotation_dp_type.__name__}() datapipe.")
+            raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.")
 
 
 @parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
@@ -143,3 +144,19 @@ def test_extra_label(self, dataset_mock, config):
             ("unused", bool),
         ):
             assert key in sample and isinstance(sample[key], type)
+
+
+@parametrize_dataset_mocks(DATASET_MOCKS["gtsrb"])
+class TestGTSRB:
+    def test_label_matches_path(self, dataset_mock, config):
+        """Check that each train sample's label equals the integer name of the folder containing its image."""
+        # The labels are taken from the csv files. Only for the train split is the label additionally encoded in
+        # the image path, so there is nothing to cross-check for any other split.
+        is_train_split = config.split == "train"
+        if not is_train_split:
+            return
+
+        with dataset_mock.prepare(config):
+            dataset = datasets.load(dataset_mock.name, **config)
+
+        for sample in dataset:
+            image_path = Path(sample["image_path"])
+            expected_label = int(image_path.parent.name)
+            assert sample["label"] == expected_label