From 16b46c801b5f7bb90f5b189b3f0deff3f1db7b1c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Apr 2023 10:07:04 +0200 Subject: [PATCH 1/4] benchmark ssdlite detection pipeline --- datasets.py | 95 +++++++++- main.py | 10 +- results/20230331121654.log | 257 -------------------------- results/20230403073901.log | 363 +++++++++++++++++++++++++++++++++++++ tasks.py | 10 +- transforms.py | 301 +++++++++++++++++++++++++++++- 6 files changed, 769 insertions(+), 267 deletions(-) delete mode 100644 results/20230331121654.log create mode 100644 results/20230403073901.log diff --git a/datasets.py b/datasets.py index 00bd36f..b4e3a7c 100644 --- a/datasets.py +++ b/datasets.py @@ -1,13 +1,102 @@ -import torch +import pathlib + +from torch.hub import tqdm +from torchvision import datasets from torchvision.transforms import functional as F_v1 +COCO_ROOT = "~/datasets/coco" + +__all__ = ["classification_dataset_builder", "detection_dataset_builder"] -def classification_dataset_builder(*, input_type, api_version, rng, num_samples): + +def classification_dataset_builder(*, api_version, rng, num_samples): return [ F_v1.to_pil_image( # average size of images in ImageNet - torch.randint(0, 256, (3, 469, 387), dtype=torch.uint8, generator=rng) + torch.randint(0, 256, (3, 469, 387), dtype=torch.uint8, generator=rng), ) for _ in range(num_samples) ] + + +def detection_dataset_builder(*, api_version, rng, num_samples): + root = pathlib.Path(COCO_ROOT).expanduser().resolve() + image_folder = str(root / "train2017") + annotation_file = str(root / "annotations" / "instances_train2017.json") + if api_version == "v1": + dataset = CocoDetectionV1(image_folder, annotation_file, transforms=None) + elif api_version == "v2": + dataset = datasets.CocoDetection(image_folder, annotation_file) + else: + raise ValueError(f"Got {api_version=}") + + dataset = _coco_remove_images_without_annotations(dataset) + + idcs = torch.randperm(len(dataset), generator=rng)[:num_samples] + print(f"Caching {num_samples} COCO samples") + return [dataset[idx] for idx in tqdm(idcs.tolist())] + + +# everything below is copy-pasted from +# https://github.com/pytorch/vision/blob/main/references/detection/coco_utils.py + +import torch +import torchvision + + +class CocoDetectionV1(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms): + super().__init__(img_folder, ann_file) + self._transforms = transforms + + def __getitem__(self, idx): + img, target = super().__getitem__(idx) + image_id = self.ids[idx] + target = dict(image_id=image_id, annotations=target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def _coco_remove_images_without_annotations(dataset, cat_list=None): + def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + min_keypoints_per_image = 10 + + def _has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different criteria for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= 
min_keypoints_per_image: + return True + return False + + if not isinstance(dataset, torchvision.datasets.CocoDetection): + raise TypeError( + f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" + ) + ids = [] + for ds_idx, img_id in enumerate(dataset.ids): + ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = dataset.coco.loadAnns(ann_ids) + if cat_list: + anno = [obj for obj in anno if obj["category_id"] in cat_list] + if _has_valid_annotation(anno): + ids.append(ds_idx) + + dataset = torch.utils.data.Subset(dataset, ids) + return dataset diff --git a/main.py b/main.py index 054c6c9..a9e8b92 100644 --- a/main.py +++ b/main.py @@ -23,6 +23,10 @@ def write(self, message): self.stdout.write(message) self.file.write(message) + def flush(self): + self.stdout.flush() + self.file.flush() + def main(*, input_types, tasks, num_samples): # This is hardcoded when using a DataLoader with multiple workers: @@ -111,7 +115,11 @@ def main(*, input_types, tasks, num_samples): with contextlib.redirect_stdout(tee): main( - tasks=["classification-simple", "classification-complex"], + tasks=[ + "classification-simple", + "classification-complex", + "detection-ssdlite", + ], input_types=["Tensor", "PIL", "Datapoint"], num_samples=10_000, ) diff --git a/results/20230331121654.log b/results/20230331121654.log deleted file mode 100644 index 83f9c9e..0000000 --- a/results/20230331121654.log +++ /dev/null @@ -1,257 +0,0 @@ -############################################################ -classification-simple -############################################################ -input_type='Tensor', api_version='v1' - -Results computed for 10_000 samples - - median std -PILToTensor 110 µs +- 11 µs -RandomResizedCropWithoutResizeV1 54 µs +- 7 µs -Resize 645 µs +- 170 µs -RandomHorizontalFlip 21 µs +- 11 µs -ConvertImageDtype 48 µs +- 10 µs -Normalize 75 µs +- 10 µs - -total 953 µs ------------------------------------------------------------- -input_type='Tensor', api_version='v2' - -Results computed for 10_000 samples - - median std -PILToTensor 119 µs +- 9 µs -RandomResizedCropWithoutResizeV2 54 µs +- 13 µs -Resize 653 µs +- 214 µs -RandomHorizontalFlip 32 µs +- 13 µs -ConvertDtype 42 µs +- 4 µs -Normalize 62 µs +- 6 µs - -total 962 µs ------------------------------------------------------------- -input_type='PIL', api_version='v1' - -Results computed for 10_000 samples - - median std -RandomResizedCropWithoutResizeV1 73 µs +- 19 µs -Resize 564 µs +- 155 µs -RandomHorizontalFlip 25 µs +- 21 µs -PILToTensor 51 µs +- 5 µs -ConvertImageDtype 50 µs +- 5 µs -Normalize 438 µs +- 40 µs - -total 1202 µs ------------------------------------------------------------- -input_type='PIL', api_version='v2' - -Results computed for 10_000 samples - - median std -RandomResizedCropWithoutResizeV2 77 µs +- 13 µs -Resize 575 µs +- 159 µs -RandomHorizontalFlip 31 µs +- 24 µs -PILToTensor 60 µs +- 6 µs -ConvertDtype 44 µs +- 4 µs -Normalize 424 µs +- 43 µs - -total 1212 µs ------------------------------------------------------------- -input_type='Datapoint', api_version='v2' - -Results computed for 10_000 samples - - median std -ToImageTensor 122 µs +- 10 µs -RandomResizedCropWithoutResizeV2 59 µs +- 7 µs -Resize 647 µs +- 163 µs -RandomHorizontalFlip 38 µs +- 13 µs -ConvertDtype 46 µs +- 4 µs -Normalize 65 µs +- 6 µs - -total 978 µs ------------------------------------------------------------- - -Summaries - - v2 / v1 -Tensor 1.01 -PIL 1.01 - - x / PIL, v1 -Tensor, v1 0.79 
-Tensor, v2 0.80 -PIL, v1 1.00 -PIL, v2 1.01 -Datapoint, v2 0.81 -############################################################ -classification-complex -############################################################ -input_type='Tensor', api_version='v1' - -Results computed for 10_000 samples - - median std -PILToTensor 113 µs +- 9 µs -RandomResizedCropWithoutResizeV1 54 µs +- 6 µs -Resize 633 µs +- 165 µs -RandomHorizontalFlip 26 µs +- 8 µs -AutoAugment 782 µs +- 587 µs -RandomErasing 15 µs +- 35 µs -ConvertImageDtype 48 µs +- 5 µs -Normalize 75 µs +- 6 µs - -total 1745 µs ------------------------------------------------------------- -input_type='Tensor', api_version='v2' - -Results computed for 10_000 samples - - median std -PILToTensor 118 µs +- 9 µs -RandomResizedCropWithoutResizeV2 55 µs +- 7 µs -Resize 634 µs +- 158 µs -RandomHorizontalFlip 34 µs +- 11 µs -AutoAugment 624 µs +- 484 µs -RandomErasing 19 µs +- 37 µs -ConvertDtype 42 µs +- 3 µs -Normalize 62 µs +- 6 µs - -total 1588 µs ------------------------------------------------------------- -input_type='PIL', api_version='v1' - -Results computed for 10_000 samples - - median std -RandomResizedCropWithoutResizeV1 78 µs +- 15 µs -Resize 577 µs +- 160 µs -RandomHorizontalFlip 28 µs +- 22 µs -AutoAugment 334 µs +- 230 µs -PILToTensor 56 µs +- 7 µs -RandomErasing 15 µs +- 35 µs -ConvertImageDtype 50 µs +- 8 µs -Normalize 444 µs +- 46 µs - -total 1582 µs ------------------------------------------------------------- -input_type='PIL', api_version='v2' - -Results computed for 10_000 samples - - median std -RandomResizedCropWithoutResizeV2 77 µs +- 13 µs -Resize 569 µs +- 154 µs -RandomHorizontalFlip 24 µs +- 24 µs -AutoAugment 278 µs +- 232 µs -PILToTensor 62 µs +- 6 µs -RandomErasing 17 µs +- 36 µs -ConvertDtype 43 µs +- 6 µs -Normalize 418 µs +- 38 µs - -total 1487 µs ------------------------------------------------------------- -input_type='Datapoint', api_version='v2' - -Results computed for 10_000 samples - - median std -ToImageTensor 124 µs +- 9 µs -RandomResizedCropWithoutResizeV2 60 µs +- 7 µs -Resize 633 µs +- 160 µs -RandomHorizontalFlip 39 µs +- 13 µs -AutoAugment 622 µs +- 414 µs -RandomErasing 19 µs +- 40 µs -ConvertDtype 47 µs +- 4 µs -Normalize 65 µs +- 7 µs - -total 1609 µs ------------------------------------------------------------- - -Summaries - - v2 / v1 -Tensor 0.91 -PIL 0.94 - - x / PIL, v1 -Tensor, v1 1.10 -Tensor, v2 1.00 -PIL, v1 1.00 -PIL, v2 0.94 -Datapoint, v2 1.02 -############################################################ -Collecting environment information... 
-PyTorch version: 2.1.0.dev20230326+cpu -Is debug build: False -CUDA used to build PyTorch: Could not collect -ROCM used to build PyTorch: N/A - -OS: Arch Linux (x86_64) -GCC version: (GCC) 12.2.1 20230201 -Clang version: 15.0.7 -CMake version: version 3.25.3 -Libc version: glibc-2.37 - -Python version: 3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0] (64-bit runtime) -Python platform: Linux-6.2.6-arch1-1-x86_64-with-glibc2.17 -Is CUDA available: False -CUDA runtime version: 11.7.99 -CUDA_MODULE_LOADING set to: N/A -GPU models and configuration: GPU 0: NVIDIA GeForce GTX 1080 -Nvidia driver version: 525.89.02 -cuDNN version: Could not collect -HIP runtime version: N/A -MIOpen runtime version: N/A -Is XNNPACK available: True - -CPU: -Architecture: x86_64 -CPU op-mode(s): 32-bit, 64-bit -Address sizes: 48 bits physical, 48 bits virtual -Byte Order: Little Endian -CPU(s): 24 -On-line CPU(s) list: 0-23 -Vendor ID: AuthenticAMD -Model name: AMD Ryzen 9 5900X 12-Core Processor -CPU family: 25 -Model: 33 -Thread(s) per core: 2 -Core(s) per socket: 12 -Socket(s): 1 -Stepping: 0 -Frequency boost: enabled -CPU(s) scaling MHz: 50% -CPU max MHz: 4950,1948 -CPU min MHz: 2200,0000 -BogoMIPS: 7389,95 -Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm -Virtualization: AMD-V -L1d cache: 384 KiB (12 instances) -L1i cache: 384 KiB (12 instances) -L2 cache: 6 MiB (12 instances) -L3 cache: 64 MiB (2 instances) -NUMA node(s): 1 -NUMA node0 CPU(s): 0-23 -Vulnerability Itlb multihit: Not affected -Vulnerability L1tf: Not affected -Vulnerability Mds: Not affected -Vulnerability Meltdown: Not affected -Vulnerability Mmio stale data: Not affected -Vulnerability Retbleed: Not affected -Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl -Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization -Vulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected -Vulnerability Srbds: Not affected -Vulnerability Tsx async abort: Not affected - -Versions of relevant libraries: -[pip3] light-the-torch==0.7.2 -[pip3] mypy-extensions==1.0.0 -[pip3] numpy==1.24.1 -[pip3] torch==2.1.0.dev20230326+cpu -[pip3] torchvision==0.16.0.dev20230326+cpu -[conda] light-the-torch 0.7.2 pypi_0 pypi -[conda] numpy 1.24.1 pypi_0 pypi -[conda] torch 2.1.0.dev20230326+cpu pypi_0 pypi -[conda] torchvision 0.16.0.dev20230326+cpu pypi_0 pypi diff --git a/results/20230403073901.log b/results/20230403073901.log new file mode 100644 index 0000000..354de98 --- /dev/null +++ b/results/20230403073901.log @@ -0,0 +1,363 
@@ +############################################################ +classification-simple +############################################################ +input_type='Tensor', api_version='v1' + +Results computed for 10_000 samples + + median std +PILToTensor 106 µs +- 8 µs +RandomResizedCropWithoutResizeV1 50 µs +- 7 µs +Resize 619 µs +- 157 µs +RandomHorizontalFlip 24 µs +- 9 µs +ConvertImageDtype 46 µs +- 7 µs +Normalize 73 µs +- 8 µs + +total 918 µs +------------------------------------------------------------ +input_type='Tensor', api_version='v2' + +Results computed for 10_000 samples + + median std +PILToTensor 118 µs +- 10 µs +RandomResizedCropWithoutResizeV2 54 µs +- 7 µs +Resize 647 µs +- 168 µs +RandomHorizontalFlip 34 µs +- 11 µs +ConvertDtype 43 µs +- 4 µs +Normalize 63 µs +- 7 µs + +total 959 µs +------------------------------------------------------------ +input_type='PIL', api_version='v1' + +Results computed for 10_000 samples + + median std +RandomResizedCropWithoutResizeV1 76 µs +- 12 µs +Resize 583 µs +- 160 µs +RandomHorizontalFlip 52 µs +- 23 µs +PILToTensor 53 µs +- 5 µs +ConvertImageDtype 52 µs +- 6 µs +Normalize 451 µs +- 44 µs + +total 1265 µs +------------------------------------------------------------ +input_type='PIL', api_version='v2' + +Results computed for 10_000 samples + + median std +RandomResizedCropWithoutResizeV2 77 µs +- 11 µs +Resize 578 µs +- 159 µs +RandomHorizontalFlip 59 µs +- 24 µs +PILToTensor 60 µs +- 4 µs +ConvertDtype 45 µs +- 3 µs +Normalize 430 µs +- 36 µs + +total 1248 µs +------------------------------------------------------------ +input_type='Datapoint', api_version='v2' + +Results computed for 10_000 samples + + median std +ToImageTensor 121 µs +- 8 µs +RandomResizedCropWithoutResizeV2 60 µs +- 7 µs +Resize 651 µs +- 163 µs +RandomHorizontalFlip 38 µs +- 13 µs +ConvertDtype 47 µs +- 4 µs +Normalize 66 µs +- 7 µs + +total 983 µs +------------------------------------------------------------ + +Summaries + + v2 / v1 +Tensor 1.04 +PIL 0.99 + + x / PIL, v1 +Tensor, v1 0.73 +Tensor, v2 0.76 +PIL, v1 1.00 +PIL, v2 0.99 +Datapoint, v2 0.78 +############################################################ +classification-complex +############################################################ +input_type='Tensor', api_version='v1' + +Results computed for 10_000 samples + + median std +PILToTensor 114 µs +- 8 µs +RandomResizedCropWithoutResizeV1 55 µs +- 6 µs +Resize 649 µs +- 165 µs +RandomHorizontalFlip 27 µs +- 9 µs +AutoAugment 803 µs +- 565 µs +RandomErasing 15 µs +- 36 µs +ConvertImageDtype 50 µs +- 4 µs +Normalize 79 µs +- 5 µs + +total 1793 µs +------------------------------------------------------------ +input_type='Tensor', api_version='v2' + +Results computed for 10_000 samples + + median std +PILToTensor 120 µs +- 10 µs +RandomResizedCropWithoutResizeV2 56 µs +- 7 µs +Resize 655 µs +- 164 µs +RandomHorizontalFlip 28 µs +- 11 µs +AutoAugment 637 µs +- 467 µs +RandomErasing 19 µs +- 39 µs +ConvertDtype 44 µs +- 4 µs +Normalize 64 µs +- 6 µs + +total 1624 µs +------------------------------------------------------------ +input_type='PIL', api_version='v1' + +Results computed for 10_000 samples + + median std +RandomResizedCropWithoutResizeV1 80 µs +- 16 µs +Resize 568 µs +- 157 µs +RandomHorizontalFlip 24 µs +- 22 µs +AutoAugment 332 µs +- 228 µs +PILToTensor 55 µs +- 7 µs +RandomErasing 15 µs +- 34 µs +ConvertImageDtype 52 µs +- 13 µs +Normalize 441 µs +- 39 µs + +total 1566 µs +------------------------------------------------------------ 
+input_type='PIL', api_version='v2' + +Results computed for 10_000 samples + + median std +RandomResizedCropWithoutResizeV2 80 µs +- 12 µs +Resize 588 µs +- 159 µs +RandomHorizontalFlip 60 µs +- 25 µs +AutoAugment 288 µs +- 238 µs +PILToTensor 65 µs +- 6 µs +RandomErasing 19 µs +- 38 µs +ConvertDtype 46 µs +- 4 µs +Normalize 433 µs +- 38 µs + +total 1579 µs +------------------------------------------------------------ +input_type='Datapoint', api_version='v2' + +Results computed for 10_000 samples + + median std +ToImageTensor 126 µs +- 14 µs +RandomResizedCropWithoutResizeV2 63 µs +- 8 µs +Resize 641 µs +- 163 µs +RandomHorizontalFlip 40 µs +- 14 µs +AutoAugment 626 µs +- 414 µs +RandomErasing 20 µs +- 41 µs +ConvertDtype 48 µs +- 4 µs +Normalize 68 µs +- 10 µs + +total 1633 µs +------------------------------------------------------------ + +Summaries + + v2 / v1 +Tensor 0.91 +PIL 1.01 + + x / PIL, v1 +Tensor, v1 1.14 +Tensor, v2 1.04 +PIL, v1 1.00 +PIL, v2 1.01 +Datapoint, v2 1.04 +############################################################ +detection-ssdlite +############################################################ +loading annotations into memory... +Done (t=9.03s) +creating index... +index created! +Caching 10000 COCO samples +input_type='Tensor', api_version='v1' + +Results computed for 10_000 samples + + median std +DetectionReferenceConvertCocoPolysToMaskV1 2876 µs +- 4445 µs +DetectionReferencePILToTensorV1 269 µs +- 71 µs +DetectionReferenceRandomIoUCropV1 453 µs +- 7086 µs +DetectionReferenceRandomHorizontalFlipV1 29 µs +- 243 µs +DetectionReferenceConvertImageDtypeV1 293 µs +- 184 µs + +total 3921 µs +------------------------------------------------------------ +loading annotations into memory... +Done (t=12.46s) +creating index... +index created! +Caching 10000 COCO samples +input_type='Tensor', api_version='v2' + +Results computed for 10_000 samples + + median std +WrapCocoDetectionReferenceSampleForTransformsV2 1610 µs +- 2546 µs +PILToTensor 757 µs +- 18357 µs +RandomIoUCrop 1829 µs +- 20137 µs +RandomHorizontalFlip 554 µs +- 19808 µs +ConvertDtype 771 µs +- 18949 µs +SanitizeBoundingBox 1001 µs +- 16996 µs + +total 6521 µs +------------------------------------------------------------ +loading annotations into memory... +Done (t=14.08s) +creating index... +index created! +Caching 10000 COCO samples +input_type='PIL', api_version='v1' + +Results computed for 10_000 samples + + median std +DetectionReferenceConvertCocoPolysToMaskV1 3006 µs +- 4571 µs +DetectionReferenceRandomIoUCropV1 604 µs +- 7082 µs +DetectionReferenceRandomHorizontalFlipV1 132 µs +- 255 µs +DetectionReferencePILToTensorV1 193 µs +- 133 µs +DetectionReferenceConvertImageDtypeV1 332 µs +- 170 µs + +total 4268 µs +------------------------------------------------------------ +loading annotations into memory... +Done (t=10.88s) +creating index... +index created! +Caching 10000 COCO samples +input_type='PIL', api_version='v2' + +Results computed for 10_000 samples + + median std +WrapCocoDetectionReferenceSampleForTransformsV2 1677 µs +- 2587 µs +RandomIoUCrop 1807 µs +- 21534 µs +RandomHorizontalFlip 579 µs +- 21147 µs +PILToTensor 692 µs +- 16187 µs +ConvertDtype 792 µs +- 19596 µs +SanitizeBoundingBox 1016 µs +- 18356 µs + +total 6562 µs +------------------------------------------------------------ +loading annotations into memory... +Done (t=13.01s) +creating index... +index created! 
+Caching 10000 COCO samples +input_type='Datapoint', api_version='v2' + +Results computed for 10_000 samples + + median std +WrapCocoDetectionReferenceSampleForTransformsV2 1699 µs +- 2608 µs +ToImageTensor 883 µs +- 19872 µs +RandomIoUCrop 1614 µs +- 22483 µs +RandomHorizontalFlip 548 µs +- 16108 µs +ConvertDtype 748 µs +- 20230 µs +SanitizeBoundingBox 1009 µs +- 20313 µs + +total 6500 µs +------------------------------------------------------------ + +Summaries + + v2 / v1 +Tensor 1.66 +PIL 1.54 + + x / PIL, v1 +Tensor, v1 0.92 +Tensor, v2 1.53 +PIL, v1 1.00 +PIL, v2 1.54 +Datapoint, v2 1.52 +############################################################ +Collecting environment information... +PyTorch version: 2.1.0.dev20230326+cpu +Is debug build: False +CUDA used to build PyTorch: Could not collect +ROCM used to build PyTorch: N/A + +OS: Arch Linux (x86_64) +GCC version: (GCC) 12.2.1 20230201 +Clang version: 15.0.7 +CMake version: version 3.25.3 +Libc version: glibc-2.37 + +Python version: 3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0] (64-bit runtime) +Python platform: Linux-6.2.6-arch1-1-x86_64-with-glibc2.17 +Is CUDA available: False +CUDA runtime version: 11.7.99 +CUDA_MODULE_LOADING set to: N/A +GPU models and configuration: GPU 0: NVIDIA GeForce GTX 1080 +Nvidia driver version: 525.89.02 +cuDNN version: Could not collect +HIP runtime version: N/A +MIOpen runtime version: N/A +Is XNNPACK available: True + +CPU: +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Address sizes: 48 bits physical, 48 bits virtual +Byte Order: Little Endian +CPU(s): 24 +On-line CPU(s) list: 0-23 +Vendor ID: AuthenticAMD +Model name: AMD Ryzen 9 5900X 12-Core Processor +CPU family: 25 +Model: 33 +Thread(s) per core: 2 +Core(s) per socket: 12 +Socket(s): 1 +Stepping: 0 +Frequency boost: enabled +CPU(s) scaling MHz: 52% +CPU max MHz: 4950,1948 +CPU min MHz: 2200,0000 +BogoMIPS: 7389,03 +Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm +Virtualization: AMD-V +L1d cache: 384 KiB (12 instances) +L1i cache: 384 KiB (12 instances) +L2 cache: 6 MiB (12 instances) +L3 cache: 64 MiB (2 instances) +NUMA node(s): 1 +NUMA node0 CPU(s): 0-23 +Vulnerability Itlb multihit: Not affected +Vulnerability L1tf: Not affected +Vulnerability Mds: Not affected +Vulnerability Meltdown: Not affected +Vulnerability Mmio stale data: Not affected +Vulnerability Retbleed: Not affected +Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl +Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +Vulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, 
IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected +Vulnerability Srbds: Not affected +Vulnerability Tsx async abort: Not affected + +Versions of relevant libraries: +[pip3] light-the-torch==0.7.2 +[pip3] mypy-extensions==1.0.0 +[pip3] numpy==1.24.1 +[pip3] torch==2.1.0.dev20230326+cpu +[pip3] torchvision==0.16.0.dev20230326+cpu +[conda] Could not collect diff --git a/tasks.py b/tasks.py index 14d46c5..fe0c62e 100644 --- a/tasks.py +++ b/tasks.py @@ -1,7 +1,8 @@ -from datasets import classification_dataset_builder +from datasets import classification_dataset_builder, detection_dataset_builder from transforms import ( - classification_simple_pipeline_builder, classification_complex_pipeline_builder, + classification_simple_pipeline_builder, + detection_ssdlite_pipeline_builder, ) TASKS = { @@ -13,6 +14,10 @@ classification_complex_pipeline_builder, classification_dataset_builder, ), + "detection-ssdlite": ( + detection_ssdlite_pipeline_builder, + detection_dataset_builder, + ), } @@ -24,7 +29,6 @@ def make_task(name, *, input_type, api_version, dataset_rng, num_samples): return None dataset = dataset_builder( - input_type=input_type, api_version=api_version, rng=dataset_rng, num_samples=num_samples, diff --git a/transforms.py b/transforms.py index 0efa863..7828557 100644 --- a/transforms.py +++ b/transforms.py @@ -1,12 +1,19 @@ +import functools from time import perf_counter_ns - -import torch +from types import SimpleNamespace import torchvision.transforms.v2 as transforms_v2 -from torchvision import transforms as transforms_v1 +from torchvision import datasets, transforms as transforms_v1 +from torchvision.datapoints._dataset_wrapper import WRAPPER_FACTORIES from torchvision.transforms import functional as F_v1 from torchvision.transforms.v2 import functional as F_v2 +__all__ = [ + "classification_simple_pipeline_builder", + "classification_complex_pipeline_builder", + "detection_ssdlite_pipeline_builder", +] + class Pipeline: def __init__(self, transforms): @@ -126,6 +133,59 @@ def classification_complex_pipeline_builder(*, input_type, api_version): return Pipeline(pipeline) +def detection_ssdlite_pipeline_builder(*, input_type, api_version): + if input_type == "Datapoint" and api_version == "v1": + return None + + pipeline = [] + if api_version == "v1": + pipeline.append(DetectionReferenceConvertCocoPolysToMaskV1()) + + if input_type == "Tensor": + pipeline.append(DetectionReferencePILToTensorV1()) + + pipeline.extend( + [ + DetectionReferenceRandomIoUCropV1(), + DetectionReferenceRandomHorizontalFlipV1(p=0.5), + ] + ) + + if input_type == "PIL": + pipeline.append(DetectionReferencePILToTensorV1()) + + pipeline.append(DetectionReferenceConvertImageDtypeV1(torch.float)) + + elif api_version == "v2": + pipeline.append(WrapCocoDetectionReferenceSampleForTransformsV2()) + + if input_type == "Tensor": + pipeline.append(transforms_v2.PILToTensor()) + elif input_type == "Datapoint": + pipeline.append(transforms_v2.ToImageTensor()) + + pipeline.extend( + [ + transforms_v2.RandomIoUCrop(), + transforms_v2.RandomHorizontalFlip(p=0.5), + ] + ) + + if input_type == "PIL": + pipeline.append(transforms_v2.PILToTensor()) + + pipeline.extend( + [ + transforms_v2.ConvertDtype(torch.float), + transforms_v2.SanitizeBoundingBox(), + ] + ) + else: + raise ValueError(f"Got {api_version=}") + + return Pipeline(pipeline) + + class RandomResizedCropWithoutResizeV1(transforms_v1.RandomResizedCrop): def forward(self, img): i, j, h, w = self.get_params(img, self.scale, self.ratio) @@ -135,3 +195,238 
@@ def forward(self, img):
 class RandomResizedCropWithoutResizeV2(transforms_v2.RandomResizedCrop):
     def _transform(self, inpt, params):
         return F_v2.crop(inpt, **params)
+
+
+class WrapCocoDetectionReferenceSampleForTransformsV2:
+    def __init__(self):
+        num_samples = 117_266
+        wrapper_factory = WRAPPER_FACTORIES[datasets.CocoDetection]
+        mock_dataset = SimpleNamespace(ids=list(range(num_samples)))
+        wrapper = wrapper_factory(mock_dataset)
+        self.wrapper = functools.partial(wrapper, num_samples // 2)
+
+    def __call__(self, *inputs):
+        return self.wrapper(inputs if len(inputs) > 1 else inputs[0])
+
+
+# everything below is copy-pasted from
+# https://github.com/pytorch/vision/blob/main/references/detection/coco_utils.py
+
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torchvision
+from pycocotools import mask as coco_mask
+from torch import nn, Tensor
+
+
+def convert_coco_poly_to_mask(segmentations, height, width):
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            mask = mask[..., None]
+        mask = torch.as_tensor(mask, dtype=torch.uint8)
+        mask = mask.any(dim=2)
+        masks.append(mask)
+    if masks:
+        masks = torch.stack(masks, dim=0)
+    else:
+        masks = torch.zeros((0, height, width), dtype=torch.uint8)
+    return masks
+
+
+class DetectionReferenceConvertCocoPolysToMaskV1:
+    def __call__(self, image, target):
+        w, h = image.size
+
+        image_id = target["image_id"]
+        image_id = torch.tensor([image_id])
+
+        anno = target["annotations"]
+
+        anno = [obj for obj in anno if obj["iscrowd"] == 0]
+
+        boxes = [obj["bbox"] for obj in anno]
+        # guard against no boxes via resizing
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2]
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+
+        classes = [obj["category_id"] for obj in anno]
+        classes = torch.tensor(classes, dtype=torch.int64)
+
+        segmentations = [obj["segmentation"] for obj in anno]
+        masks = convert_coco_poly_to_mask(segmentations, h, w)
+
+        keypoints = None
+        if anno and "keypoints" in anno[0]:
+            keypoints = [obj["keypoints"] for obj in anno]
+            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
+            num_keypoints = keypoints.shape[0]
+            if num_keypoints:
+                keypoints = keypoints.view(num_keypoints, -1, 3)
+
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        boxes = boxes[keep]
+        classes = classes[keep]
+        masks = masks[keep]
+        if keypoints is not None:
+            keypoints = keypoints[keep]
+
+        target = {}
+        target["boxes"] = boxes
+        target["labels"] = classes
+        target["masks"] = masks
+        target["image_id"] = image_id
+        if keypoints is not None:
+            target["keypoints"] = keypoints
+
+        # for conversion to coco api
+        area = torch.tensor([obj["area"] for obj in anno])
+        iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
+        target["area"] = area
+        target["iscrowd"] = iscrowd
+
+        return image, target
+
+
+class DetectionReferenceRandomHorizontalFlipV1(transforms_v1.RandomHorizontalFlip):
+    def forward(
+        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+        if torch.rand(1) < self.p:
+            image = F_v1.hflip(image)
+            if target is not None:
+                _, _, width = F_v1.get_dimensions(image)
+                target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]]
+                if "masks" in target:
+                    target["masks"] = target["masks"].flip(-1)
+                if "keypoints" in target:
+                    keypoints = 
target["keypoints"] + keypoints = _flip_coco_person_keypoints(keypoints, width) + target["keypoints"] = keypoints + return image, target + + +class DetectionReferencePILToTensorV1(nn.Module): + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + image = F_v1.pil_to_tensor(image) + return image, target + + +class DetectionReferenceConvertImageDtypeV1(nn.Module): + def __init__(self, dtype: torch.dtype) -> None: + super().__init__() + self.dtype = dtype + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + image = F_v1.convert_image_dtype(image, self.dtype) + return image, target + + +class DetectionReferenceRandomIoUCropV1(nn.Module): + def __init__( + self, + min_scale: float = 0.3, + max_scale: float = 1.0, + min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2.0, + sampler_options: Optional[List[float]] = None, + trials: int = 40, + ): + super().__init__() + # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 + self.min_scale = min_scale + self.max_scale = max_scale + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + if sampler_options is None: + sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] + self.options = sampler_options + self.trials = trials + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if target is None: + raise ValueError("The targets can't be None for this transform.") + + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError( + f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions." 
+ ) + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + _, orig_h, orig_w = F_v1.get_dimensions(image) + + while True: + # sample an option + idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) + min_jaccard_overlap = self.options[idx] + if ( + min_jaccard_overlap >= 1.0 + ): # a value larger than 1 encodes the leave as-is option + return image, target + + for _ in range(self.trials): + # check the aspect ratio limitations + r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) + new_w = int(orig_w * r[0]) + new_h = int(orig_h * r[1]) + aspect_ratio = new_w / new_h + if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): + continue + + # check for 0 area crops + r = torch.rand(2) + left = int((orig_w - new_w) * r[0]) + top = int((orig_h - new_h) * r[1]) + right = left + new_w + bottom = top + new_h + if left == right or top == bottom: + continue + + # check for any valid boxes with centers within the crop area + cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) + cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) + is_within_crop_area = ( + (left < cx) & (cx < right) & (top < cy) & (cy < bottom) + ) + if not is_within_crop_area.any(): + continue + + # check at least 1 box with jaccard limitations + boxes = target["boxes"][is_within_crop_area] + ious = torchvision.ops.boxes.box_iou( + boxes, + torch.tensor( + [[left, top, right, bottom]], + dtype=boxes.dtype, + device=boxes.device, + ), + ) + if ious.max() < min_jaccard_overlap: + continue + + # keep only valid boxes and perform cropping + target["boxes"] = boxes + target["labels"] = target["labels"][is_within_crop_area] + target["boxes"][:, 0::2] -= left + target["boxes"][:, 1::2] -= top + target["boxes"][:, 0::2].clamp_(min=0, max=new_w) + target["boxes"][:, 1::2].clamp_(min=0, max=new_h) + image = F_v1.crop(image, top, left, new_h, new_w) + + return image, target From c80dc73fbc16502df645f5335d316d22ecb8c937 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Apr 2023 14:37:05 +0200 Subject: [PATCH 2/4] improve summary --- main.py | 18 +- results/20230403073901.log | 363 ------------------------------------- 2 files changed, 12 insertions(+), 369 deletions(-) delete mode 100644 results/20230403073901.log diff --git a/main.py b/main.py index a9e8b92..48fb6d2 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ import contextlib -import itertools import pathlib +import string import sys from datetime import datetime @@ -98,16 +98,22 @@ def main(*, input_types, tasks, num_samples): print() - median_ref = medians["PIL"]["v1"] medians_flat = { f"{input_type}, {api_version}": median for input_type, api_versions in medians.items() for api_version, median in api_versions.items() } field_len = max(len(label) for label in medians_flat) - print(f"{' ' * field_len} x / PIL, v1") - for label, median in medians_flat.items(): - print(f"{label:{field_len}} {median / median_ref:>11.2f}") + + print( + f"{' ' * (field_len + 5)} {' '.join(f' [{id}]' for _, id in zip(range(len(medians_flat)), string.ascii_lowercase))}" + ) + for (label, val), id in zip(medians_flat.items(), string.ascii_lowercase): + print( + f"{label:>{field_len}}, [{id}] {' '.join(f'{val / ref:4.2f}' for ref in medians_flat.values())}" + ) + print() + print("Slowdown as row / col") if __name__ == "__main__": @@ -121,7 +127,7 @@ def main(*, input_types, tasks, num_samples): "detection-ssdlite", ], input_types=["Tensor", "PIL", "Datapoint"], - num_samples=10_000, + num_samples=1_000, ) print("#" * 60) diff 
--git a/results/20230403073901.log b/results/20230403073901.log deleted file mode 100644 index 354de98..0000000 --- a/results/20230403073901.log +++ /dev/null @@ -1,363 +0,0 @@ -############################################################ -classification-simple -############################################################ -input_type='Tensor', api_version='v1' - -Results computed for 10_000 samples - - median std -PILToTensor 106 µs +- 8 µs -RandomResizedCropWithoutResizeV1 50 µs +- 7 µs -Resize 619 µs +- 157 µs -RandomHorizontalFlip 24 µs +- 9 µs -ConvertImageDtype 46 µs +- 7 µs -Normalize 73 µs +- 8 µs - -total 918 µs ------------------------------------------------------------- -input_type='Tensor', api_version='v2' - -Results computed for 10_000 samples - - median std -PILToTensor 118 µs +- 10 µs -RandomResizedCropWithoutResizeV2 54 µs +- 7 µs -Resize 647 µs +- 168 µs -RandomHorizontalFlip 34 µs +- 11 µs -ConvertDtype 43 µs +- 4 µs -Normalize 63 µs +- 7 µs - -total 959 µs ------------------------------------------------------------- -input_type='PIL', api_version='v1' - -Results computed for 10_000 samples - - median std -RandomResizedCropWithoutResizeV1 76 µs +- 12 µs -Resize 583 µs +- 160 µs -RandomHorizontalFlip 52 µs +- 23 µs -PILToTensor 53 µs +- 5 µs -ConvertImageDtype 52 µs +- 6 µs -Normalize 451 µs +- 44 µs - -total 1265 µs ------------------------------------------------------------- -input_type='PIL', api_version='v2' - -Results computed for 10_000 samples - - median std -RandomResizedCropWithoutResizeV2 77 µs +- 11 µs -Resize 578 µs +- 159 µs -RandomHorizontalFlip 59 µs +- 24 µs -PILToTensor 60 µs +- 4 µs -ConvertDtype 45 µs +- 3 µs -Normalize 430 µs +- 36 µs - -total 1248 µs ------------------------------------------------------------- -input_type='Datapoint', api_version='v2' - -Results computed for 10_000 samples - - median std -ToImageTensor 121 µs +- 8 µs -RandomResizedCropWithoutResizeV2 60 µs +- 7 µs -Resize 651 µs +- 163 µs -RandomHorizontalFlip 38 µs +- 13 µs -ConvertDtype 47 µs +- 4 µs -Normalize 66 µs +- 7 µs - -total 983 µs ------------------------------------------------------------- - -Summaries - - v2 / v1 -Tensor 1.04 -PIL 0.99 - - x / PIL, v1 -Tensor, v1 0.73 -Tensor, v2 0.76 -PIL, v1 1.00 -PIL, v2 0.99 -Datapoint, v2 0.78 -############################################################ -classification-complex -############################################################ -input_type='Tensor', api_version='v1' - -Results computed for 10_000 samples - - median std -PILToTensor 114 µs +- 8 µs -RandomResizedCropWithoutResizeV1 55 µs +- 6 µs -Resize 649 µs +- 165 µs -RandomHorizontalFlip 27 µs +- 9 µs -AutoAugment 803 µs +- 565 µs -RandomErasing 15 µs +- 36 µs -ConvertImageDtype 50 µs +- 4 µs -Normalize 79 µs +- 5 µs - -total 1793 µs ------------------------------------------------------------- -input_type='Tensor', api_version='v2' - -Results computed for 10_000 samples - - median std -PILToTensor 120 µs +- 10 µs -RandomResizedCropWithoutResizeV2 56 µs +- 7 µs -Resize 655 µs +- 164 µs -RandomHorizontalFlip 28 µs +- 11 µs -AutoAugment 637 µs +- 467 µs -RandomErasing 19 µs +- 39 µs -ConvertDtype 44 µs +- 4 µs -Normalize 64 µs +- 6 µs - -total 1624 µs ------------------------------------------------------------- -input_type='PIL', api_version='v1' - -Results computed for 10_000 samples - - median std -RandomResizedCropWithoutResizeV1 80 µs +- 16 µs -Resize 568 µs +- 157 µs -RandomHorizontalFlip 24 µs +- 22 µs -AutoAugment 332 µs +- 228 µs -PILToTensor 55 µs +- 7 µs 
-RandomErasing 15 µs +- 34 µs -ConvertImageDtype 52 µs +- 13 µs -Normalize 441 µs +- 39 µs - -total 1566 µs ------------------------------------------------------------- -input_type='PIL', api_version='v2' - -Results computed for 10_000 samples - - median std -RandomResizedCropWithoutResizeV2 80 µs +- 12 µs -Resize 588 µs +- 159 µs -RandomHorizontalFlip 60 µs +- 25 µs -AutoAugment 288 µs +- 238 µs -PILToTensor 65 µs +- 6 µs -RandomErasing 19 µs +- 38 µs -ConvertDtype 46 µs +- 4 µs -Normalize 433 µs +- 38 µs - -total 1579 µs ------------------------------------------------------------- -input_type='Datapoint', api_version='v2' - -Results computed for 10_000 samples - - median std -ToImageTensor 126 µs +- 14 µs -RandomResizedCropWithoutResizeV2 63 µs +- 8 µs -Resize 641 µs +- 163 µs -RandomHorizontalFlip 40 µs +- 14 µs -AutoAugment 626 µs +- 414 µs -RandomErasing 20 µs +- 41 µs -ConvertDtype 48 µs +- 4 µs -Normalize 68 µs +- 10 µs - -total 1633 µs ------------------------------------------------------------- - -Summaries - - v2 / v1 -Tensor 0.91 -PIL 1.01 - - x / PIL, v1 -Tensor, v1 1.14 -Tensor, v2 1.04 -PIL, v1 1.00 -PIL, v2 1.01 -Datapoint, v2 1.04 -############################################################ -detection-ssdlite -############################################################ -loading annotations into memory... -Done (t=9.03s) -creating index... -index created! -Caching 10000 COCO samples -input_type='Tensor', api_version='v1' - -Results computed for 10_000 samples - - median std -DetectionReferenceConvertCocoPolysToMaskV1 2876 µs +- 4445 µs -DetectionReferencePILToTensorV1 269 µs +- 71 µs -DetectionReferenceRandomIoUCropV1 453 µs +- 7086 µs -DetectionReferenceRandomHorizontalFlipV1 29 µs +- 243 µs -DetectionReferenceConvertImageDtypeV1 293 µs +- 184 µs - -total 3921 µs ------------------------------------------------------------- -loading annotations into memory... -Done (t=12.46s) -creating index... -index created! -Caching 10000 COCO samples -input_type='Tensor', api_version='v2' - -Results computed for 10_000 samples - - median std -WrapCocoDetectionReferenceSampleForTransformsV2 1610 µs +- 2546 µs -PILToTensor 757 µs +- 18357 µs -RandomIoUCrop 1829 µs +- 20137 µs -RandomHorizontalFlip 554 µs +- 19808 µs -ConvertDtype 771 µs +- 18949 µs -SanitizeBoundingBox 1001 µs +- 16996 µs - -total 6521 µs ------------------------------------------------------------- -loading annotations into memory... -Done (t=14.08s) -creating index... -index created! -Caching 10000 COCO samples -input_type='PIL', api_version='v1' - -Results computed for 10_000 samples - - median std -DetectionReferenceConvertCocoPolysToMaskV1 3006 µs +- 4571 µs -DetectionReferenceRandomIoUCropV1 604 µs +- 7082 µs -DetectionReferenceRandomHorizontalFlipV1 132 µs +- 255 µs -DetectionReferencePILToTensorV1 193 µs +- 133 µs -DetectionReferenceConvertImageDtypeV1 332 µs +- 170 µs - -total 4268 µs ------------------------------------------------------------- -loading annotations into memory... -Done (t=10.88s) -creating index... -index created! -Caching 10000 COCO samples -input_type='PIL', api_version='v2' - -Results computed for 10_000 samples - - median std -WrapCocoDetectionReferenceSampleForTransformsV2 1677 µs +- 2587 µs -RandomIoUCrop 1807 µs +- 21534 µs -RandomHorizontalFlip 579 µs +- 21147 µs -PILToTensor 692 µs +- 16187 µs -ConvertDtype 792 µs +- 19596 µs -SanitizeBoundingBox 1016 µs +- 18356 µs - -total 6562 µs ------------------------------------------------------------- -loading annotations into memory... 
-Done (t=13.01s) -creating index... -index created! -Caching 10000 COCO samples -input_type='Datapoint', api_version='v2' - -Results computed for 10_000 samples - - median std -WrapCocoDetectionReferenceSampleForTransformsV2 1699 µs +- 2608 µs -ToImageTensor 883 µs +- 19872 µs -RandomIoUCrop 1614 µs +- 22483 µs -RandomHorizontalFlip 548 µs +- 16108 µs -ConvertDtype 748 µs +- 20230 µs -SanitizeBoundingBox 1009 µs +- 20313 µs - -total 6500 µs ------------------------------------------------------------- - -Summaries - - v2 / v1 -Tensor 1.66 -PIL 1.54 - - x / PIL, v1 -Tensor, v1 0.92 -Tensor, v2 1.53 -PIL, v1 1.00 -PIL, v2 1.54 -Datapoint, v2 1.52 -############################################################ -Collecting environment information... -PyTorch version: 2.1.0.dev20230326+cpu -Is debug build: False -CUDA used to build PyTorch: Could not collect -ROCM used to build PyTorch: N/A - -OS: Arch Linux (x86_64) -GCC version: (GCC) 12.2.1 20230201 -Clang version: 15.0.7 -CMake version: version 3.25.3 -Libc version: glibc-2.37 - -Python version: 3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0] (64-bit runtime) -Python platform: Linux-6.2.6-arch1-1-x86_64-with-glibc2.17 -Is CUDA available: False -CUDA runtime version: 11.7.99 -CUDA_MODULE_LOADING set to: N/A -GPU models and configuration: GPU 0: NVIDIA GeForce GTX 1080 -Nvidia driver version: 525.89.02 -cuDNN version: Could not collect -HIP runtime version: N/A -MIOpen runtime version: N/A -Is XNNPACK available: True - -CPU: -Architecture: x86_64 -CPU op-mode(s): 32-bit, 64-bit -Address sizes: 48 bits physical, 48 bits virtual -Byte Order: Little Endian -CPU(s): 24 -On-line CPU(s) list: 0-23 -Vendor ID: AuthenticAMD -Model name: AMD Ryzen 9 5900X 12-Core Processor -CPU family: 25 -Model: 33 -Thread(s) per core: 2 -Core(s) per socket: 12 -Socket(s): 1 -Stepping: 0 -Frequency boost: enabled -CPU(s) scaling MHz: 52% -CPU max MHz: 4950,1948 -CPU min MHz: 2200,0000 -BogoMIPS: 7389,03 -Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm -Virtualization: AMD-V -L1d cache: 384 KiB (12 instances) -L1i cache: 384 KiB (12 instances) -L2 cache: 6 MiB (12 instances) -L3 cache: 64 MiB (2 instances) -NUMA node(s): 1 -NUMA node0 CPU(s): 0-23 -Vulnerability Itlb multihit: Not affected -Vulnerability L1tf: Not affected -Vulnerability Mds: Not affected -Vulnerability Meltdown: Not affected -Vulnerability Mmio stale data: Not affected -Vulnerability Retbleed: Not affected -Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl -Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization -Vulnerability 
Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected -Vulnerability Srbds: Not affected -Vulnerability Tsx async abort: Not affected - -Versions of relevant libraries: -[pip3] light-the-torch==0.7.2 -[pip3] mypy-extensions==1.0.0 -[pip3] numpy==1.24.1 -[pip3] torch==2.1.0.dev20230326+cpu -[pip3] torchvision==0.16.0.dev20230326+cpu -[conda] Could not collect From a859c0931005aeb9a30d0b4496c8b0d422e23c6b Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Apr 2023 14:47:17 +0200 Subject: [PATCH 3/4] drop DetectionReference prefix --- transforms.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/transforms.py b/transforms.py index 7828557..0b1ae97 100644 --- a/transforms.py +++ b/transforms.py @@ -139,25 +139,25 @@ def detection_ssdlite_pipeline_builder(*, input_type, api_version): pipeline = [] if api_version == "v1": - pipeline.append(DetectionReferenceConvertCocoPolysToMaskV1()) + pipeline.append(ConvertCocoPolysToMaskV1()) if input_type == "Tensor": - pipeline.append(DetectionReferencePILToTensorV1()) + pipeline.append(PILToTensorV1()) pipeline.extend( [ - DetectionReferenceRandomIoUCropV1(), - DetectionReferenceRandomHorizontalFlipV1(p=0.5), + RandomIoUCropV1(), + RandomHorizontalFlipV1(p=0.5), ] ) if input_type == "PIL": - pipeline.append(DetectionReferencePILToTensorV1()) + pipeline.append(PILToTensorV1()) - pipeline.append(DetectionReferenceConvertImageDtypeV1(torch.float)) + pipeline.append(ConvertImageDtypeV1(torch.float)) elif api_version == "v2": - pipeline.append(WrapCocoDetectionReferenceSampleForTransformsV2()) + pipeline.append(WrapCocoSampleForTransformsV2()) if input_type == "Tensor": pipeline.append(transforms_v2.PILToTensor()) @@ -197,7 +197,7 @@ def _transform(self, inpt, params): return F_v2.crop(inpt, **params) -class WrapCocoDetectionReferenceSampleForTransformsV2: +class WrapCocoSampleForTransformsV2: def __init__(self): num_samples = 117_266 wrapper_factory = WRAPPER_FACTORIES[datasets.CocoDetection] @@ -237,7 +237,7 @@ def convert_coco_poly_to_mask(segmentations, height, width): return masks -class DetectionReferenceConvertCocoPolysToMaskV1: +class ConvertCocoPolysToMaskV1: def __call__(self, image, target): w, h = image.size @@ -296,7 +296,7 @@ def __call__(self, image, target): return image, target -class DetectionReferenceRandomHorizontalFlipV1(transforms_v1.RandomHorizontalFlip): +class RandomHorizontalFlipV1(transforms_v1.RandomHorizontalFlip): def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: @@ -314,7 +314,7 @@ def forward( return image, target -class DetectionReferencePILToTensorV1(nn.Module): +class PILToTensorV1(nn.Module): def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: @@ -322,7 +322,7 @@ def forward( return image, target -class DetectionReferenceConvertImageDtypeV1(nn.Module): +class ConvertImageDtypeV1(nn.Module): def __init__(self, dtype: torch.dtype) -> None: super().__init__() self.dtype = dtype @@ -334,7 +334,7 @@ def forward( return image, target -class DetectionReferenceRandomIoUCropV1(nn.Module): +class RandomIoUCropV1(nn.Module): def __init__( self, min_scale: float = 0.3, From 05350be4215589059d41c656fccbec0068ebf023 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 4 Apr 2023 11:27:19 +0200 Subject: [PATCH 4/4] cleanup --- datasets.py | 6 +- main.py | 129 +++++++------- 
results/20230404093341.log | 339 +++++++++++++++++++++++++++++++++++++ transforms.py | 15 +- 4 files changed, 417 insertions(+), 72 deletions(-) create mode 100644 results/20230404093341.log diff --git a/datasets.py b/datasets.py index b4e3a7c..704a1e5 100644 --- a/datasets.py +++ b/datasets.py @@ -33,9 +33,9 @@ def detection_dataset_builder(*, api_version, rng, num_samples): dataset = _coco_remove_images_without_annotations(dataset) - idcs = torch.randperm(len(dataset), generator=rng)[:num_samples] - print(f"Caching {num_samples} COCO samples") - return [dataset[idx] for idx in tqdm(idcs.tolist())] + idcs = torch.randperm(len(dataset), generator=rng)[:num_samples].tolist() + print(f"Caching {num_samples} ({idcs[:3]} ... {idcs[-3:]}) COCO samples") + return [dataset[idx] for idx in tqdm(idcs)] # everything below is copy-pasted from diff --git a/main.py b/main.py index 48fb6d2..a883169 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import contextlib +import itertools import pathlib import string import sys @@ -33,87 +34,87 @@ def main(*, input_types, tasks, num_samples): # https://github.com/pytorch/pytorch/blob/19162083f8831be87be01bb84f186310cad1d348/torch/utils/data/_utils/worker.py#L222 torch.set_num_threads(1) + dataset_rng = torch.Generator() + dataset_rng.manual_seed(0) + dataset_rng_state = dataset_rng.get_state() + for task_name in tasks: print("#" * 60) print(task_name) print("#" * 60) medians = {input_type: {} for input_type in input_types} - for input_type in input_types: - dataset_rng = torch.Generator() - dataset_rng.manual_seed(0) - dataset_rng_state = dataset_rng.get_state() - - for api_version in ["v1", "v2"]: - dataset_rng.set_state(dataset_rng_state) - task = make_task( - task_name, - input_type=input_type, - api_version=api_version, - dataset_rng=dataset_rng, - num_samples=num_samples, - ) - if task is None: - continue - - print(f"{input_type=}, {api_version=}") - print() - print(f"Results computed for {num_samples:_} samples") - print() - - pipeline, dataset = task - - for sample in dataset: - pipeline(sample) - - results = pipeline.extract_times() - field_len = max(len(name) for name in results) - print(f"{' ' * field_len} {'median ':>9} {'std ':>9}") - medians[input_type][api_version] = 0.0 - for transform_name, times in results.items(): - median = float(times.median()) - print( - f"{transform_name:{field_len}} {median * 1e6:6.0f} µs +- {float(times.std()) * 1e6:6.0f} µs" - ) - medians[input_type][api_version] += median + for input_type, api_version in itertools.product(input_types, ["v1", "v2"]): + dataset_rng.set_state(dataset_rng_state) + task = make_task( + task_name, + input_type=input_type, + api_version=api_version, + dataset_rng=dataset_rng, + num_samples=num_samples, + ) + if task is None: + continue - print( - f"\n{'total':{field_len}} {medians[input_type][api_version] * 1e6:6.0f} µs" - ) - print("-" * 60) + print(f"{input_type=}, {api_version=}") + print() + print(f"Results computed for {num_samples:_} samples") + print() - print() - print("Summaries") - print() + pipeline, dataset = task - field_len = max(len(input_type) for input_type in medians) - print(f"{' ' * field_len} v2 / v1") - for input_type, api_versions in medians.items(): - if len(api_versions) < 2: - continue + torch.manual_seed(0) + for sample in dataset: + pipeline(sample) + + results = pipeline.extract_times() + field_len = max(len(name) for name in results) + print(f"{' ' * field_len} {'median ':>9} {'std ':>9}") + medians[input_type][api_version] = 0.0 + for transform_name, times in 
results.items():
+            median = float(times.median())
+            print(
+                f"{transform_name:{field_len}} {median * 1e6:6.0f} µs +- {float(times.std()) * 1e6:6.0f} µs"
+            )
+            medians[input_type][api_version] += median
         print(
-            f"{input_type:{field_len}} {api_versions['v2'] / api_versions['v1']:>7.2f}"
+            f"\n{'total':{field_len}} {medians[input_type][api_version] * 1e6:6.0f} µs"
         )
+        print("-" * 60)
 
-    print()
+    print()
+    print("Summaries")
+    print()
 
-    medians_flat = {
-        f"{input_type}, {api_version}": median
-        for input_type, api_versions in medians.items()
-        for api_version, median in api_versions.items()
-    }
-    field_len = max(len(label) for label in medians_flat)
+    field_len = max(len(input_type) for input_type in medians)
+    print(f"{' ' * field_len} v2 / v1")
+    for input_type, api_versions in medians.items():
+        if len(api_versions) < 2:
+            continue
         print(
-            f"{' ' * (field_len + 5)} {' '.join(f' [{id}]' for _, id in zip(range(len(medians_flat)), string.ascii_lowercase))}"
+            f"{input_type:{field_len}} {api_versions['v2'] / api_versions['v1']:>7.2f}"
         )
-    for (label, val), id in zip(medians_flat.items(), string.ascii_lowercase):
-        print(
-            f"{label:>{field_len}}, [{id}] {' '.join(f'{val / ref:4.2f}' for ref in medians_flat.values())}"
-        )
-    print()
-    print("Slowdown as row / col")
+
+    print()
+
+    medians_flat = {
+        f"{input_type}, {api_version}": median
+        for input_type, api_versions in medians.items()
+        for api_version, median in api_versions.items()
+    }
+    field_len = max(len(label) for label in medians_flat)
+
+    print(
+        f"{' ' * (field_len + 5)} {' '.join(f' [{id}]' for _, id in zip(range(len(medians_flat)), string.ascii_lowercase))}"
+    )
+    for (label, val), id in zip(medians_flat.items(), string.ascii_lowercase):
+        print(
+            f"{label:>{field_len}}, [{id}] {' '.join(f'{val / ref:4.2f}' for ref in medians_flat.values())}"
+        )
+    print()
+    print("Slowdown as row / col")
 
 
 if __name__ == "__main__":
diff --git a/results/20230404093341.log b/results/20230404093341.log
new file mode 100644
index 0000000..fbf71c2
--- /dev/null
+++ b/results/20230404093341.log
@@ -0,0 +1,339 @@
+############################################################
+classification-simple
+############################################################
+input_type='Tensor', api_version='v1'
+
+Results computed for 1_000 samples
+
+                                  median    std
+PILToTensor                         110 µs +-     10 µs
+RandomResizedCropWithoutResizeV1     52 µs +-      9 µs
+Resize                              636 µs +-    184 µs
+RandomHorizontalFlip                 25 µs +-     10 µs
+ConvertImageDtype                    47 µs +-     11 µs
+Normalize                            75 µs +-     14 µs
+
+total                               945 µs
+------------------------------------------------------------
+input_type='Tensor', api_version='v2'
+
+Results computed for 1_000 samples
+
+                                  median    std
+PILToTensor                         116 µs +-      8 µs
+RandomResizedCropWithoutResizeV2     55 µs +-      6 µs
+Resize                              618 µs +-    159 µs
+RandomHorizontalFlip                 35 µs +-     10 µs
+ConvertDtype                         42 µs +-      3 µs
+Normalize                            61 µs +-      4 µs
+
+total                               926 µs
+------------------------------------------------------------
+input_type='PIL', api_version='v1'
+
+Results computed for 1_000 samples
+
+                                  median    std
+RandomResizedCropWithoutResizeV1     76 µs +-     11 µs
+Resize                              548 µs +-    152 µs
+RandomHorizontalFlip                 51 µs +-     22 µs
+PILToTensor                          52 µs +-      5 µs
+ConvertImageDtype                    50 µs +-      6 µs
+Normalize                           438 µs +-     36 µs
+
+total                              1214 µs
+------------------------------------------------------------
+input_type='PIL', api_version='v2'
+
+Results computed for 1_000 samples
+
+                                  median    std
+RandomResizedCropWithoutResizeV2     73 µs +-     10 µs
+Resize                              540 µs +-    150 µs
+RandomHorizontalFlip                 58 µs +-     23 µs
+PILToTensor                          57 µs +-      3 µs
+ConvertDtype                         43 µs +-      3 µs
+Normalize                           417 µs +-     33 µs
+
+total                              1189 µs
+------------------------------------------------------------
+input_type='Datapoint', api_version='v2'
+
+Results computed for 1_000 samples
+
+                                  median    std
+ToImageTensor                       122 µs +-      9 µs
+RandomResizedCropWithoutResizeV2     60 µs +-      7 µs
+Resize                              619 µs +-    163 µs
+RandomHorizontalFlip                 37 µs +-     12 µs
+ConvertDtype                         45 µs +-      6 µs
+Normalize                            64 µs +-      5 µs
+
+total                               948 µs
+------------------------------------------------------------
+############################################################
+classification-complex
+############################################################
+input_type='Tensor', api_version='v1'
+
+Results computed for 1_000 samples
+
+                                  median    std
+PILToTensor                         109 µs +-     12 µs
+RandomResizedCropWithoutResizeV1     53 µs +-      7 µs
+Resize                              630 µs +-    166 µs
+RandomHorizontalFlip                 18 µs +-      8 µs
+AutoAugment                         765 µs +-    623 µs
+RandomErasing                        14 µs +-     36 µs
+ConvertImageDtype                    48 µs +-      5 µs
+Normalize                            74 µs +-      6 µs
+
+total                              1711 µs
+------------------------------------------------------------
+input_type='Tensor', api_version='v2'
+
+Results computed for 1_000 samples
+
+                                  median    std
+PILToTensor                         116 µs +-     10 µs
+RandomResizedCropWithoutResizeV2     55 µs +-      7 µs
+Resize                              632 µs +-    166 µs
+RandomHorizontalFlip                 24 µs +-     10 µs
+AutoAugment                         611 µs +-    606 µs
+RandomErasing                        18 µs +-     36 µs
+ConvertDtype                         42 µs +-      3 µs
+Normalize                            62 µs +-      5 µs
+
+total                              1560 µs
+------------------------------------------------------------
+input_type='PIL', api_version='v1'
+
+Results computed for 1_000 samples
+
+                                  median    std
+RandomResizedCropWithoutResizeV1     80 µs +-     15 µs
+Resize                              587 µs +-    156 µs
+RandomHorizontalFlip                 22 µs +-     23 µs
+AutoAugment                         339 µs +-    234 µs
+PILToTensor                          57 µs +-      6 µs
+RandomErasing                        15 µs +-     36 µs
+ConvertImageDtype                    54 µs +-     16 µs
+Normalize                           459 µs +-     39 µs
+
+total                              1613 µs
+------------------------------------------------------------
+input_type='PIL', api_version='v2'
+
+Results computed for 1_000 samples
+
+                                  median    std
+RandomResizedCropWithoutResizeV2     78 µs +-     12 µs
+Resize                              577 µs +-    160 µs
+RandomHorizontalFlip                 27 µs +-     24 µs
+AutoAugment                         307 µs +-    242 µs
+PILToTensor                          64 µs +-      4 µs
+RandomErasing                        18 µs +-     36 µs
+ConvertDtype                         45 µs +-      6 µs
+Normalize                           427 µs +-     34 µs
+
+total                              1543 µs
+------------------------------------------------------------
+input_type='Datapoint', api_version='v2'
+
+Results computed for 1_000 samples
+
+                                  median    std
+ToImageTensor                       125 µs +-      9 µs
+RandomResizedCropWithoutResizeV2     61 µs +-      6 µs
+Resize                              646 µs +-    167 µs
+RandomHorizontalFlip                 22 µs +-     13 µs
+AutoAugment                         630 µs +-    381 µs
+RandomErasing                        18 µs +-     39 µs
+ConvertDtype                         48 µs +-      3 µs
+Normalize                            68 µs +-      6 µs
+
+total                              1617 µs
+------------------------------------------------------------
+############################################################
+detection-ssdlite
+############################################################
+loading annotations into memory...
+Done (t=9.71s)
+creating index...
+index created!
+Caching 1000 ([89444, 73295, 101719] ... [31395, 96727, 47807]) COCO samples
+input_type='Tensor', api_version='v1'
+
+Results computed for 1_000 samples
+
+                          median    std
+ConvertCocoPolysToMaskV1    2799 µs +-   4403 µs
+PILToTensorV1                268 µs +-     77 µs
+RandomIoUCropV1              467 µs +-   7166 µs
+RandomHorizontalFlipV1        18 µs +-    218 µs
+ConvertImageDtypeV1          267 µs +-    178 µs
+
+total                       3820 µs
+------------------------------------------------------------
+loading annotations into memory...
+Done (t=8.87s)
+creating index...
+index created!
+Caching 1000 ([89444, 73295, 101719] ... [31395, 96727, 47807]) COCO samples
+input_type='Tensor', api_version='v2'
+
+Results computed for 1_000 samples
+
+                               median    std
+WrapCocoSampleForTransformsV2    1487 µs +-   2446 µs
+PILToTensor                       748 µs +-   5328 µs
+RandomIoUCrop                    1835 µs +-   7046 µs
+RandomHorizontalFlip              559 µs +-   2322 µs
+ConvertDtype                      760 µs +-   5410 µs
+SanitizeBoundingBox              1004 µs +-   4817 µs
+
+total                            6394 µs
+------------------------------------------------------------
+loading annotations into memory...
+Done (t=9.84s)
+creating index...
+index created!
+Caching 1000 ([89444, 73295, 101719] ... [31395, 96727, 47807]) COCO samples
+input_type='PIL', api_version='v1'
+
+Results computed for 1_000 samples
+
+                          median    std
+ConvertCocoPolysToMaskV1    2816 µs +-   4427 µs
+RandomIoUCropV1              556 µs +-   7177 µs
+RandomHorizontalFlipV1        20 µs +-    212 µs
+PILToTensorV1                180 µs +-    112 µs
+ConvertImageDtypeV1          281 µs +-    168 µs
+
+total                       3851 µs
+------------------------------------------------------------
+loading annotations into memory...
+Done (t=9.73s)
+creating index...
+index created!
+Caching 1000 ([89444, 73295, 101719] ... [31395, 96727, 47807]) COCO samples
+input_type='PIL', api_version='v2'
+
+Results computed for 1_000 samples
+
+                               median    std
+WrapCocoSampleForTransformsV2    1536 µs +-   2480 µs
+RandomIoUCrop                    1809 µs +-   9065 µs
+RandomHorizontalFlip              582 µs +-   4570 µs
+PILToTensor                       653 µs +-   4991 µs
+ConvertDtype                      777 µs +-   5354 µs
+SanitizeBoundingBox              1012 µs +-   6233 µs
+
+total                            6369 µs
+------------------------------------------------------------
+loading annotations into memory...
+Done (t=9.91s)
+creating index...
+index created!
+Caching 1000 ([89444, 73295, 101719] ... [31395, 96727, 47807]) COCO samples
+input_type='Datapoint', api_version='v2'
+
+Results computed for 1_000 samples
+
+                               median    std
+WrapCocoSampleForTransformsV2    1537 µs +-   2505 µs
+ToImageTensor                     833 µs +-   2973 µs
+RandomIoUCrop                    1717 µs +-   8842 µs
+RandomHorizontalFlip              547 µs +-   5286 µs
+ConvertDtype                      725 µs +-   6290 µs
+SanitizeBoundingBox              1021 µs +-   5869 µs
+
+total                            6380 µs
+------------------------------------------------------------
+
+Summaries
+
+          v2 / v1
+Tensor       1.67
+PIL          1.65
+
+                    [a]  [b]  [c]  [d]  [e]
+   Tensor, v1, [a] 1.00 0.60 0.99 0.60 0.60
+   Tensor, v2, [b] 1.67 1.00 1.66 1.00 1.00
+      PIL, v1, [c] 1.01 0.60 1.00 0.60 0.60
+      PIL, v2, [d] 1.67 1.00 1.65 1.00 1.00
+Datapoint, v2, [e] 1.67 1.00 1.66 1.00 1.00
+
+Slowdown as row / col
+############################################################
+Collecting environment information...
+PyTorch version: 2.1.0.dev20230403+cpu
+Is debug build: False
+CUDA used to build PyTorch: Could not collect
+ROCM used to build PyTorch: N/A
+
+OS: Arch Linux (x86_64)
+GCC version: (GCC) 12.2.1 20230201
+Clang version: 15.0.7
+CMake version: version 3.25.3
+Libc version: glibc-2.37
+
+Python version: 3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0] (64-bit runtime)
+Python platform: Linux-6.2.6-arch1-1-x86_64-with-glibc2.17
+Is CUDA available: False
+CUDA runtime version: 11.7.99
+CUDA_MODULE_LOADING set to: N/A
+GPU models and configuration: GPU 0: NVIDIA GeForce GTX 1080
+Nvidia driver version: 525.89.02
+cuDNN version: Could not collect
+HIP runtime version: N/A
+MIOpen runtime version: N/A
+Is XNNPACK available: True
+
+CPU:
+Architecture:                    x86_64
+CPU op-mode(s):                  32-bit, 64-bit
+Address sizes:                   48 bits physical, 48 bits virtual
+Byte Order:                      Little Endian
+CPU(s):                          24
+On-line CPU(s) list:             0-23
+Vendor ID:                       AuthenticAMD
+Model name:                      AMD Ryzen 9 5900X 12-Core Processor
+CPU family:                      25
+Model:                           33
+Thread(s) per core:              2
+Core(s) per socket:              12
+Socket(s):                       1
+Stepping:                        0
+Frequency boost:                 enabled
+CPU(s) scaling MHz:              56%
+CPU max MHz:                     4950,1948
+CPU min MHz:                     2200,0000
+BogoMIPS:                        7388,29
+Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm
+Virtualization:                  AMD-V
+L1d cache:                       384 KiB (12 instances)
+L1i cache:                       384 KiB (12 instances)
+L2 cache:                        6 MiB (12 instances)
+L3 cache:                        64 MiB (2 instances)
+NUMA node(s):                    1
+NUMA node0 CPU(s):               0-23
+Vulnerability Itlb multihit:     Not affected
+Vulnerability L1tf:              Not affected
+Vulnerability Mds:               Not affected
+Vulnerability Meltdown:          Not affected
+Vulnerability Mmio stale data:   Not affected
+Vulnerability Retbleed:          Not affected
+Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl
+Vulnerability Spectre v1:        Mitigation; usercopy/swapgs barriers and __user pointer sanitization
+Vulnerability Spectre v2:        Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected
+Vulnerability Srbds:             Not affected
+Vulnerability Tsx async abort:   Not affected
+
+Versions of relevant libraries:
+[pip3] light-the-torch==0.7.2
+[pip3] mypy-extensions==1.0.0
+[pip3] numpy==1.24.1
+[pip3] torch==2.1.0.dev20230403+cpu
+[pip3] torchvision==0.16.0.dev20230403+cpu
+[conda] Could not collect
diff --git a/transforms.py b/transforms.py
index 0b1ae97..c337d13 100644
--- a/transforms.py
+++ b/transforms.py
@@ -199,14 +199,19 @@ def _transform(self, inpt, params):
 
 
 class WrapCocoSampleForTransformsV2:
     def __init__(self):
-        num_samples = 117_266
         wrapper_factory = WRAPPER_FACTORIES[datasets.CocoDetection]
-        mock_dataset = SimpleNamespace(ids=list(range(num_samples)))
+        # The v2 wrapper depends on the `.ids` attribute of a `CocoDetection`
+        # dataset. However, that attribute is lost above when images without
+        # annotations are filtered out. Thus, we fake it here.
+        mock_dataset = SimpleNamespace(ids=["invalid"])
         wrapper = wrapper_factory(mock_dataset)
-        self.wrapper = functools.partial(wrapper, num_samples // 2)
+        # The wrapper gets passed the index alongside the sample to wrap. The
+        # former is only used to look up the image ID via the `.ids` attribute,
+        # so any index for which `.ids[idx]` resolves will do; we use 0.
+        self.wrapper = functools.partial(wrapper, 0)
 
-    def __call__(self, *inputs):
-        return self.wrapper(inputs if len(inputs) > 1 else inputs[0])
+    def __call__(self, image, target):
+        return self.wrapper((image, target))
 
 
 # everything below is copy-pasted from
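
For comparison with the mock above: outside this benchmark harness, the same
wrapping is normally obtained from a real `CocoDetection` dataset, in which
case `.ids` exists and nothing needs to be faked. A minimal sketch, assuming a
torchvision of this era (>= 0.15) that exports `wrap_dataset_for_transforms_v2`
from `torchvision.datasets`; the COCO paths below are placeholders.

# Minimal sketch, not part of the patch. Assumes torchvision >= 0.15;
# the dataset paths are placeholders.
from torchvision import datasets
from torchvision.datasets import wrap_dataset_for_transforms_v2

coco = datasets.CocoDetection(
    "path/to/train2017",
    "path/to/annotations/instances_train2017.json",
)
# Wrapping a real dataset: `.ids` exists here, which is exactly what the
# mocked SimpleNamespace stands in for in the benchmark.
coco = wrap_dataset_for_transforms_v2(coco)

# Samples come back as (image, target); the target values (e.g. bounding
# boxes) are datapoints that the v2 transforms can dispatch on.
image, target = coco[0]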