From 90b87143bc1d61a16fd321d5050226e5ebbebf9c Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 8 Feb 2022 11:22:17 +0100
Subject: [PATCH 1/6] add LSUN prototype dataset

---
 test/builtin_dataset_mocks.py | 46 +++++
 .../prototype/datasets/_builtin/__init__.py | 1 +
 .../prototype/datasets/_builtin/lsun.py | 186 ++++++++++++++++++
 .../prototype/datasets/utils/_internal.py | 12 +-
 4 files changed, 241 insertions(+), 4 deletions(-)
 create mode 100644 torchvision/prototype/datasets/_builtin/lsun.py

diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py
index d79cd78d1ff..133cda4f18b 100644
--- a/test/builtin_dataset_mocks.py
+++ b/test/builtin_dataset_mocks.py
@@ -9,6 +9,7 @@
 import pathlib
 import pickle
 import random
+import string
 import xml.etree.ElementTree as ET
 from collections import defaultdict, Counter
 
@@ -16,6 +17,7 @@
 import PIL.Image
 import pytest
 import torch
+from common_utils import get_tmp_dir
 from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
@@ -1340,3 +1342,47 @@ def pcam(info, root, config):
         compressed_file.write(compressed_data)
 
     return num_images
+
+
+@register_mock
+def lsun(info, root, config):
+    def make_lmdb(path):
+        import lmdb
+
+        hexdigits_lowercase = string.digits + string.ascii_lowercase[:6]
+
+        num_samples = torch.randint(1, 4, size=()).item()
+        format = "png"
+
+        with get_tmp_dir() as tmp_dir:
+            files = create_image_folder(tmp_dir, "tmp", lambda idx: f"{idx}.{format}", num_samples)
+
+            values = []
+            for file in files:
+                buffer = io.BytesIO()
+                PIL.Image.open(file).save(buffer, format)
+                buffer.seek(0)
+                values.append(buffer.read())
+
+        with lmdb.open(str(path)) as env, env.begin(write=True) as txn:
+            for value in values:
+                key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode()
+                txn.put(key, value)
+
+        return num_samples
+
+    if config.split == "test":
+        names = ["test_lmdb"]
+    else:
+        names = [f"{category}_{config.split}_lmdb" for category in info.categories]
+
+    num_samples = 0
+    for name in names:
+        data_folder = root / name
+        data_folder.mkdir()
+
+        num_samples += make_lmdb(data_folder)
+
+        make_zip(root, data_folder.with_suffix(".zip").name)
+
+    return num_samples
diff --git a/torchvision/prototype/datasets/_builtin/__init__.py b/torchvision/prototype/datasets/_builtin/__init__.py
index 9fdfca904f5..abdad63928a 100644
--- a/torchvision/prototype/datasets/_builtin/__init__.py
+++ b/torchvision/prototype/datasets/_builtin/__init__.py
@@ -8,6 +8,7 @@
 from .fer2013 import FER2013
 from .gtsrb import GTSRB
 from .imagenet import ImageNet
+from .lsun import Lsun
 from .mnist import MNIST, FashionMNIST, KMNIST, EMNIST, QMNIST
 from .oxford_iiit_pet import OxfordIITPet
 from .pcam import PCAM
diff --git a/torchvision/prototype/datasets/_builtin/lsun.py b/torchvision/prototype/datasets/_builtin/lsun.py
new file mode 100644
index 00000000000..3fba37efe97
--- /dev/null
+++ b/torchvision/prototype/datasets/_builtin/lsun.py
@@ -0,0 +1,186 @@
+import functools
+import io
+import pathlib
+import re
+from typing import Any, Callable, Dict, List, Optional, Tuple, Iterator
+
+import torch
+from torchdata.datapipes.iter import IterDataPipe, Mapper, OnDiskCacheHolder, Concater, IterableWrapper
+from torchvision.prototype.datasets.utils import (
+    Dataset,
+    DatasetConfig,
+    DatasetInfo,
+    HttpResource,
+    OnlineResource,
+    DatasetType,
+)
+from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling
+from torchvision.prototype.features import Label
+
+# We need lmdb.Environment as annotation, but lmdb is an optional requirement at import
+try:
+    import lmdb
+
+    Environment = lmdb.Environment
+except ImportError:
+    Environment = Any
+
+
+class LmdbKeyExtractor(IterDataPipe[Tuple[str, bytes]]):
+    def __init__(self, datapipe: IterDataPipe[str]) -> None:
+        self.datapipe = datapipe
+
+    def __iter__(self) -> Iterator[Tuple[str, bytes]]:
+        import lmdb
+
+        for path in self.datapipe:
+            with lmdb.open(path, readonly=True) as env:
+                with env.begin(write=False) as txn:
+                    keys = b"\n".join(key for key in txn.cursor().iternext(keys=True, values=False))
+                    yield path, keys
+
+
+class LmdbLoader(IterDataPipe[Tuple[Environment, bytes]]):
+    def __init__(self, datapipe: IterDataPipe[str]) -> None:
+        self.datapipe = datapipe
+
+    def __iter__(self) -> Iterator[Tuple[Environment, bytes]]:  # type: ignore[valid-type]
+        import lmdb
+
+        for cache_path in self.datapipe:
+            env = lmdb.open(str(pathlib.Path(cache_path).parent), readonly=True)
+
+            with open(cache_path, "rb") as file:
+                for key in file:
+                    yield env, key.strip()
+
+
+class LmdbReader(IterDataPipe):
+    def __init__(self, datapipe: IterDataPipe[Tuple[Environment, bytes]]):
+        self.datapipe = datapipe
+
+    def __iter__(self) -> Iterator[Tuple[str, bytes, io.BytesIO]]:
+        for env, key in self.datapipe:
+            with env.begin(write=False) as txn:
+                yield env.path(), key, io.BytesIO(txn.get(key))
+
+
+class LsunHttpResource(HttpResource):
+    def __init__(self, *args: Any, extract: bool = True, **kwargs: Any) -> None:
+        super().__init__(*args, extract=extract, **kwargs)
+
+    def _loader(self, path: pathlib.Path) -> IterDataPipe[str]:
+        # LMDB datasets cannot be loaded through an open file handle, but have to be loaded through the path of the
+        # parent directory.
+        return IterableWrapper([str(next(path.rglob("data.mdb")).parent)])
+
+
+class Lsun(Dataset):
+    def _make_info(self) -> DatasetInfo:
+        return DatasetInfo(
+            "lsun",
+            type=DatasetType.IMAGE,
+            categories=(
+                "bedroom",
+                "bridge",
+                "church_outdoor",
+                "classroom",
+                "conference_room",
+                "dining_room",
+                "kitchen",
+                "living_room",
+                "restaurant",
+                "tower",
+            ),
+            valid_options=dict(split=("train", "val", "test")),
+            dependencies=("lmdb",),
+            homepage="https://www.yf.io/p/lsun",
+        )
+
+    _CHECKSUMS = {
+        ("train", "bedroom"): "",
+        ("train", "bridge"): "",
+        ("train", "church_outdoor"): "",
+        ("train", "classroom"): "",
+        ("train", "conference_room"): "",
+        ("train", "dining_room"): "",
+        ("train", "kitchen"): "",
+        ("train", "living_room"): "",
+        ("train", "restaurant"): "",
+        ("train", "tower"): "",
+        ("val", "bedroom"): "5d022e781b241c25ec2e1f1f769afcdb8091d7fd58362667aec03137b8114b12",
+        ("val", "bridge"): "83216a2974d6068c2e1d18086006e7380ff58540216f955ce87fe049b460cb0d",
+        ("val", "church_outdoor"): "34635b7547a3e51a15f942a4a4082dd6bc9cca381a953515cb2275c0eed50584",
+        ("val", "classroom"): "5e0e9a375d94091dfe1fa3be87d4a92f41c03f1c0b8e376acc7e05651de512d7",
+        ("val", "conference_room"): "927c94df52e10b9b374748c2b83b28b5860e946b3186dfd587985e274834650f",
+        ("val", "dining_room"): "bd604d4b91bb5a9611d4e0b85475efd20758390d1a4eb57b53973fcbb5aa8ab6",
+        ("val", "kitchen"): "329165f35ec61c4cf49f809246de300b8baad3ffcbda1ac30c27bdd32c84369a",
+        ("val", "living_room"): "30a23d9a3db5414e9c97865f60ffb2ee973bfa658a23dbca7188ea514c97c9fc",
+        ("val", "restaurant"): "efaa7bcb898ad6cb73b07b89fec3a9c670f4622912eea22fab3986c2cf9a1c20",
+        ("val", "tower"): "7f5257847bc01f4e40d4a1b3e24dd8fcd37063f12ca8cf31e726c2ee0b1ae104",
+    }
+
+    def resources(self, config: DatasetConfig) -> List[OnlineResource]:
+        url_root = "http://dl.yf.io/lsun/scenes"
+        if config.split == "test":
+            return [
+                LsunHttpResource(
+                    f"{url_root}/test_lmdb.zip",
+                    sha256="5ee4f929363f26d1f3c7db6e40e3f7a8415cf777b3c5527f5f38bf3e9520ff22",
+                )
+            ]
+        else:
+            return [
+                LsunHttpResource(
+                    f"{url_root}/{category}_{config.split}_lmdb.zip",
+                    sha256=self._CHECKSUMS[(config.split, category)],
+                )
+                for category in self.categories
+            ]
+
+    _FOLDER_PATTERN = re.compile(r"(?P<category>\w*?)_(?P<split>(train|val))_lmdb")
+
+    def _collate_and_decode_sample(
+        self,
+        data: Tuple[str, bytes, io.BytesIO],
+        *,
+        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
+    ) -> Dict[str, Any]:
+        path, key, buffer = data
+
+        match = self._FOLDER_PATTERN.match(pathlib.Path(path).parent.name)
+        if match:
+            category = match["category"]
+            label = Label(self.categories.index(category), category=category)
+        else:
+            label = None
+
+        return dict(
+            path=path,
+            key=key,
+            image=decoder(buffer) if decoder else buffer,
+            label=label,
+        )
+
+    def _filepath_fn(self, path: str) -> str:
+        return str(pathlib.Path(path).joinpath("keys.cache"))
+
+    def _make_datapipe(
+        self,
+        resource_dps: List[IterDataPipe],
+        *,
+        config: DatasetConfig,
+        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
+    ) -> IterDataPipe[Dict[str, Any]]:
+        dp = Concater(*resource_dps)
+
+        # LMDB datasets are indexed, but extracting all keys is expensive. Since we need them for shuffling, we cache
+        # the keys on disk and subsequently only read them from there.
+        dp = OnDiskCacheHolder(dp, filepath_fn=self._filepath_fn)
+        dp = LmdbKeyExtractor(dp).end_caching(mode="wb", same_filepath_fn=True, skip_read=True)
+
+        dp = LmdbLoader(dp)
+        dp = hint_sharding(dp)
+        dp = hint_shuffling(dp)
+        dp = LmdbReader(dp)
+        return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py
index 1b437d50b85..b886a2c11bc 100644
--- a/torchvision/prototype/datasets/utils/_internal.py
+++ b/torchvision/prototype/datasets/utils/_internal.py
@@ -129,12 +129,16 @@ def path_accessor(getter: Union[str, Callable[[pathlib.Path], D]]) -> Callable[[
     return functools.partial(_path_accessor_closure, getter=getter)
 
 
-def _path_comparator_closure(data: Tuple[str, Any], *, accessor: Callable[[Tuple[str, Any]], D], value: D) -> bool:
-    return accessor(data) == value
+def _path_comparator_closure(
+    data: Tuple[str, Any], *, accessor: Callable[[Tuple[str, Any]], D], value: D, inv: bool
+) -> bool:
+    return (accessor(data) == value) ^ inv
 
 
-def path_comparator(getter: Union[str, Callable[[pathlib.Path], D]], value: D) -> Callable[[Tuple[str, Any]], bool]:
-    return functools.partial(_path_comparator_closure, accessor=path_accessor(getter), value=value)
+def path_comparator(
+    getter: Union[str, Callable[[pathlib.Path], D]], value: D, *, inv: bool = False
+) -> Callable[[Tuple[str, Any]], bool]:
+    return functools.partial(_path_comparator_closure, accessor=path_accessor(getter), value=value, inv=inv)
 
 
 class CompressionType(enum.Enum):

From 18b7d5e59e161dd5568a1335e5eea3d9b17e1824 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 8 Feb 2022 12:05:42 +0100
Subject: [PATCH 2/6] revert unrelated changes

---
 torchvision/prototype/datasets/utils/_internal.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py
index b886a2c11bc..1b437d50b85 100644
--- a/torchvision/prototype/datasets/utils/_internal.py
+++ b/torchvision/prototype/datasets/utils/_internal.py
@@ -129,16 +129,12 @@ def path_accessor(getter: Union[str, Callable[[pathlib.Path], D]]) -> Callable[[
     return functools.partial(_path_accessor_closure, getter=getter)
 
 
-def _path_comparator_closure(
-    data: Tuple[str, Any], *, accessor: Callable[[Tuple[str, Any]], D], value: D, inv: bool
-) -> bool:
-    return (accessor(data) == value) ^ inv
+def _path_comparator_closure(data: Tuple[str, Any], *, accessor: Callable[[Tuple[str, Any]], D], value: D) -> bool:
+    return accessor(data) == value
 
 
-def path_comparator(
-    getter: Union[str, Callable[[pathlib.Path], D]], value: D, *, inv: bool = False
-) -> Callable[[Tuple[str, Any]], bool]:
-    return functools.partial(_path_comparator_closure, accessor=path_accessor(getter), value=value, inv=inv)
+def path_comparator(getter: Union[str, Callable[[pathlib.Path], D]], value: D) -> Callable[[Tuple[str, Any]], bool]:
+    return functools.partial(_path_comparator_closure, accessor=path_accessor(getter), value=value)
 
 
 class CompressionType(enum.Enum):

From 10ddf73bca3741334e13bc371039fddb2af6285b Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 8 Feb 2022 12:07:51 +0100
Subject: [PATCH 3/6] improve mock data generation

---
 test/builtin_dataset_mocks.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py
index 133cda4f18b..1d8978f9a6e 100644
--- a/test/builtin_dataset_mocks.py
+++ b/test/builtin_dataset_mocks.py
@@ -18,7 +18,7 @@
 import pytest
 import torch
 from common_utils import get_tmp_dir
-from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file
+from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file, create_random_string
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
 from torchvision.prototype.datasets._api import find
@@ -1349,8 +1349,6 @@ def lsun(info, root, config):
     def make_lmdb(path):
         import lmdb
 
-        hexdigits_lowercase = string.digits + string.ascii_lowercase[:6]
-
         num_samples = torch.randint(1, 4, size=()).item()
         format = "png"
 
@@ -1366,7 +1364,7 @@ def make_lmdb(path):
 
         with lmdb.open(str(path)) as env, env.begin(write=True) as txn:
             for value in values:
-                key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode()
+                key = create_random_string(40, string.digits + string.ascii_lowercase[:6]).encode()
                 txn.put(key, value)
 
         return num_samples

From 94431d4f6fb2f3a4e1ef412da40ce5ce62608211 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 8 Feb 2022 14:54:05 +0100
Subject: [PATCH 4/6] add lmdb to CI requirements

---
 .circleci/config.yml    | 2 +-
 .circleci/config.yml.in | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index f1ddaf861ac..079f721d7e0 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -351,7 +351,7 @@ jobs:
       - install_torchvision
       - install_prototype_dependencies
       - pip_install:
-          args: scipy pycocotools h5py
+          args: scipy pycocotools h5py lmdb
           descr: Install optional dependencies
       - run:
           name: Enable prototype tests
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index 4bd2e14147a..a8d6c5e0b17 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -351,7 +351,7 @@ jobs:
       - install_torchvision
       - install_prototype_dependencies
       - pip_install:
-          args: scipy pycocotools h5py
+          args: scipy pycocotools h5py lmdb
           descr: Install optional dependencies
       - run:
           name: Enable prototype tests

From 7a262821dbb5784e7cc8231d253b31995dce25a7 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 15 Feb 2022 08:53:34 +0100
Subject: [PATCH 5/6] cleanup

---
 .../prototype/datasets/_builtin/lsun.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/torchvision/prototype/datasets/_builtin/lsun.py b/torchvision/prototype/datasets/_builtin/lsun.py
index 3fba37efe97..8d877f18689 100644
--- a/torchvision/prototype/datasets/_builtin/lsun.py
+++ b/torchvision/prototype/datasets/_builtin/lsun.py
@@ -98,16 +98,16 @@ def _make_info(self) -> DatasetInfo:
         )
 
     _CHECKSUMS = {
-        ("train", "bedroom"): "",
-        ("train", "bridge"): "",
-        ("train", "church_outdoor"): "",
-        ("train", "classroom"): "",
-        ("train", "conference_room"): "",
-        ("train", "dining_room"): "",
-        ("train", "kitchen"): "",
-        ("train", "living_room"): "",
-        ("train", "restaurant"): "",
-        ("train", "tower"): "",
+        ("train", "bedroom"): "a15644c2e7153106867f0adb3491fc41224102d1f01b24494fb47d9f5d1f174e",
+        ("train", "bridge"): "2701b2d421bbc4d8a5e9b0652ff7c3b57cd6495da8e0e85c39533275b5a925a3",
+        ("train", "church_outdoor"): "91128ae026840ac0c5982b4445ab5fc4e092d6847cca76793b2b1a0815c2e74a",
+        ("train", "classroom"): "73a8a3e318819e1cc602f229673c3f51a68f3ece61e3764ce22df6abea4d0873",
+        ("train", "conference_room"): "fa0a4cf72e7acfb103392eaf33640d5508a728e971f31877b01d64e1bde6068c",
+        ("train", "dining_room"): "e4ee24c7c309360c3bf019123ce5bbf17434b2ba33abec2b0b07cfae715a52cb",
+        ("train", "kitchen"): "b1993cf639aece5d207a27eb9ff872bcca9dff6472d8227a052c79d40ee753c4",
+        ("train", "living_room"): "bd2c52b812c80f73ce3062a221396d13a52b5cce2f813b4cdf61937651281d7a",
+        ("train", "restaurant"): "11a0a924b960cd0900e9b7477d684fb338bd99cc5f72db1caac592e4f497e09a",
+        ("train", "tower"): "440caec74c9641cb51fd235a5970c8e4931c3af875aeb044a8bcca956e106309",
         ("val", "bedroom"): "5d022e781b241c25ec2e1f1f769afcdb8091d7fd58362667aec03137b8114b12",
         ("val", "bridge"): "83216a2974d6068c2e1d18086006e7380ff58540216f955ce87fe049b460cb0d",
         ("val", "church_outdoor"): "34635b7547a3e51a15f942a4a4082dd6bc9cca381a953515cb2275c0eed50584",
@@ -163,7 +163,7 @@ def _collate_and_decode_sample(
         )
 
     def _filepath_fn(self, path: str) -> str:
-        return str(pathlib.Path(path).joinpath("keys.cache"))
+        return str(pathlib.Path(path) / "keys.cache")
 
     def _make_datapipe(
         self,
@@ -174,7 +174,7 @@ def _make_datapipe(
     ) -> IterDataPipe[Dict[str, Any]]:
         dp = Concater(*resource_dps)
 
-        # LMDB datasets are indexed, but extracting all keys is expensive. Since we need them for shuffling, we cache
+        # LMDB databases are indexed, but extracting all keys is expensive. Since we need them for shuffling, we cache
         # the keys on disk and subsequently only read them from there.
         dp = OnDiskCacheHolder(dp, filepath_fn=self._filepath_fn)
         dp = LmdbKeyExtractor(dp).end_caching(mode="wb", same_filepath_fn=True, skip_read=True)

From 66ba84a66d00bd1441790568f69857c667a1e9cd Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 15 Feb 2022 09:01:04 +0100
Subject: [PATCH 6/6] remove decoder

---
 .../prototype/datasets/_builtin/lsun.py | 31 +++++--------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/torchvision/prototype/datasets/_builtin/lsun.py b/torchvision/prototype/datasets/_builtin/lsun.py
index 8d877f18689..fda4f797490 100644
--- a/torchvision/prototype/datasets/_builtin/lsun.py
+++ b/torchvision/prototype/datasets/_builtin/lsun.py
@@ -1,10 +1,8 @@
-import functools
 import io
 import pathlib
 import re
-from typing import Any, Callable, Dict, List, Optional, Tuple, Iterator
+from typing import Any, Dict, List, Tuple, Iterator
 
-import torch
 from torchdata.datapipes.iter import IterDataPipe, Mapper, OnDiskCacheHolder, Concater, IterableWrapper
 from torchvision.prototype.datasets.utils import (
     Dataset,
@@ -12,10 +10,9 @@
     DatasetInfo,
     HttpResource,
     OnlineResource,
-    DatasetType,
 )
 from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling
-from torchvision.prototype.features import Label
+from torchvision.prototype.features import Label, EncodedImage
 
 # We need lmdb.Environment as annotation, but lmdb is an optional requirement at import
 try:
@@ -79,7 +76,6 @@ class Lsun(Dataset):
     def _make_info(self) -> DatasetInfo:
         return DatasetInfo(
             "lsun",
-            type=DatasetType.IMAGE,
             categories=(
                 "bedroom",
                 "bridge",
@@ -140,25 +136,16 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]:
 
     _FOLDER_PATTERN = re.compile(r"(?P<category>\w*?)_(?P<split>(train|val))_lmdb")
 
-    def _collate_and_decode_sample(
-        self,
-        data: Tuple[str, bytes, io.BytesIO],
-        *,
-        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
-    ) -> Dict[str, Any]:
+    def _prepare_sample(self, data: Tuple[str, bytes, io.BytesIO]) -> Dict[str, Any]:
         path, key, buffer = data
 
         match = self._FOLDER_PATTERN.match(pathlib.Path(path).parent.name)
-        if match:
-            category = match["category"]
-            label = Label(self.categories.index(category), category=category)
-        else:
-            label = None
+        label = Label.from_category(match["category"], categories=self.categories) if match else None
 
         return dict(
             path=path,
             key=key,
-            image=decoder(buffer) if decoder else buffer,
+            image=EncodedImage.from_file(buffer),
             label=label,
         )
 
@@ -166,11 +153,7 @@ def _filepath_fn(self, path: str) -> str:
         return str(pathlib.Path(path) / "keys.cache")
 
     def _make_datapipe(
-        self,
-        resource_dps: List[IterDataPipe],
-        *,
-        config: DatasetConfig,
-        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
+        self, resource_dps: List[IterDataPipe], *, config: DatasetConfig
     ) -> IterDataPipe[Dict[str, Any]]:
         dp = Concater(*resource_dps)
 
@@ -183,4 +166,4 @@ def _make_datapipe(
         dp = hint_sharding(dp)
         dp = hint_shuffling(dp)
         dp = LmdbReader(dp)
-        return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
+        return Mapper(dp, self._prepare_sample)
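
A minimal usage sketch for this series, assuming the prototype `torchvision.prototype.datasets.load` entry point (not shown in these patches) resolves the dataset by the name registered in its `DatasetInfo` and forwards config options such as `split`; exact names in the prototype namespace may differ:

    from torchvision.prototype import datasets

    # Each sample is the dict built by Lsun._prepare_sample() after PATCH 6/6:
    # "path" and "key" identify the record in its LMDB file, "image" is an
    # EncodedImage, and "label" is a Label or None.
    dataset = datasets.load("lsun", split="val")
    for sample in dataset:
        print(sample["path"], sample["label"])
        break

Note that for the "test" split the archive extracts to `test_lmdb`, which does not match `_FOLDER_PATTERN`, so `label` is `None` there.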