From 90b87143bc1d61a16fd321d5050226e5ebbebf9c Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 8 Feb 2022 11:22:17 +0100
Subject: [PATCH 1/6] add LSUN prototype dataset

---
 test/builtin_dataset_mocks.py | 46 +++++
 .../prototype/datasets/_builtin/__init__.py | 1 +
 .../prototype/datasets/_builtin/lsun.py | 186 ++++++++++++++++++
 .../prototype/datasets/utils/_internal.py | 12 +-
 4 files changed, 241 insertions(+), 4 deletions(-)
 create mode 100644 torchvision/prototype/datasets/_builtin/lsun.py

diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py
index d79cd78d1ff..133cda4f18b 100644
--- a/test/builtin_dataset_mocks.py
+++ b/test/builtin_dataset_mocks.py
@@ -9,6 +9,7 @@
 import pathlib
 import pickle
 import random
+import string
 import xml.etree.ElementTree as ET
 from collections import defaultdict, Counter
 
@@ -16,6 +17,7 @@
 import PIL.Image
 import pytest
 import torch
+from common_utils import get_tmp_dir
 from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
@@ -1340,3 +1342,47 @@ def pcam(info, root, config):
         compressed_file.write(compressed_data)
 
     return num_images
+
+
+@register_mock
+def lsun(info, root, config):
+    def make_lmdb(path):
+        import lmdb
+
+        hexdigits_lowercase = string.digits + string.ascii_lowercase[:6]
+
+        num_samples = torch.randint(1, 4, size=()).item()
+        format = "png"
+
+        with get_tmp_dir() as tmp_dir:
+            files = create_image_folder(tmp_dir, "tmp", lambda idx: f"{idx}.{format}", num_samples)
+
+            values = []
+            for file in files:
+                buffer = io.BytesIO()
+                PIL.Image.open(file).save(buffer, format)
+                buffer.seek(0)
+                values.append(buffer.read())
+
+        with lmdb.open(str(path)) as env, env.begin(write=True) as txn:
+            for value in values:
+                key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode()
+                txn.put(key, value)
+
+        return num_samples
+
+    if config.split == "test":
+        names = ["test_lmdb"]
+    else:
+        names = [f"{category}_{config.split}_lmdb" for category in info.categories]
+
+    num_samples = 0
+    for name in names:
+        data_folder = root / name
+        data_folder.mkdir()
+
+        num_samples += make_lmdb(data_folder)
+
+        make_zip(root, data_folder.with_suffix(".zip").name)
+
+    return num_samples
diff --git a/torchvision/prototype/datasets/_builtin/__init__.py b/torchvision/prototype/datasets/_builtin/__init__.py
index 9fdfca904f5..abdad63928a 100644
--- a/torchvision/prototype/datasets/_builtin/__init__.py
+++ b/torchvision/prototype/datasets/_builtin/__init__.py
@@ -8,6 +8,7 @@
 from .fer2013 import FER2013
 from .gtsrb import GTSRB
 from .imagenet import ImageNet
+from .lsun import Lsun
 from .mnist import MNIST, FashionMNIST, KMNIST, EMNIST, QMNIST
 from .oxford_iiit_pet import OxfordIITPet
 from .pcam import PCAM
diff --git a/torchvision/prototype/datasets/_builtin/lsun.py b/torchvision/prototype/datasets/_builtin/lsun.py
new file mode 100644
index 00000000000..3fba37efe97
--- /dev/null
+++ b/torchvision/prototype/datasets/_builtin/lsun.py
@@ -0,0 +1,186 @@
+import functools
+import io
+import pathlib
+import re
+from typing import Any, Callable, Dict, List, Optional, Tuple, Iterator
+
+import torch
+from torchdata.datapipes.iter import IterDataPipe, Mapper, OnDiskCacheHolder, Concater, IterableWrapper
+from torchvision.prototype.datasets.utils import (
+    Dataset,
+    DatasetConfig,
+    DatasetInfo,
+    HttpResource,
+    OnlineResource,
+    DatasetType,
+)
+from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling
+from torchvision.prototype.features import Label
+
+# We need lmdb.Environment as annotation, but lmdb is an optional requirement at import
+try:
+    import lmdb
+
+    Environment = lmdb.Environment
+except ImportError:
+    Environment = Any
+
+
+class LmdbKeyExtractor(IterDataPipe[Tuple[str, bytes]]):
+    def __init__(self, datapipe: IterDataPipe[str]) -> None:
+        self.datapipe = datapipe
+
+    def __iter__(self) -> Iterator[Tuple[str, bytes]]:
+        import lmdb
+
+        for path in self.datapipe:
+            with lmdb.open(path, readonly=True) as env:
+                with env.begin(write=False) as txn:
+                    keys = b"\n".join(key for key in txn.cursor().iternext(keys=True, values=False))
+                    yield path, keys
+
+
+class LmdbLoader(IterDataPipe[Tuple[Environment, bytes]]):
+    def __init__(self, datapipe: IterDataPipe[str]) -> None:
+        self.datapipe = datapipe
+
+    def __iter__(self) -> Iterator[Tuple[Environment, bytes]]:  # type: ignore[valid-type]
+        import lmdb
+
+        for cache_path in self.datapipe:
+            env = lmdb.open(str(pathlib.Path(cache_path).parent), readonly=True)
+
+            with open(cache_path, "rb") as file:
+                for key in file:
+                    yield env, key.strip()
+
+
+class LmdbReader(IterDataPipe):
+    def __init__(self, datapipe: IterDataPipe[Tuple[Environment, bytes]]):
+        self.datapipe = datapipe
+
+    def __iter__(self) -> Iterator[Tuple[str, bytes, io.BytesIO]]:
+        for env, key in self.datapipe:
+            with env.begin(write=False) as txn:
+                yield env.path(), key, io.BytesIO(txn.get(key))
+
+
+class LsunHttpResource(HttpResource):
+    def __init__(self, *args: Any, extract: bool = True, **kwargs: Any) -> None:
+        super().__init__(*args, extract=extract, **kwargs)
+
+    def _loader(self, path: pathlib.Path) -> IterDataPipe[str]:
+        # LMDB datasets cannot be loaded through an open file handle, but have to be loaded through the path of the
+        # parent directory.
+        return IterableWrapper([str(next(path.rglob("data.mdb")).parent)])
+
+
+class Lsun(Dataset):
+    def _make_info(self) -> DatasetInfo:
+        return DatasetInfo(
+            "lsun",
+            type=DatasetType.IMAGE,
+            categories=(
+                "bedroom",
+                "bridge",
+                "church_outdoor",
+                "classroom",
+                "conference_room",
+                "dining_room",
+                "kitchen",
+                "living_room",
+                "restaurant",
+                "tower",
+            ),
+            valid_options=dict(split=("train", "val", "test")),
+            dependencies=("lmdb",),
+            homepage="https://www.yf.io/p/lsun",
+        )
+
+    _CHECKSUMS = {
+        ("train", "bedroom"): "",
+        ("train", "bridge"): "",
+        ("train", "church_outdoor"): "",
+        ("train", "classroom"): "",
+        ("train", "conference_room"): "",
+        ("train", "dining_room"): "",
+        ("train", "kitchen"): "",
+        ("train", "living_room"): "",
+        ("train", "restaurant"): "",
+        ("train", "tower"): "",
+        ("val", "bedroom"): "5d022e781b241c25ec2e1f1f769afcdb8091d7fd58362667aec03137b8114b12",
+        ("val", "bridge"): "83216a2974d6068c2e1d18086006e7380ff58540216f955ce87fe049b460cb0d",
+        ("val", "church_outdoor"): "34635b7547a3e51a15f942a4a4082dd6bc9cca381a953515cb2275c0eed50584",
+        ("val", "classroom"): "5e0e9a375d94091dfe1fa3be87d4a92f41c03f1c0b8e376acc7e05651de512d7",
+        ("val", "conference_room"): "927c94df52e10b9b374748c2b83b28b5860e946b3186dfd587985e274834650f",
+        ("val", "dining_room"): "bd604d4b91bb5a9611d4e0b85475efd20758390d1a4eb57b53973fcbb5aa8ab6",
+        ("val", "kitchen"): "329165f35ec61c4cf49f809246de300b8baad3ffcbda1ac30c27bdd32c84369a",
+        ("val", "living_room"): "30a23d9a3db5414e9c97865f60ffb2ee973bfa658a23dbca7188ea514c97c9fc",
+        ("val", "restaurant"): "efaa7bcb898ad6cb73b07b89fec3a9c670f4622912eea22fab3986c2cf9a1c20",
+        ("val", "tower"): "7f5257847bc01f4e40d4a1b3e24dd8fcd37063f12ca8cf31e726c2ee0b1ae104",
+    }
+
+    def resources(self, config: DatasetConfig) -> List[OnlineResource]:
+        url_root = "http://dl.yf.io/lsun/scenes"
+        if config.split == "test":
+            return [
+                LsunHttpResource(
+                    f"{url_root}/test_lmdb.zip",
+                    sha256="5ee4f929363f26d1f3c7db6e40e3f7a8415cf777b3c5527f5f38bf3e9520ff22",
+                )
+            ]
+        else:
+            return [
+                LsunHttpResource(
+                    f"{url_root}/{category}_{config.split}_lmdb.zip",
+                    sha256=self._CHECKSUMS[(config.split, category)],
+                )
+                for category in self.categories
+            ]
+
+    _FOLDER_PATTERN = re.compile(r"(?P<category>\w*?)_(?P<split>(train|val))_lmdb")
+
+    def _collate_and_decode_sample(
+        self,
+        data: Tuple[str, bytes, io.BytesIO],
+        *,
+        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
+    ) -> Dict[str, Any]:
+        path, key, buffer = data
+
+        match = self._FOLDER_PATTERN.match(pathlib.Path(path).parent.name)
+        if match:
+            category = match["category"]
+            label = Label(self.categories.index(category), category=category)
+        else:
+            label = None
+
+        return dict(
+            path=path,
+            key=key,
+            image=decoder(buffer) if decoder else buffer,
+            label=label,
+        )
+
+    def _filepath_fn(self, path: str) -> str:
+        return str(pathlib.Path(path).joinpath("keys.cache"))
+
+    def _make_datapipe(
+        self,
+        resource_dps: List[IterDataPipe],
+        *,
+        config: DatasetConfig,
+        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
+    ) -> IterDataPipe[Dict[str, Any]]:
+        dp = Concater(*resource_dps)
+
+        # LMDB datasets are indexed, but extracting all keys is expensive. Since we need them for shuffling, we cache
+        # the keys on disk and subsequently only read them from there.
+        dp = OnDiskCacheHolder(dp, filepath_fn=self._filepath_fn)
+        dp = LmdbKeyExtractor(dp).end_caching(mode="wb", same_filepath_fn=True, skip_read=True)
+
+        dp = LmdbLoader(dp)
+        dp = hint_sharding(dp)
+        dp = hint_shuffling(dp)
+        dp = LmdbReader(dp)
+        return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py
index 1b437d50b85..b886a2c11bc 100644
--- a/torchvision/prototype/datasets/utils/_internal.py
+++ b/torchvision/prototype/datasets/utils/_internal.py
@@ -129,12 +129,16 @@ def path_accessor(getter: Union[str, Callable[[pathlib.Path], D]]) -> Callable[[
     return functools.partial(_path_accessor_closure, getter=getter)
 
 
-def _path_comparator_closure(data: Tuple[str, Any], *, accessor: Callable[[Tuple[str, Any]], D], value: D) -> bool:
-    return accessor(data) == value
+def _path_comparator_closure(
+    data: Tuple[str, Any], *, accessor: Callable[[Tuple[str, Any]], D], value: D, inv: bool
+) -> bool:
+    return (accessor(data) == value) ^ inv
 
 
-def path_comparator(getter: Union[str, Callable[[pathlib.Path], D]], value: D) -> Callable[[Tuple[str, Any]], bool]:
-    return functools.partial(_path_comparator_closure, accessor=path_accessor(getter), value=value)
+def path_comparator(
+    getter: Union[str, Callable[[pathlib.Path], D]], value: D, *, inv: bool = False
+) -> Callable[[Tuple[str, Any]], bool]:
+    return functools.partial(_path_comparator_closure, accessor=path_accessor(getter), value=value, inv=inv)
 
 
 class CompressionType(enum.Enum):

From 18b7d5e59e161dd5568a1335e5eea3d9b17e1824 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 8 Feb 2022 12:05:42 +0100
Subject: [PATCH 2/6] revert unrelated changes

---
 torchvision/prototype/datasets/utils/_internal.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py
index b886a2c11bc..1b437d50b85 100644
--- a/torchvision/prototype/datasets/utils/_internal.py
+++ b/torchvision/prototype/datasets/utils/_internal.py
@@ -129,16 +129,12 @@ def path_accessor(getter: Union[str, Callable[[pathlib.Path], D]]) -> Callable[[
     return functools.partial(_path_accessor_closure, getter=getter)
 
 
-def _path_comparator_closure(
-    data: Tuple[str, Any], *, accessor: Callable[[Tuple[str, Any]], D], value: D, inv: bool
-) -> bool:
-    return (accessor(data) == value) ^ inv
+def _path_comparator_closure(data: Tuple[str, Any], *, accessor: Callable[[Tuple[str, Any]], D], value: D) -> bool:
+    return accessor(data) == value
 
 
-def path_comparator(
-    getter: Union[str, Callable[[pathlib.Path], D]], value: D, *, inv: bool = False
-) -> Callable[[Tuple[str, Any]], bool]:
-    return functools.partial(_path_comparator_closure, accessor=path_accessor(getter), value=value, inv=inv)
+def path_comparator(getter: Union[str, Callable[[pathlib.Path], D]], value: D) -> Callable[[Tuple[str, Any]], bool]:
+    return functools.partial(_path_comparator_closure, accessor=path_accessor(getter), value=value)
 
 
 class CompressionType(enum.Enum):

From 10ddf73bca3741334e13bc371039fddb2af6285b Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 8 Feb 2022 12:07:51 +0100
Subject: [PATCH 3/6] improve mock data generation

---
 test/builtin_dataset_mocks.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py
index 133cda4f18b..1d8978f9a6e 100644
--- a/test/builtin_dataset_mocks.py
+++ b/test/builtin_dataset_mocks.py
@@ -18,7 +18,7 @@
 import pytest
 import torch
 from common_utils import get_tmp_dir
-from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file
+from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file, create_random_string
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
 from torchvision.prototype.datasets._api import find
@@ -1349,8 +1349,6 @@ def lsun(info, root, config):
     def make_lmdb(path):
         import lmdb
 
-        hexdigits_lowercase = string.digits + string.ascii_lowercase[:6]
-
         num_samples = torch.randint(1, 4, size=()).item()
         format = "png"
 
@@ -1366,7 +1364,7 @@ def make_lmdb(path):
 
         with lmdb.open(str(path)) as env, env.begin(write=True) as txn:
             for value in values:
-                key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode()
+                key = create_random_string(40, string.digits + string.ascii_lowercase[:6]).encode()
                 txn.put(key, value)
 
         return num_samples

From 94431d4f6fb2f3a4e1ef412da40ce5ce62608211 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 8 Feb 2022 14:54:05 +0100
Subject: [PATCH 4/6] add lmdb to CI requirements

---
 .circleci/config.yml    | 2 +-
 .circleci/config.yml.in | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index f1ddaf861ac..079f721d7e0 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -351,7 +351,7 @@ jobs:
       - install_torchvision
       - install_prototype_dependencies
       - pip_install:
-          args: scipy pycocotools h5py
+          args: scipy pycocotools h5py lmdb
           descr: Install optional dependencies
       - run:
           name: Enable prototype tests
diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in
index 4bd2e14147a..a8d6c5e0b17 100644
--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -351,7 +351,7 @@ jobs:
       - install_torchvision
       - install_prototype_dependencies
       - pip_install:
-          args: scipy pycocotools h5py
+          args: scipy pycocotools h5py lmdb
           descr: Install optional dependencies
       - run:
           name: Enable prototype tests

From 7a262821dbb5784e7cc8231d253b31995dce25a7 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 15 Feb 2022 08:53:34 +0100
Subject: [PATCH 5/6] cleanup

---
 .../prototype/datasets/_builtin/lsun.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/torchvision/prototype/datasets/_builtin/lsun.py b/torchvision/prototype/datasets/_builtin/lsun.py
index 3fba37efe97..8d877f18689 100644
--- a/torchvision/prototype/datasets/_builtin/lsun.py
+++ b/torchvision/prototype/datasets/_builtin/lsun.py
@@ -98,16 +98,16 @@ def _make_info(self) -> DatasetInfo:
         )
 
     _CHECKSUMS = {
-        ("train", "bedroom"): "",
-        ("train", "bridge"): "",
-        ("train", "church_outdoor"): "",
-        ("train", "classroom"): "",
-        ("train", "conference_room"): "",
-        ("train", "dining_room"): "",
-        ("train", "kitchen"): "",
-        ("train", "living_room"): "",
-        ("train", "restaurant"): "",
-        ("train", "tower"): "",
+        ("train", "bedroom"): "a15644c2e7153106867f0adb3491fc41224102d1f01b24494fb47d9f5d1f174e",
+        ("train", "bridge"): "2701b2d421bbc4d8a5e9b0652ff7c3b57cd6495da8e0e85c39533275b5a925a3",
+        ("train", "church_outdoor"): "91128ae026840ac0c5982b4445ab5fc4e092d6847cca76793b2b1a0815c2e74a",
+        ("train", "classroom"): "73a8a3e318819e1cc602f229673c3f51a68f3ece61e3764ce22df6abea4d0873",
+        ("train", "conference_room"): "fa0a4cf72e7acfb103392eaf33640d5508a728e971f31877b01d64e1bde6068c",
+        ("train", "dining_room"): "e4ee24c7c309360c3bf019123ce5bbf17434b2ba33abec2b0b07cfae715a52cb",
+        ("train", "kitchen"): "b1993cf639aece5d207a27eb9ff872bcca9dff6472d8227a052c79d40ee753c4",
+        ("train", "living_room"): "bd2c52b812c80f73ce3062a221396d13a52b5cce2f813b4cdf61937651281d7a",
+        ("train", "restaurant"): "11a0a924b960cd0900e9b7477d684fb338bd99cc5f72db1caac592e4f497e09a",
+        ("train", "tower"): "440caec74c9641cb51fd235a5970c8e4931c3af875aeb044a8bcca956e106309",
         ("val", "bedroom"): "5d022e781b241c25ec2e1f1f769afcdb8091d7fd58362667aec03137b8114b12",
         ("val", "bridge"): "83216a2974d6068c2e1d18086006e7380ff58540216f955ce87fe049b460cb0d",
         ("val", "church_outdoor"): "34635b7547a3e51a15f942a4a4082dd6bc9cca381a953515cb2275c0eed50584",
@@ -163,7 +163,7 @@ def _collate_and_decode_sample(
         )
 
     def _filepath_fn(self, path: str) -> str:
-        return str(pathlib.Path(path).joinpath("keys.cache"))
+        return str(pathlib.Path(path) / "keys.cache")
 
     def _make_datapipe(
         self,
@@ -174,7 +174,7 @@ def _make_datapipe(
     ) -> IterDataPipe[Dict[str, Any]]:
         dp = Concater(*resource_dps)
 
-        # LMDB datasets are indexed, but extracting all keys is expensive. Since we need them for shuffling, we cache
+        # LMDB databases are indexed, but extracting all keys is expensive. Since we need them for shuffling, we cache
         # the keys on disk and subsequently only read them from there.
         dp = OnDiskCacheHolder(dp, filepath_fn=self._filepath_fn)
         dp = LmdbKeyExtractor(dp).end_caching(mode="wb", same_filepath_fn=True, skip_read=True)

From 66ba84a66d00bd1441790568f69857c667a1e9cd Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 15 Feb 2022 09:01:04 +0100
Subject: [PATCH 6/6] remove decoder

---
 .../prototype/datasets/_builtin/lsun.py | 31 +++++--------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/torchvision/prototype/datasets/_builtin/lsun.py b/torchvision/prototype/datasets/_builtin/lsun.py
index 8d877f18689..fda4f797490 100644
--- a/torchvision/prototype/datasets/_builtin/lsun.py
+++ b/torchvision/prototype/datasets/_builtin/lsun.py
@@ -1,10 +1,8 @@
-import functools
 import io
 import pathlib
 import re
-from typing import Any, Callable, Dict, List, Optional, Tuple, Iterator
+from typing import Any, Dict, List, Tuple, Iterator
 
-import torch
 from torchdata.datapipes.iter import IterDataPipe, Mapper, OnDiskCacheHolder, Concater, IterableWrapper
 from torchvision.prototype.datasets.utils import (
     Dataset,
@@ -12,10 +10,9 @@
     DatasetInfo,
     HttpResource,
     OnlineResource,
-    DatasetType,
 )
 from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling
-from torchvision.prototype.features import Label
+from torchvision.prototype.features import Label, EncodedImage
 
 # We need lmdb.Environment as annotation, but lmdb is an optional requirement at import
 try:
@@ -79,7 +76,6 @@ class Lsun(Dataset):
     def _make_info(self) -> DatasetInfo:
         return DatasetInfo(
             "lsun",
-            type=DatasetType.IMAGE,
             categories=(
                 "bedroom",
                 "bridge",
@@ -140,25 +136,16 @@ def resources(self, config: DatasetConfig) -> List[OnlineResource]:
 
     _FOLDER_PATTERN = re.compile(r"(?P<category>\w*?)_(?P<split>(train|val))_lmdb")
 
-    def _collate_and_decode_sample(
-        self,
-        data: Tuple[str, bytes, io.BytesIO],
-        *,
-        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
-    ) -> Dict[str, Any]:
+    def _prepare_sample(self, data: Tuple[str, bytes, io.BytesIO]) -> Dict[str, Any]:
         path, key, buffer = data
 
         match = self._FOLDER_PATTERN.match(pathlib.Path(path).parent.name)
-        if match:
-            category = match["category"]
-            label = Label(self.categories.index(category), category=category)
-        else:
-            label = None
+        label = Label.from_category(match["category"], categories=self.categories) if match else None
 
         return dict(
             path=path,
             key=key,
-            image=decoder(buffer) if decoder else buffer,
+            image=EncodedImage.from_file(buffer),
             label=label,
         )
 
@@ -166,11 +153,7 @@ def _filepath_fn(self, path: str) -> str:
         return str(pathlib.Path(path) / "keys.cache")
 
     def _make_datapipe(
-        self,
-        resource_dps: List[IterDataPipe],
-        *,
-        config: DatasetConfig,
-        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
+        self, resource_dps: List[IterDataPipe], *, config: DatasetConfig
     ) -> IterDataPipe[Dict[str, Any]]:
         dp = Concater(*resource_dps)
 
@@ -183,4 +166,4 @@ def _make_datapipe(
         dp = hint_sharding(dp)
         dp = hint_shuffling(dp)
         dp = LmdbReader(dp)
-        return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
+        return Mapper(dp, self._prepare_sample)
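
A minimal usage sketch for this series, assuming the prototype `torchvision.prototype.datasets.load` entry point (not shown in these patches) resolves the dataset by the name registered in its `DatasetInfo` and forwards config options such as `split`; exact names in the prototype namespace may differ:

    from torchvision.prototype import datasets

    # Each sample is the dict built by Lsun._prepare_sample() after PATCH 6/6:
    # "path" and "key" identify the record in its LMDB file, "image" is an
    # EncodedImage, and "label" is a Label or None.
    dataset = datasets.load("lsun", split="val")
    for sample in dataset:
        print(sample["path"], sample["label"])
        break

Note that for the "test" split the archive extracts to `test_lmdb`, which does not match `_FOLDER_PATTERN`, so `label` is `None` there.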