From aba8779b15ae225858a169c577426c5ef5bab626 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 15:46:23 +0100 Subject: [PATCH 01/29] add base class for datasets tests --- test/common_utils.py | 8 + test/datasets_testcases.py | 318 +++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 test/datasets_testcases.py diff --git a/test/common_utils.py b/test/common_utils.py index 76cdfdc006b..a0fb7781899 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -393,3 +393,11 @@ def int_dtypes(): def float_dtypes(): return torch.testing.floating_types() + + +@contextlib.contextmanager +def disable_console_output(): + with contextlib.ExitStack() as stack, open(os.devnull, "w") as devnull: + stack.enter_context(contextlib.redirect_stdout(devnull)) + stack.enter_context(contextlib.redirect_stderr(devnull)) + yield diff --git a/test/datasets_testcases.py b/test/datasets_testcases.py new file mode 100644 index 00000000000..5d4cb36e575 --- /dev/null +++ b/test/datasets_testcases.py @@ -0,0 +1,318 @@ +import contextlib +import functools +import importlib +import inspect +import itertools +import unittest +import unittest.mock + +from PIL import Image + +import torchvision.datasets + +from datasets_utils import tmpdir, disable_console_output + + +__all__ = ["DatasetTestCase", "ImageDatasetTestCase", "VideoDatasetTestCase", "test_all_configs"] + + +class UsageError(RuntimeError): + """Should be raised instead of a generic ``RuntimeError`` in case a test case is not correctly configured.""" + + +# As of Python 3.7 this is provided by contextlib +# https://docs.python.org/3.7/library/contextlib.html#contextlib.nullcontext +# TODO: If the minimum Python requirement is >= 3.7, replace this +@contextlib.contextmanager +def nullcontext(enter_result=None): + yield enter_result + + +def test_all_configs(test): + """Decorator to run test against all configurations. 
+ + Add this as decorator to an arbitrary test to run it against all configurations. The current configuration is + provided as the first parameter: + + .. code-block:: + + @test_all_configs + def test_foo(self, config): + pass + """ + + @functools.wraps(test) + def wrapper(self): + for config in self.CONFIGS: + with self.subTest(**config): + test(self, config) + + return wrapper + + +class DatasetTestCase(unittest.TestCase): + """Abstract base class for all dataset testcases. + + You have to overwrite the following class attributes: + + - DATASET_CLASS (torchvision.datasets.VisionDataset): Class of dataset to be tested. + - FEATURE_TYPES (Sequence[Any]): Types of the elements returned by index access of the dataset. Instead of + providing these manually, you can instead subclass ``ImageDatasetTestCase`` or ``VideoDatasetTestCase```to + get a reasonable default, that should work for most cases. + + Optionally, you can overwrite the following class attributes: + + - CONFIGS (Sequence[Dict[str, Any]]): Additional configs that should be tested. Each dictonary can contain an + arbitrary combination of dataset parameters that are **not** ``transform``, ``target_transform``, + ``transforms``, or ``download``. The first element will be used as default configuration. + - REQUIRED_PACKAGES (Iterable[str]): Additional dependencies to use the dataset. If these packages are not + available, the tests are skipped. + + Additionally, you need to overwrite the ``inject_fake_data()`` method that provides the data that the tests rely on. + The fake data should resemble the original data as close as necessary, while containing only few examples. During + the creation of the dataset check-, download-, and extract-functions from ``torchvision.datasets.utils`` are + disabled. + + Without further configuration, the testcase will test if + + 1. the dataset raises a ``RuntimeError`` if the data files are not found, + 2. the dataset inherits from `torchvision.datasets.VisionDataset`, + 3. 
the dataset can be turned into a string, + 4. the feature types of a returned example matches ``FEATURE_TYPES``, and + 5. the number of examples matches the injected fake data. + + Case 3., 4., and 5. are tested against all configurations in ``CONFIGS``. + + To add dataset-specific tests, create a new method that takes no arguments with ``test_`` as a name prefix: + + .. code-block:: + + def test_foo(self): + pass + + If you want to run the test against all configs, add the ``@test_all_configs`` decorator to the definition and + accept a single argument: + + .. code-block:: + + @test_all_configs + def test_bar(self, config): + pass + + Within the test you can use the ``create_dataset()`` method that yields the dataset as well as additional information + provided by the ``ìnject_fake_data()`` method: + + .. code-block:: + + def test_baz(self): + with self.create_dataset() as (dataset, info): + pass + """ + + DATASET_CLASS = None + FEATURE_TYPES = None + + CONFIGS = None + REQUIRED_PACKAGES = None + + _SPECIAL_KWARGS = { + "transform", + "target_transform", + "transforms", + "download", + } + _HAS_SPECIAL_KWARG = None + + _CHECK_FUNCTIONS = { + "check_md5", + "check_integrity", + } + _DOWNLOAD_EXTRACT_FUNCTIONS = { + "download_url", + "download_file_from_google_drive", + "extract_archive", + "download_and_extract_archive", + } + + def inject_fake_data(self, root: str, config: dict[str, Any]) -> dict[str, Any]: + """Inject fake data into the root of the dataset. + + Args: + root (str): Root of the dataset. + config (dict[str, Any]): Configuration that will be used to create the dataset. + + Returns: + info (dict[str, Any]): Additional information about the injected fake data. Must contain the field + ``"num_examples"`` that corresponds to the length of the dataset to be created. 
+ """ + raise NotImplementedError("You need to provide fake data in order for the tests to run.") + + @contextlib.contextmanager + def create_dataset(self, config=None, inject_fake_data=True, disable_download_extract=None, **kwargs): + r"""Create the dataset in a temporary directory. + + Args: + config (Optional[dict[str, Any]]): Configuration that will be used to create the dataset. If omitted, the + default configuration is used. + inject_fake_data (bool): If ``True`` (default) inject the fake data with :meth:`.inject_fake_data` before + creating the dataset. + disable_download_extract (Optional[bool]): If ``True`` disable download and extract logic while creating + the dataset. If ``None`` (default) this takes the same value as ``inject_fake_data``. + **kwargs (Any): Additional parameters passed to the dataset. These parameters take precedence in case they + overlap with ``config``. + + Yields: + dataset (torchvision.dataset.VisionDataset): Dataset. + info (dict[str, Any]): Additional information about the injected fake data. See :meth:`.inject_fake_data` + for details. + """ + if config is None: + config = self.CONFIGS[0] + + special_kwargs, other_kwargs = self._split_kwargs(kwargs) + config.update(other_kwargs) + + if disable_download_extract is None: + disable_download_extract = inject_fake_data + + with tmpdir() as root: + info = self.inject_fake_data(root, config) if inject_fake_data else None + if info is None or "num_examples" not in info: + raise UsageError( + "The method 'inject_fake_data' needs to return a dictionary that contains at least a " + "'num_examples' field." 
+ ) + + cm = self._disable_download_extract if disable_download_extract else nullcontext + with cm(special_kwargs), disable_console_output(): + dataset = self.DATASET_CLASS(root, **config, **special_kwargs) + + yield dataset, info + + @classmethod + def setUpClass(cls): + cls._verify_required_public_class_attributes() + cls._populate_private_class_attributes() + cls._process_optional_public_class_attributes() + super().setUpClass() + + @classmethod + def _verify_required_public_class_attributes(cls): + if cls.DATASET_CLASS is None: + raise UsageError( + "The class attribute 'DATASET_CLASS' needs to be overwritten. " + "It should contain the class of the dataset to be tested." + ) + if cls.FEATURE_TYPES is None: + raise UsageError( + "The class attribute 'FEATURE_TYPES' needs to be overwritten. " + "It should contain a sequence of types that the dataset returns when accessed by index." + ) + + @property + @classmethod + def _argspec(cls): + return inspect.getfullargspec(cls.DATASET_CLASS.__init__) + + @property + @classmethod + def _name(cls): + return cls.DATASET_CLASS.__name__ + + @classmethod + def _populate_private_class_attributes(cls): + cls._HAS_SPECIAL_KWARG = {name: name in cls._argspec.args for name in cls._SPECIAL_KWARGS} + + @classmethod + def _process_optional_public_class_attributes(cls): + argspec = cls._argspec + if cls.CONFIGS is None: + config = { + kwarg: default + for kwarg, default in zip(argspec.args[-len(argspec.defaults) :], argspec.defaults) + if kwarg not in cls._SPECIAL_KWARGS + } + cls.CONFIGS = (config,) + + if cls.REQUIRED_PACKAGES is not None: + try: + for pkg in cls.REQUIRED_PACKAGES: + importlib.import_module(pkg) + except ImportError as error: + raise unittest.SkipTest( + f"The package '{error.name}' is required to load the dataset '{cls._name}' but is not installed." 
+ ) + + def _split_kwargs(self, kwargs): + special_kwargs = kwargs.copy() + other_kwargs = {key: special_kwargs.pop(key) for key in set(special_kwargs.keys()) - self._SPECIAL_KWARGS} + return special_kwargs, other_kwargs + + @contextlib.contextmanager + def _disable_download_extract(self, special_kwargs): + inject_download_kwarg = self._HAS_SPECIAL_KWARG["download"] and "download" not in special_kwargs + if inject_download_kwarg: + special_kwargs["download"] = False + + module = inspect.getmodule(self.DATASET_CLASS).__name__ + with contextlib.ExitStack() as stack: + mocks = {} + for function, kwargs in itertools.chain( + zip(self._CHECK_FUNCTIONS, [dict(return_value=True)] * len(self._CHECK_FUNCTIONS)), + zip(self._DOWNLOAD_EXTRACT_FUNCTIONS, [dict()] * len(self._DOWNLOAD_EXTRACT_FUNCTIONS)), + ): + with contextlib.suppress(AttributeError): + patcher = unittest.mock.patch(f"{module}.{function}", **kwargs) + mocks[function] = stack.enter_context(patcher) + + try: + yield mocks + finally: + if inject_download_kwarg: + del special_kwargs["download"] + + def test_not_found(self): + with self.assertRaises(RuntimeError): + with self.create_dataset(inject_fake_data=False): + pass + + def test_smoke(self, config): + with self.create_dataset(config) as (dataset, _): + self.assertIsInstance(dataset, torchvision.datasets.VisionDataset) + + @test_all_configs + def test_str_smoke(self, config): + with self.create_dataset(config) as (dataset, _): + self.assertIsInstance(str(dataset), str) + + @test_all_configs + def test_feature_types(self, config): + with self.create_dataset(config) as (dataset, _): + example = dataset[0] + + actual = len(example) + expected = len(self.FEATURE_TYPES) + self.assertEqual( + actual, + expected, + f"The number of the returned features does not match the the number of elements in in FEATURE_TYPES: " + f"{actual} != {expected}", + ) + + for idx, (feature, expected_feature_type) in enumerate(zip(example, self.FEATURE_TYPES)): + with 
self.subTest(idx=idx): + self.assertIsInstance(feature, expected_feature_type) + + @test_all_configs + def test_num_examples(self, config): + with self.create_dataset(config) as (dataset, info): + self.assertEqual(len(dataset), info["num_examples"]) + + +class ImageDatasetTestCase(DatasetTestCase): + FEATURE_TYPES = (Image.Image, int) + + +class VideoDatasetTestCase(DatasetTestCase): + FEATURE_TYPES = (torch.Tensor, torch.Tensor, int) + REQUIRED_PACKAGES = ("av",) From fd8d48adee8d2da4fbd1baef7bd6304eaf497ba5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 16:18:14 +0100 Subject: [PATCH 02/29] add better type hints --- ...atasets_testcases.py => datasets_utils.py} | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) rename test/{datasets_testcases.py => datasets_utils.py} (94%) diff --git a/test/datasets_testcases.py b/test/datasets_utils.py similarity index 94% rename from test/datasets_testcases.py rename to test/datasets_utils.py index 5d4cb36e575..cdf1159694c 100644 --- a/test/datasets_testcases.py +++ b/test/datasets_utils.py @@ -5,6 +5,7 @@ import itertools import unittest import unittest.mock +from typing import Any, Iterator, Sequence, Tuple, Union from PIL import Image @@ -134,25 +135,31 @@ def test_baz(self): "download_and_extract_archive", } - def inject_fake_data(self, root: str, config: dict[str, Any]) -> dict[str, Any]: + def inject_fake_data(self, root: str, config: Dict[str, Any]) -> Dict[str, Any]: """Inject fake data into the root of the dataset. Args: root (str): Root of the dataset. - config (dict[str, Any]): Configuration that will be used to create the dataset. + config (Dict[str, Any]): Configuration that will be used to create the dataset. Returns: - info (dict[str, Any]): Additional information about the injected fake data. Must contain the field + info (Dict[str, Any]): Additional information about the injected fake data. 
Must contain the field ``"num_examples"`` that corresponds to the length of the dataset to be created. """ raise NotImplementedError("You need to provide fake data in order for the tests to run.") @contextlib.contextmanager - def create_dataset(self, config=None, inject_fake_data=True, disable_download_extract=None, **kwargs): + def create_dataset( + self, + config: Optional[Dict[str, Any]] = None, + inject_fake_data: bool = True, + disable_download_extract: Optional[bool] = None, + **kwargs: Any, + ) -> Iterator[Tuple[torchvision.datasets.VisionDataset, Dict[str, Any]]]: r"""Create the dataset in a temporary directory. Args: - config (Optional[dict[str, Any]]): Configuration that will be used to create the dataset. If omitted, the + config (Optional[Dict[str, Any]]): Configuration that will be used to create the dataset. If omitted, the default configuration is used. inject_fake_data (bool): If ``True`` (default) inject the fake data with :meth:`.inject_fake_data` before creating the dataset. @@ -163,7 +170,7 @@ def create_dataset(self, config=None, inject_fake_data=True, disable_download_ex Yields: dataset (torchvision.dataset.VisionDataset): Dataset. - info (dict[str, Any]): Additional information about the injected fake data. See :meth:`.inject_fake_data` + info (Dict[str, Any]): Additional information about the injected fake data. See :meth:`.inject_fake_data` for details. """ if config is None: From cb29187fa1770436a1f9652cce137f8afa1985d9 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 16:24:00 +0100 Subject: [PATCH 03/29] add documentation to subclasses --- test/datasets_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index cdf1159694c..39afd3fb592 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -317,9 +317,21 @@ def test_num_examples(self, config): class ImageDatasetTestCase(DatasetTestCase): + """Abstract base class for image dataset testcases. 
+ + - Overwrites the FEATURE_TYPES class attribute to expect a :class:`PIL.Image.Image` and an integer label. + """ + FEATURE_TYPES = (Image.Image, int) class VideoDatasetTestCase(DatasetTestCase): + """Abstract base class for video dataset testcases. + + - Overwrites the FEATURE_TYPES class attribute to expect two :class:`torch.Tensor` s for the video and audio as + well as an integer label. + - Overwrites the REQUIRED_PACKAGES class attribute to require PyAV (``av``). + """ + FEATURE_TYPES = (torch.Tensor, torch.Tensor, int) REQUIRED_PACKAGES = ("av",) From 11b27e480ed86b5f7db2991978a8da0653747150 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 17:04:40 +0100 Subject: [PATCH 04/29] add utility functions to create files / folders of random images and videos --- test/datasets_utils.py | 195 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 191 insertions(+), 4 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 39afd3fb592..919299a6837 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -3,22 +3,57 @@ import importlib import inspect import itertools +import os +import pathlib import unittest import unittest.mock from typing import Any, Iterator, Sequence, Tuple, Union -from PIL import Image +import PIL.Image +import torch import torchvision.datasets +from common_utils import get_tmp_dir from datasets_utils import tmpdir, disable_console_output +try: + from torchvision.io import write_video -__all__ = ["DatasetTestCase", "ImageDatasetTestCase", "VideoDatasetTestCase", "test_all_configs"] + PYAV_AVAILABLE = True + +except ImportError: + write_video = None + PYAV_AVAILABLE = False + + +__all__ = [ + "UsageError", + "test_all_configs", + "DatasetTestCase", + "ImageDatasetTestCase", + "VideoDatasetTestCase", + "create_image_or_video_tensor", + "create_image_file", + "create_image_folder", + "create_video_file", + "create_video_folder", +] class UsageError(RuntimeError): - """Should be raised instead 
of a generic ``RuntimeError`` in case a test case is not correctly configured.""" + """Should be raised in case an error happens in the setup rather than the test.""" + + +def requires_pyav(fn): + @functools.wraps(fn) + def wrapper(*args, **kwargs): + if not PYAV_AVAILABLE: + raise UsageError("PyAV (av) is required but not available.") + + return fn(*args, **kwargs) + + return wrapper # As of Python 3.7 this is provided by contextlib @@ -322,7 +357,7 @@ class ImageDatasetTestCase(DatasetTestCase): - Overwrites the FEATURE_TYPES class attribute to expect a :class:`PIL.Image.Image` and an integer label. """ - FEATURE_TYPES = (Image.Image, int) + FEATURE_TYPES = (PIL.Image.Image, int) class VideoDatasetTestCase(DatasetTestCase): @@ -335,3 +370,155 @@ class VideoDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (torch.Tensor, torch.Tensor, int) REQUIRED_PACKAGES = ("av",) + + +def create_image_or_video_tensor(size: Sequence[int]) -> torch.Tensor: + r"""Create a random uint8 tensor. + + Args: + size (Sequence[int]): Size of the tensor. + """ + return torch.randint(0, 256, size, dtype=torch.uint8) + + +def create_image_file( + root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], size: Union[Sequence[int], int] = 10, **kwargs: Any +) -> None: + """Create an image file from random data. + + Args: + root (Union[str, pathlib.Path]): Root directory the image file will be placed in. + name (Union[str, pathlib.Path]): Name of the image file. + size (Union[Sequence[int], int]): Size of the image that represents the ``(num_channels, height, width)``. If + scalar, the value is used for the height and width. If not provided, three channels are assumed. + kwargs (Any): Additional parameters passed to :meth:`PIL.Image.Image.save`. + """ + if isinstance(size, int): + size = (size, size) + if len(size) == 2: + size = (3, *size) + if len(size) != 3: + raise UsageError( + f"The 'size' argument should either be an int or a sequence of length 2 or 3. 
Got {len(size)} instead" + ) + + image = create_image_or_video_tensor(size) + PIL.Image.fromarray(image.permute(2, 1, 0).numpy()).save(pathlib.Path(root) / name) + + +def create_image_folder( + root: Union[pathlib.Path, str], + name: Union[pathlib.Path, str], + file_name_fn: Callable[[idx], str], + num_examples: int, + size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, + **kwargs: Any, +): + """Create a folder of random images. + + Args: + root (Union[str, pathlib.Path]): Root directory the image folder will be placed in. + name (Union[str, pathlib.Path]): Name of the image folder. + file_name_fn (Callable[[idx], str]): Should return a file name if called with the file index. + num_examples (int): Number of images to create. + size (Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]]): Size of the images. If + callable, will be called with the index of the corresponding file. If omitted, a random height and width + between 3 and 10 pixels is selected on a per-image basis. + kwargs (Any): Additional parameters passed to :func:`create_image_file`. + """ + if size is None: + + def size(idx: int) -> Tuple[int, int, int]: + num_channels = 3 + height, width = torch.randint(3, 11, size=(2,), dtype=np.int).tolist() + return (num_channels, height, width) + + root = pathlib.Path(root) / name + os.makedirs(root) + + for idx in range(num_examples): + create_image_file(root, file_name_fn(idx), size=size(idx) if callable(size) else size, **kwargs) + + +@requires_pyav +def create_video_file( + root: Union[pathlib.Path, str], + name: Union[pathlib.Path, str], + size: Union[Sequence[int], int] = (25, 3, 10, 10), + fps: float = 25, + **kwargs: Any, +) -> None: + """Create an video file from random data. + + Args: + root (Union[str, pathlib.Path]): Root directory the video file will be placed in. + name (Union[str, pathlib.Path]): Name of the video file. 
+ size (Union[Sequence[int], int]): Size of the video that represents the + ``(length, num_channels, height, width)``. If scalar, the value is used for the height and width. + If not provided, three channels are assumed. If not provided, the length is set to one second. + fps (float): Frame rate in frames per second. + kwargs (Any): Additional parameters passed to :func:`torchvision.io.write_video`. + + Raises: + UsageError: If PyAV is not available. + """ + if not PYAV_AVAILABLE: + raise PyAVNotAvailableError + + if isinstance(size, int): + size = (size, size) + if len(size) == 2: + size = (3, *size) + if len(size) == 3: + size = (fps, *size) + if len(size) != 4: + raise UsageError( + f"The 'size' argument should either be an int or a sequence of length 2, 3, or 4. Got {len(size)} instead" + ) + + video = create_image_or_video_tensor(size) + write_video(str(pathlib.Path(root) / name), video.permute(0, 2, 3, 1), fps, **kwargs) + + +@requires_pyav +def create_video_folder( + root: Union[str, pathlib.Path], + name: Union[str, pathlib.Path], + file_name_fn: Callable[[idx], str], + num_examples: int, + size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, + fps=25, + **kwargs, +): + """Create a folder of random videos. + + Args: + root (Union[str, pathlib.Path]): Root directory the image folder will be placed in. + name (Union[str, pathlib.Path]): Name of the image folder. + file_name_fn (Callable[[idx], str]): Should return a file name if called with the file index. + num_examples (int): Number of images to create. + size (Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]]): Size of the images. If + callable, will be called with the index of the corresponding file. If omitted, a random length between 0.5 + and 1.5 seconds as well as random even height and width between 4 and 10 pixels are selected on a + per-video basis. + fps (float): Frame rate in frames per second. 
+ kwargs (Any): Additional parameters passed to :func:`create_video_file`. + + Raises: + UsageError: If PyAV is not available. + """ + if size is None: + + def size(idx): + length = int((torch.rand(()).item() + 0.5) * fps) + num_channels = 3 + # The 'libx264' video codec, which is the default of torchvision.io.write_video, requires the height and + # width of the video to be divisible by 2. + height, width = (torch.randint(2, 6, size=(2,), dtype=np.int) * 2).tolist() + return (length, num_channels, height, width) + + root = pathlib.Path(root) / name + os.makedirs(root) + + for idx in range(num_examples): + create_video_file(root, file_name_fn(idx), size=size(idx) if callable(size) else size) From 3115e1afec54255335a6505827162508df8c9e34 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 17:23:08 +0100 Subject: [PATCH 05/29] fix imports --- test/datasets_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 919299a6837..95c7de8f937 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -7,15 +7,14 @@ import pathlib import unittest import unittest.mock -from typing import Any, Iterator, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Iterator, Optional, Sequence, Tuple, Union import PIL.Image import torch import torchvision.datasets -from common_utils import get_tmp_dir -from datasets_utils import tmpdir, disable_console_output +from common_utils import get_tmp_dir, disable_console_output try: from torchvision.io import write_video @@ -217,7 +216,7 @@ def create_dataset( if disable_download_extract is None: disable_download_extract = inject_fake_data - with tmpdir() as root: + with get_tmp_dir() as root: info = self.inject_fake_data(root, config) if inject_fake_data else None if info is None or "num_examples" not in info: raise UsageError( From 5f32d77832c319f18a7e181715aea7af29007f42 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 
Feb 2021 17:23:41 +0100 Subject: [PATCH 06/29] remove class properties --- test/datasets_utils.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 95c7de8f937..5b4e1f75b46 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -250,23 +250,14 @@ def _verify_required_public_class_attributes(cls): "It should contain a sequence of types that the dataset returns when accessed by index." ) - @property - @classmethod - def _argspec(cls): - return inspect.getfullargspec(cls.DATASET_CLASS.__init__) - - @property - @classmethod - def _name(cls): - return cls.DATASET_CLASS.__name__ - @classmethod def _populate_private_class_attributes(cls): - cls._HAS_SPECIAL_KWARG = {name: name in cls._argspec.args for name in cls._SPECIAL_KWARGS} + argspec = inspect.getfullargspec(cls.DATASET_CLASS.__init__) + cls._HAS_SPECIAL_KWARG = {name: name in argspec.args for name in cls._SPECIAL_KWARGS} @classmethod def _process_optional_public_class_attributes(cls): - argspec = cls._argspec + argspec = inspect.getfullargspec(cls.DATASET_CLASS.__init__) if cls.CONFIGS is None: config = { kwarg: default @@ -281,7 +272,8 @@ def _process_optional_public_class_attributes(cls): importlib.import_module(pkg) except ImportError as error: raise unittest.SkipTest( - f"The package '{error.name}' is required to load the dataset '{cls._name}' but is not installed." + f"The package '{error.name}' is required to load the dataset '{cls.DATASET_CLASS.__name__}' but is " + f"not installed." 
) def _split_kwargs(self, kwargs): From 2cc4d778737e501ccff05539b28ea44a688c756e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 17:23:54 +0100 Subject: [PATCH 07/29] fix smoke test --- test/datasets_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 5b4e1f75b46..25888feb66f 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -309,8 +309,8 @@ def test_not_found(self): with self.create_dataset(inject_fake_data=False): pass - def test_smoke(self, config): - with self.create_dataset(config) as (dataset, _): + def test_smoke(self): + with self.create_dataset() as (dataset, _): self.assertIsInstance(dataset, torchvision.datasets.VisionDataset) @test_all_configs From 29645772110a1831046edafa9cc568d5dffcb716 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 17:24:15 +0100 Subject: [PATCH 08/29] fix type hints --- test/datasets_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 25888feb66f..f3ab1892cdf 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -400,7 +400,7 @@ def create_image_file( def create_image_folder( root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], - file_name_fn: Callable[[idx], str], + file_name_fn: Callable[[int], str], num_examples: int, size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, **kwargs: Any, @@ -410,7 +410,7 @@ def create_image_folder( Args: root (Union[str, pathlib.Path]): Root directory the image folder will be placed in. name (Union[str, pathlib.Path]): Name of the image folder. - file_name_fn (Callable[[idx], str]): Should return a file name if called with the file index. + file_name_fn (Callable[[int], str]): Should return a file name if called with the file index. num_examples (int): Number of images to create. 
size (Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]]): Size of the images. If callable, will be called with the index of the corresponding file. If omitted, a random height and width @@ -475,7 +475,7 @@ def create_video_file( def create_video_folder( root: Union[str, pathlib.Path], name: Union[str, pathlib.Path], - file_name_fn: Callable[[idx], str], + file_name_fn: Callable[[int], str], num_examples: int, size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, fps=25, @@ -486,7 +486,7 @@ def create_video_folder( Args: root (Union[str, pathlib.Path]): Root directory the image folder will be placed in. name (Union[str, pathlib.Path]): Name of the image folder. - file_name_fn (Callable[[idx], str]): Should return a file name if called with the file index. + file_name_fn (Callable[[int], str]): Should return a file name if called with the file index. num_examples (int): Number of images to create. size (Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]]): Size of the images. If callable, will be called with the index of the corresponding file. 
If omitted, a random length between 0.5 From 5993f8c4d1d4d183beadd14fbe4e370e35ca16a2 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 17:24:29 +0100 Subject: [PATCH 09/29] fix random size generation --- test/datasets_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index f3ab1892cdf..40e4c2db678 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -421,7 +421,7 @@ def create_image_folder( def size(idx: int) -> Tuple[int, int, int]: num_channels = 3 - height, width = torch.randint(3, 11, size=(2,), dtype=np.int).tolist() + height, width = torch.randint(3, 11, size=(2,), dtype=torch.int).tolist() return (num_channels, height, width) root = pathlib.Path(root) / name @@ -505,7 +505,7 @@ def size(idx): num_channels = 3 # The 'libx264' video codec, which is the default of torchvision.io.write_video, requires the height and # width of the video to be divisible by 2. - height, width = (torch.randint(2, 6, size=(2,), dtype=np.int) * 2).tolist() + height, width = (torch.randint(2, 6, size=(2,), dtype=torch.int) * 2).tolist() return (length, num_channels, height, width) root = pathlib.Path(root) / name From 8dca6e4911bd91584f9459316854bec03d21edb4 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 17:29:25 +0100 Subject: [PATCH 10/29] add Caltech256 as example --- test/test_datasets.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/test_datasets.py b/test/test_datasets.py index ff8e0281e7c..087bc2d75a5 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -15,6 +15,9 @@ import xml.etree.ElementTree as ET from urllib.request import Request, urlopen import itertools +import datasets_utils +import pathlib +from torchvision import datasets try: @@ -466,5 +469,25 @@ def test_repr_smoke(self): self.assertIsInstance(repr(dataset), str) +class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): + 
DATASET_CLASS = datasets.Caltech256 + + def inject_fake_data(self, root, config): + root = pathlib.Path(root) / "caltech256" / "256_ObjectCategories" + + categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) + num_images_per_category = 2 + + for idx, category in categories: + datasets_utils.create_image_folder( + root, + name=f"{idx:03d}.{category}", + file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx:04d}.jpg", + num_examples=num_images_per_category, + ) + + return dict(num_examples=num_images_per_category * len(categories)) + + if __name__ == '__main__': unittest.main() From 37eeff7192d41b5a3ba377d25c119fa06728ca86 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 18:04:41 +0100 Subject: [PATCH 11/29] add utility function to create grid of combinations --- test/datasets_utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 40e4c2db678..db75f8f5b18 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -85,6 +85,23 @@ def wrapper(self): return wrapper +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values. + + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + class DatasetTestCase(unittest.TestCase): """Abstract base class for all dataset testcases. From a9526e1782e2d4d17498efd8d9365c5e35c178ba Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 18:06:16 +0100 Subject: [PATCH 12/29] add CIFAR100? 
as example --- test/test_datasets.py | 72 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 087bc2d75a5..5b618fb071f 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -489,5 +489,75 @@ def inject_fake_data(self, root, config): return dict(num_examples=num_images_per_category * len(categories)) -if __name__ == '__main__': +class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CIFAR10 + CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + + _VERSION_CONFIG = dict( + base_folder="cifar-10-batches-py", + train_files=tuple(f"data_batch_{idx}" for idx in range(1, 6)), + test_files=("test_batch",), + labels_key="labels", + meta_file="batches.meta", + num_categories=10, + categories_key="label_names", + ) + + def inject_fake_data(self, root, config): + root = pathlib.Path(root) / self._VERSION_CONFIG["base_folder"] + os.makedirs(root) + + num_images_per_file = 1 + for name in itertools.chain(self._VERSION_CONFIG["train_files"], self._VERSION_CONFIG["test_files"]): + self._create_batch_file(root, name, num_images_per_file) + + categories = self._create_meta_file(root) + + return dict( + num_examples=num_images_per_file + * len(self._VERSION_CONFIG["train_files"] if config["train"] else self._VERSION_CONFIG["test_files"]), + categories=categories, + ) + + def _create_batch_file(self, root, name, num_images): + data = datasets_utils.create_image_or_video_tensor((num_images, 32 * 32 * 3)) + labels = np.random.randint(0, self._VERSION_CONFIG["num_categories"], size=num_images).tolist() + self._create_binary_file(root, name, {"data": data, self._VERSION_CONFIG["labels_key"]: labels}) + + def _create_meta_file(self, root): + categories = [ + f"{idx:0{len(str(self._VERSION_CONFIG['num_categories'] - 1))}d}" + for idx in range(self._VERSION_CONFIG["num_categories"]) + ] + self._create_binary_file( + root, 
self._VERSION_CONFIG["meta_file"], {self._VERSION_CONFIG["categories_key"]: categories} + ) + return categories + + def _create_binary_file(self, root, name, content): + with open(pathlib.Path(root) / name, "wb") as fh: + pickle.dump(content, fh) + + def test_class_to_idx(self): + with self.create_dataset() as (dataset, info): + expected = {category: label for label, category in enumerate(info["categories"])} + actual = dataset.class_to_idx + self.assertEqual(actual, expected) + + +class CIFAR100(CIFAR10TestCase): + DATASET_CLASS = datasets.CIFAR100 + + _VERSION_CONFIG = dict( + base_folder="cifar-100-python", + train_files=("train",), + test_files=("test",), + labels_key="fine_labels", + meta_file="meta", + num_categories=100, + categories_key="fine_label_names", + ) + + +if __name__ == "__main__": unittest.main() From 857c5a8c12fa95bdb1d5ff589bab2845535c51ca Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 15 Feb 2021 18:19:06 +0100 Subject: [PATCH 13/29] lint --- test/datasets_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index db75f8f5b18..685898125bc 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -151,8 +151,8 @@ def test_foo(self): def test_bar(self, config): pass - Within the test you can use the ``create_dataset()`` method that yields the dataset as well as additional information - provided by the ``ìnject_fake_data()`` method: + Within the test you can use the ``create_dataset()`` method that yields the dataset as well as additional + information provided by the ``ìnject_fake_data()`` method: .. 
code-block:: @@ -278,7 +278,7 @@ def _process_optional_public_class_attributes(cls): if cls.CONFIGS is None: config = { kwarg: default - for kwarg, default in zip(argspec.args[-len(argspec.defaults) :], argspec.defaults) + for kwarg, default in zip(argspec.args[-len(argspec.defaults):], argspec.defaults) if kwarg not in cls._SPECIAL_KWARGS } cls.CONFIGS = (config,) From e85f9964f683deee37f51df4c3e552a873a6ccb1 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Feb 2021 10:24:33 +0100 Subject: [PATCH 14/29] add missing import --- test/test_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_datasets.py b/test/test_datasets.py index 5b618fb071f..11b3f90db2a 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -17,6 +17,7 @@ import itertools import datasets_utils import pathlib +import pickle from torchvision import datasets From 5ecd061f6e35c68c7cb7efcc694a3e97790fa7a4 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Feb 2021 14:38:11 +0100 Subject: [PATCH 15/29] improve documentation --- test/datasets_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 685898125bc..94ea95659ab 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -433,6 +433,11 @@ def create_image_folder( callable, will be called with the index of the corresponding file. If omitted, a random height and width between 3 and 10 pixels is selected on a per-image basis. kwargs (Any): Additional parameters passed to :func:`create_image_file`. + + + .. seealso:: + + - :func:`create_image_file` """ if size is None: @@ -514,6 +519,10 @@ def create_video_folder( Raises: UsageError: If PyAV is not available. + + .. 
seealso:: + + - :func:`create_video_file` """ if size is None: From 1175a326659c9803330070b6bf335bcb6c0a2259 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Feb 2021 14:38:59 +0100 Subject: [PATCH 16/29] create 1 frame videos by default --- test/datasets_utils.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 94ea95659ab..6217842fa13 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -457,7 +457,7 @@ def size(idx: int) -> Tuple[int, int, int]: def create_video_file( root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], - size: Union[Sequence[int], int] = (25, 3, 10, 10), + size: Union[Sequence[int], int] = (1, 3, 10, 10), fps: float = 25, **kwargs: Any, ) -> None: @@ -467,8 +467,8 @@ def create_video_file( root (Union[str, pathlib.Path]): Root directory the video file will be placed in. name (Union[str, pathlib.Path]): Name of the video file. size (Union[Sequence[int], int]): Size of the video that represents the - ``(length, num_channels, height, width)``. If scalar, the value is used for the height and width. - If not provided, three channels are assumed. If not provided, the length is set to one second. + ``(num_frames, num_channels, height, width)``. If scalar, the value is used for the height and width. + If not provided, ``num_frames=1`` and ``num_channels=3`` are assumed. fps (float): Frame rate in frames per second. kwargs (Any): Additional parameters passed to :func:`torchvision.io.write_video`. @@ -483,7 +483,7 @@ def create_video_file( if len(size) == 2: size = (3, *size) if len(size) == 3: - size = (fps, *size) + size = (1, *size) if len(size) != 4: raise UsageError( f"The 'size' argument should either be an int or a sequence of length 2, 3, or 4. Got {len(size)} instead" @@ -510,10 +510,9 @@ def create_video_folder( name (Union[str, pathlib.Path]): Name of the image folder. 
file_name_fn (Callable[[int], str]): Should return a file name if called with the file index. num_examples (int): Number of images to create. - size (Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]]): Size of the images. If - callable, will be called with the index of the corresponding file. If omitted, a random length between 0.5 - and 1.5 seconds as well as random even height and width between 4 and 10 pixels are selected on a - per-video basis. + size (Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]]): Size of the videos. If + callable, will be called with the index of the corresponding file. If omitted, a random even height and + width between 4 and 10 pixels is selected on a per-video basis. fps (float): Frame rate in frames per second. kwargs (Any): Additional parameters passed to :func:`create_video_file`. @@ -527,7 +526,7 @@ def create_video_folder( if size is None: def size(idx): - length = int((torch.rand(()).item() + 0.5) * fps) + num_frames = 1 num_channels = 3 # The 'libx264' video codec, which is the default of torchvision.io.write_video, requires the height and # width of the video to be divisible by 2. From d164ea9c14916e00269f302db9d1c3e8348804fd Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Feb 2021 14:39:23 +0100 Subject: [PATCH 17/29] remove obsolete check --- test/datasets_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 6217842fa13..93b06f452b4 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -475,9 +475,6 @@ def create_video_file( Raises: UsageError: If PyAV is not available. 
""" - if not PYAV_AVAILABLE: - raise PyAVNotAvailableError - if isinstance(size, int): size = (size, size) if len(size) == 2: From 9cadab16d78a2e4eb4339cea6fed6c9a6a6a7a3d Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Feb 2021 14:43:21 +0100 Subject: [PATCH 18/29] return path of files created with utility functions --- test/datasets_utils.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 93b06f452b4..7b2757cfa45 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -7,7 +7,7 @@ import pathlib import unittest import unittest.mock -from typing import Any, Callable, Dict, Iterator, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union import PIL.Image @@ -391,7 +391,7 @@ def create_image_or_video_tensor(size: Sequence[int]) -> torch.Tensor: def create_image_file( root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], size: Union[Sequence[int], int] = 10, **kwargs: Any -) -> None: +) -> pathlib.Path: """Create an image file from random data. Args: @@ -400,6 +400,9 @@ def create_image_file( size (Union[Sequence[int], int]): Size of the image that represents the ``(num_channels, height, width)``. If scalar, the value is used for the height and width. If not provided, three channels are assumed. kwargs (Any): Additional parameters passed to :meth:`PIL.Image.Image.save`. + + Returns: + pathlib.Path: Path to the created image file. 
""" if isinstance(size, int): size = (size, size) @@ -411,7 +414,9 @@ def create_image_file( ) image = create_image_or_video_tensor(size) - PIL.Image.fromarray(image.permute(2, 1, 0).numpy()).save(pathlib.Path(root) / name) + file = pathlib.Path(root) / name + PIL.Image.fromarray(image.permute(2, 1, 0).numpy()).save(file) + return file def create_image_folder( @@ -421,7 +426,7 @@ def create_image_folder( num_examples: int, size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, **kwargs: Any, -): +) -> List[pathlib.Path]: """Create a folder of random images. Args: @@ -434,6 +439,8 @@ def create_image_folder( between 3 and 10 pixels is selected on a per-image basis. kwargs (Any): Additional parameters passed to :func:`create_image_file`. + Returns: + List[pathlib.Path]: Paths to all created image files. .. seealso:: @@ -449,8 +456,10 @@ def size(idx: int) -> Tuple[int, int, int]: root = pathlib.Path(root) / name os.makedirs(root) - for idx in range(num_examples): + return [ create_image_file(root, file_name_fn(idx), size=size(idx) if callable(size) else size, **kwargs) + for idx in range(num_examples) + ] @requires_pyav @@ -460,7 +469,7 @@ def create_video_file( size: Union[Sequence[int], int] = (1, 3, 10, 10), fps: float = 25, **kwargs: Any, -) -> None: +) -> pathlib.Path: """Create an video file from random data. Args: @@ -472,6 +481,9 @@ def create_video_file( fps (float): Frame rate in frames per second. kwargs (Any): Additional parameters passed to :func:`torchvision.io.write_video`. + Returns: + pathlib.Path: Path to the created image file. + Raises: UsageError: If PyAV is not available. 
""" @@ -487,7 +499,9 @@ def create_video_file( ) video = create_image_or_video_tensor(size) - write_video(str(pathlib.Path(root) / name), video.permute(0, 2, 3, 1), fps, **kwargs) + file = pathlib.Path(root) / name + write_video(str(file), video.permute(0, 2, 3, 1), fps, **kwargs) + return file @requires_pyav @@ -499,7 +513,7 @@ def create_video_folder( size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, fps=25, **kwargs, -): +) -> List[pathlib.Path]: """Create a folder of random videos. Args: @@ -513,6 +527,9 @@ def create_video_folder( fps (float): Frame rate in frames per second. kwargs (Any): Additional parameters passed to :func:`create_video_file`. + Returns: + List[pathlib.Path]: Paths to all created video files. + Raises: UsageError: If PyAV is not available. @@ -533,5 +550,7 @@ def size(idx): root = pathlib.Path(root) / name os.makedirs(root) - for idx in range(num_examples): + return [ create_video_file(root, file_name_fn(idx), size=size(idx) if callable(size) else size) + for idx in range(num_examples) + ] From 77fa7168cb11c4a10e6630a37bb721e49ad0449f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Feb 2021 15:50:19 +0100 Subject: [PATCH 19/29] [test] close PIL file handles before deletion --- test/datasets_utils.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 7b2757cfa45..015a91a2672 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -367,6 +367,42 @@ class ImageDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (PIL.Image.Image, int) + @contextlib.contextmanager + def create_dataset( + self, + config: Optional[Dict[str, Any]] = None, + inject_fake_data: bool = True, + disable_download_extract: Optional[bool] = None, + **kwargs: Any, + ) -> Iterator[Tuple[torchvision.datasets.VisionDataset, Dict[str, Any]]]: + with super().create_dataset( + config=config, + inject_fake_data=inject_fake_data, 
+ disable_download_extract=disable_download_extract, + **kwargs, + ) as (dataset, info): + with self._eagerly_load_pil_images(): + yield dataset, info + + @contextlib.contextmanager + def _eagerly_load_pil_images(self): + lazily_opened_files = set() + + open = PIL.Image.open + + def new(fp, *args, **kwargs): + image = open(fp, *args, **kwargs) + if isinstance(fp, (str, pathlib.Path)): + lazily_opened_files.add(image.fp) + return image + + with unittest.mock.patch("torchvision.datasets.caltech.Image.open", new=new): + try: + yield + finally: + for fh in lazily_opened_files: + fh.close() + class VideoDatasetTestCase(DatasetTestCase): """Abstract base class for video dataset testcases. From c2b3b0a7a7febba06ebccce3a8344adee10671b1 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Feb 2021 15:52:52 +0100 Subject: [PATCH 20/29] fix video folder creation --- test/datasets_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 015a91a2672..860aaa44c60 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -581,7 +581,7 @@ def size(idx): # The 'libx264' video codec, which is the default of torchvision.io.write_video, requires the height and # width of the video to be divisible by 2. 
height, width = (torch.randint(2, 6, size=(2,), dtype=torch.int) * 2).tolist() - return (length, num_channels, height, width) + return (num_frames, num_channels, height, width) root = pathlib.Path(root) / name os.makedirs(root) From 6f05ca06207dd2ed017618aa7d422b814afa423e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Feb 2021 16:27:51 +0100 Subject: [PATCH 21/29] generalize file handle closing --- test/datasets_utils.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 860aaa44c60..b49f9d9d503 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -9,6 +9,7 @@ import unittest.mock from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union +import PIL import PIL.Image import torch @@ -381,11 +382,32 @@ def create_dataset( disable_download_extract=disable_download_extract, **kwargs, ) as (dataset, info): - with self._eagerly_load_pil_images(): + # PIL.Image.open() only loads the image meta data upfront and keeps the file open until the first access + # to the pixel data occurs. Trying to delete such a file results in an PermissionError on Windows. Thus, we + # track all lazily opened images and close the file handle before the file is deleted. + # This problem only occurs during testing since some tests, e.g. DatasetTestCase.test_feature_types open an + # image, but never use the underlying data. During normal operation it is reasonable to assume that the + # user wants to work with the image he just opened rather than deleting the underlying file. 
+ with self._close_image_handles(): yield dataset, info @contextlib.contextmanager - def _eagerly_load_pil_images(self): + def _close_image_handles(self): + module = inspect.getmodule(self.DATASET_CLASS) + + def resolve_patch_object(): + with contextlib.suppress(StopIteration): + return next(name for name, attr in vars(module).items() if attr is PIL.Image) + + with contextlib.suppress(StopIteration): + name = next(name for name, attr in vars(module).items() if attr is PIL) + return f"{name}.Image" + + obj = resolve_patch_object() + if not obj: + yield + return + lazily_opened_files = set() open = PIL.Image.open @@ -396,7 +418,7 @@ def new(fp, *args, **kwargs): lazily_opened_files.add(image.fp) return image - with unittest.mock.patch("torchvision.datasets.caltech.Image.open", new=new): + with unittest.mock.patch(f"{module.__name__}.{obj}.open", new=new): try: yield finally: From d3f92683fb585d35904ad15c94607cfe3552b9bf Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 17 Feb 2021 14:05:46 +0100 Subject: [PATCH 22/29] fix lazy imports --- test/datasets_utils.py | 74 +++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 19 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index b49f9d9d503..b67ffc8ee22 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -14,21 +14,14 @@ import torch import torchvision.datasets +import torchvision.io from common_utils import get_tmp_dir, disable_console_output -try: - from torchvision.io import write_video - - PYAV_AVAILABLE = True - -except ImportError: - write_video = None - PYAV_AVAILABLE = False - __all__ = [ "UsageError", + "lazy_importer", "test_all_configs", "DatasetTestCase", "ImageDatasetTestCase", @@ -45,15 +38,58 @@ class UsageError(RuntimeError): """Should be raised in case an error happens in the setup rather than the test.""" -def requires_pyav(fn): - @functools.wraps(fn) - def wrapper(*args, **kwargs): - if not PYAV_AVAILABLE: - raise UsageError("PyAV 
(av) is required but not available.") +class LazyImporter: + r"""Lazy importer for additional dependicies. - return fn(*args, **kwargs) + Some datasets require additional packages that are no direct dependencies of torchvision. Instances of this class + provide modules listed in MODULES as attributes. They are only imported when accessed. - return wrapper + """ + MODULES = ( + "av", + "lmdb", + "pandas", + "pycocotools", + "requests", + "scipy.io", + ) + + def __init__(self): + cls = type(self) + for module in self.MODULES: + # We need the quirky module=module argument to the lambda since otherwise the lookup for module in this + # scope happens at runtime rather than at definition. Thus, without it every property would try to import + # the last module in MODULES + setattr(cls, module.split(".", 1)[0], property(lambda self, module=module: LazyImporter._import(module))) + + @staticmethod + def _import(module): + try: + importlib.import_module(module) + return importlib.import_module(module.split(".", 1)[0]) + except ImportError as error: + raise UsageError( + f"Failed to import module '{module}'. " + f"This probably means that the current test case needs '{module}' installed, " + f"but it is not a dependency of torchvision. " + f"You need to install it manually, for example 'pip install {module}'." 
+ ) from error + + +lazy_importer = LazyImporter() + + +def requires_lazy_imports(*modules): + def outer_wrapper(fn): + @functools.wraps(fn) + def inner_wrapper(*args, **kwargs): + for module in modules: + getattr(lazy_importer, module.replace(".", "_")) + return fn(*args, **kwargs) + + return inner_wrapper + + return outer_wrapper # As of Python 3.7 this is provided by contextlib @@ -520,7 +556,7 @@ def size(idx: int) -> Tuple[int, int, int]: ] -@requires_pyav +@requires_lazy_imports("av") def create_video_file( root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], @@ -558,11 +594,11 @@ def create_video_file( video = create_image_or_video_tensor(size) file = pathlib.Path(root) / name - write_video(str(file), video.permute(0, 2, 3, 1), fps, **kwargs) + torchvision.io.write_video(str(file), video.permute(0, 2, 3, 1), fps, **kwargs) return file -@requires_pyav +@requires_lazy_imports("av") def create_video_folder( root: Union[str, pathlib.Path], name: Union[str, pathlib.Path], From e7d1675dfa6e054be4fce681fbc62bd6d4eb193d Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 17 Feb 2021 14:33:41 +0100 Subject: [PATCH 23/29] add test for transforms --- test/datasets_utils.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index b67ffc8ee22..e47c983ea33 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -167,10 +167,11 @@ class DatasetTestCase(unittest.TestCase): 1. the dataset raises a ``RuntimeError`` if the data files are not found, 2. the dataset inherits from `torchvision.datasets.VisionDataset`, 3. the dataset can be turned into a string, - 4. the feature types of a returned example matches ``FEATURE_TYPES``, and - 5. the number of examples matches the injected fake data. + 4. the feature types of a returned example matches ``FEATURE_TYPES``, + 5. the number of examples matches the injected fake data, and + 6. 
the dataset calls ``transform``, ``target_transform``, or ``transforms`` if available when accessing data. - Case 3., 4., and 5. are tested against all configurations in ``CONFIGS``. + Case 3. to 6. are tested against all configurations in ``CONFIGS``. To add dataset-specific tests, create a new method that takes no arguments with ``test_`` as a name prefix: @@ -204,10 +205,13 @@ def test_baz(self): CONFIGS = None REQUIRED_PACKAGES = None - _SPECIAL_KWARGS = { + _TRANSFORM_KWARGS = { "transform", "target_transform", "transforms", + } + _SPECIAL_KWARGS = { + *_TRANSFORM_KWARGS, "download", } _HAS_SPECIAL_KWARG = None @@ -307,7 +311,7 @@ def _verify_required_public_class_attributes(cls): @classmethod def _populate_private_class_attributes(cls): argspec = inspect.getfullargspec(cls.DATASET_CLASS.__init__) - cls._HAS_SPECIAL_KWARG = {name: name in argspec.args for name in cls._SPECIAL_KWARGS} + cls._HAS_SPECIAL_KWARG = {name for name in cls._SPECIAL_KWARGS if name in argspec.args} @classmethod def _process_optional_public_class_attributes(cls): @@ -337,7 +341,7 @@ def _split_kwargs(self, kwargs): @contextlib.contextmanager def _disable_download_extract(self, special_kwargs): - inject_download_kwarg = self._HAS_SPECIAL_KWARG["download"] and "download" not in special_kwargs + inject_download_kwarg = "download" in self._HAS_SPECIAL_KWARG and "download" not in special_kwargs if inject_download_kwarg: special_kwargs["download"] = False @@ -395,6 +399,21 @@ def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): self.assertEqual(len(dataset), info["num_examples"]) + @test_all_configs + def test_transforms(self, config): + mock = unittest.mock.Mock(wraps=lambda *args: args[0] if len(args) == 1 else args) + for kwarg in self._TRANSFORM_KWARGS: + if not kwarg in self._HAS_SPECIAL_KWARG: + continue + + mock.reset_mock() + + with self.subTest(kwarg=kwarg): + with self.create_dataset(config, **{kwarg: mock}) as (dataset, _): + dataset[0] + + 
mock.assert_called() + class ImageDatasetTestCase(DatasetTestCase): """Abstract base class for image dataset testcases. From 06625331e1fca2326c1757cc48e0f5f9401a07de Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 17 Feb 2021 14:35:42 +0100 Subject: [PATCH 24/29] fix explanation comment --- test/datasets_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index e47c983ea33..9b462976d76 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -57,9 +57,9 @@ class LazyImporter: def __init__(self): cls = type(self) for module in self.MODULES: - # We need the quirky module=module argument to the lambda since otherwise the lookup for module in this - # scope happens at runtime rather than at definition. Thus, without it every property would try to import - # the last module in MODULES + # We need the quirky 'module=module' argument to the lambda since otherwise the lookup for 'module' in this + # scope would happen at runtime rather than at definition. Thus, without it, every property would try to + # import the last 'module' in MODULES. 
setattr(cls, module.split(".", 1)[0], property(lambda self, module=module: LazyImporter._import(module))) @staticmethod From 9773089541aebcd34c097bf9867cd6cfd300c319 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 17 Feb 2021 14:38:00 +0100 Subject: [PATCH 25/29] lint --- test/datasets_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 9b462976d76..8da2b9b63e7 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -403,7 +403,7 @@ def test_num_examples(self, config): def test_transforms(self, config): mock = unittest.mock.Mock(wraps=lambda *args: args[0] if len(args) == 1 else args) for kwarg in self._TRANSFORM_KWARGS: - if not kwarg in self._HAS_SPECIAL_KWARG: + if kwarg not in self._HAS_SPECIAL_KWARG: continue mock.reset_mock() From c517a5a2ba32a2fa031a787fc771abde5876f11d Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 17 Feb 2021 15:49:14 +0100 Subject: [PATCH 26/29] force load opened PIL images --- test/datasets_utils.py | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 8da2b9b63e7..46df015cbbd 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -439,46 +439,25 @@ def create_dataset( ) as (dataset, info): # PIL.Image.open() only loads the image meta data upfront and keeps the file open until the first access # to the pixel data occurs. Trying to delete such a file results in an PermissionError on Windows. Thus, we - # track all lazily opened images and close the file handle before the file is deleted. + # force-load opened images. # This problem only occurs during testing since some tests, e.g. DatasetTestCase.test_feature_types open an # image, but never use the underlying data. During normal operation it is reasonable to assume that the # user wants to work with the image he just opened rather than deleting the underlying file. 
- with self._close_image_handles(): + with self._force_load_images(): yield dataset, info @contextlib.contextmanager - def _close_image_handles(self): - module = inspect.getmodule(self.DATASET_CLASS) - - def resolve_patch_object(): - with contextlib.suppress(StopIteration): - return next(name for name, attr in vars(module).items() if attr is PIL.Image) - - with contextlib.suppress(StopIteration): - name = next(name for name, attr in vars(module).items() if attr is PIL) - return f"{name}.Image" - - obj = resolve_patch_object() - if not obj: - yield - return - - lazily_opened_files = set() - + def _force_load_images(self): open = PIL.Image.open def new(fp, *args, **kwargs): image = open(fp, *args, **kwargs) if isinstance(fp, (str, pathlib.Path)): - lazily_opened_files.add(image.fp) + image.load() return image - with unittest.mock.patch(f"{module.__name__}.{obj}.open", new=new): - try: - yield - finally: - for fh in lazily_opened_files: - fh.close() + with unittest.mock.patch(f"PIL.Image.open", new=new): + yield class VideoDatasetTestCase(DatasetTestCase): From 4c1ff7ce1cd2ff461c6f56ef596a39147fd1b8b3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 17 Feb 2021 15:55:35 +0100 Subject: [PATCH 27/29] lint --- test/datasets_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 46df015cbbd..9450488646f 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -456,7 +456,7 @@ def new(fp, *args, **kwargs): image.load() return image - with unittest.mock.patch(f"PIL.Image.open", new=new): + with unittest.mock.patch("PIL.Image.open", new=new): yield From 170f700a17a0ca06cff40dc59b04ba1f607542d4 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 17 Feb 2021 17:43:46 +0100 Subject: [PATCH 28/29] copy default config to avoid inplace modification --- test/datasets_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py 
index 9450488646f..88c167f2771 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -266,7 +266,7 @@ def create_dataset( for details. """ if config is None: - config = self.CONFIGS[0] + config = self.CONFIGS[0].copy() special_kwargs, other_kwargs = self._split_kwargs(kwargs) config.update(other_kwargs) From aba3ee005ea8ca976a56ad4b0d3bb86a1772bfe8 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 17 Feb 2021 17:57:45 +0100 Subject: [PATCH 29/29] enable additional arg forwarding --- test/datasets_utils.py | 57 +++++++++++++++++++++++++++++++++--------- test/test_datasets.py | 20 +++++++-------- 2 files changed, 55 insertions(+), 22 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 88c167f2771..aa3e3f61be3 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -1,3 +1,4 @@ +import collections.abc import contextlib import functools import importlib @@ -227,16 +228,27 @@ def test_baz(self): "download_and_extract_archive", } - def inject_fake_data(self, root: str, config: Dict[str, Any]) -> Dict[str, Any]: - """Inject fake data into the root of the dataset. + def inject_fake_data( + self, tmpdir: str, config: Dict[str, Any] + ) -> Union[int, Dict[str, Any], Tuple[Sequence[Any], Union[int, Dict[str, Any]]]]: + """Inject fake data for dataset into a temporary directory. Args: - root (str): Root of the dataset. + tmpdir (str): Path to a temporary directory. For most cases this acts as root directory for the dataset + to be created and in turn also for the fake data injected here. config (Dict[str, Any]): Configuration that will be used to create the dataset. - Returns: - info (Dict[str, Any]): Additional information about the injected fake data. Must contain the field - ``"num_examples"`` that corresponds to the length of the dataset to be created. + Needs to return one of the following: + + 1. (int): Number of examples in the dataset to be created, + 2. 
(Dict[str, Any]): Additional information about the injected fake data. Must contain the field + ``"num_examples"`` that corresponds to the number of examples in the dataset to be created, or + 3. (Tuple[Sequence[Any], Union[int, Dict[str, Any]]]): Additional required parameters that are passed to + the dataset constructor. The second element corresponds to cases 1. and 2. + + If no ``args`` is returned (case 1. and 2.), the ``tmp_dir`` is passed as first parameter to the dataset + constructor. In most cases this corresponds to ``root``. If the dataset has more parameters without default + values you need to explicitly pass them as explained in case 3. """ raise NotImplementedError("You need to provide fake data in order for the tests to run.") @@ -274,17 +286,38 @@ def create_dataset( if disable_download_extract is None: disable_download_extract = inject_fake_data - with get_tmp_dir() as root: - info = self.inject_fake_data(root, config) if inject_fake_data else None - if info is None or "num_examples" not in info: + with get_tmp_dir() as tmpdir: + output = self.inject_fake_data(tmpdir, config) if inject_fake_data else None + if output is None: + raise UsageError( + "The method 'inject_fake_data' needs to return at least an integer indicating the number of " + "examples for the current configuration." + ) + + if isinstance(output, collections.abc.Sequence) and len(output) == 2: + args, info = output + else: + args = (tmpdir,) + info = output + + if isinstance(info, int): + info = dict(num_examples=info) + elif isinstance(info, dict): + if "num_examples" not in info: + raise UsageError( + "The information dictionary returned by the method 'inject_fake_data' must contain a " + "'num_examples' field that holds the number of examples for the current configuration." + ) + else: raise UsageError( - "The method 'inject_fake_data' needs to return a dictionary that contains at least a " - "'num_examples' field." 
+ f"The additional information returned by the method 'inject_fake_data' must be either an integer " + f"indicating the number of examples for the current configuration or a dictionary with the " + f"same content. Got {type(info)} instead." ) cm = self._disable_download_extract if disable_download_extract else nullcontext with cm(special_kwargs), disable_console_output(): - dataset = self.DATASET_CLASS(root, **config, **special_kwargs) + dataset = self.DATASET_CLASS(*args, **config, **special_kwargs) yield dataset, info diff --git a/test/test_datasets.py b/test/test_datasets.py index 11b3f90db2a..8ec5be7de19 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -473,21 +473,21 @@ def test_repr_smoke(self): class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech256 - def inject_fake_data(self, root, config): - root = pathlib.Path(root) / "caltech256" / "256_ObjectCategories" + def inject_fake_data(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) num_images_per_category = 2 for idx, category in categories: datasets_utils.create_image_folder( - root, + tmpdir, name=f"{idx:03d}.{category}", - file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx:04d}.jpg", + file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg", num_examples=num_images_per_category, ) - return dict(num_examples=num_images_per_category * len(categories)) + return num_images_per_category * len(categories) class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): @@ -504,15 +504,15 @@ class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): categories_key="label_names", ) - def inject_fake_data(self, root, config): - root = pathlib.Path(root) / self._VERSION_CONFIG["base_folder"] - os.makedirs(root) + def inject_fake_data(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) / self._VERSION_CONFIG["base_folder"] + 
os.makedirs(tmpdir) num_images_per_file = 1 for name in itertools.chain(self._VERSION_CONFIG["train_files"], self._VERSION_CONFIG["test_files"]): - self._create_batch_file(root, name, num_images_per_file) + self._create_batch_file(tmpdir, name, num_images_per_file) - categories = self._create_meta_file(root) + categories = self._create_meta_file(tmpdir) return dict( num_examples=num_images_per_file