Skip to content

Skip download of dataset if raw data exists #3555

@crcrpar

Description

@crcrpar

🚀 Feature

Skip downloading the raw dataset if it already exists, and just extract and convert it.

Motivation

Reduce dataset downloads as much as possible.

Pitch

This would be neither a new feature nor a big change.
Currently, if the download argument is True, some torchvision datasets (e.g. MNIST) download the raw data before extracting and converting it to the appropriate format, e.g. (train/test).pt for the MNIST dataset.

I had misunderstood this option: I assumed that if a raw dataset already exists, torchvision skips downloading the raw data and only extracts and converts it. In fact, the criterion for skipping the download is whether the processed PyTorch files exist, as shown below.

def _check_exists(self) -> bool:
return (os.path.exists(os.path.join(self.processed_folder,
self.training_file)) and
os.path.exists(os.path.join(self.processed_folder,
self.test_file)))

The diff below is a sketch of my proposal (not thoroughly validated).

diff --git a/torchvision/datasets/mnist.py b/torchvision/datasets/mnist.py
index e87cd46e..bfd59914 100644
--- a/torchvision/datasets/mnist.py
+++ b/torchvision/datasets/mnist.py
@@ -131,6 +131,11 @@ class MNIST(VisionDataset):
     def class_to_idx(self) -> Dict[str, int]:
         return {_class: i for i, _class in enumerate(self.classes)}
 
+    def _check_raw_data_exists(self) -> bool:
+        return all([
+            os.path.exists(os.path.join(self.raw_folder, filename)) for (filename, _) in self.resources
+        ])
+
     def _check_exists(self) -> bool:
         return (os.path.exists(os.path.join(self.processed_folder,
                                             self.training_file)) and
@@ -140,34 +145,36 @@ class MNIST(VisionDataset):
     def download(self) -> None:
         """Download the MNIST data if it doesn't exist in processed_folder already."""
 
+        if not self._check_raw_data_exists():
+            print('Dowanloading raw MNIST...')
+            os.makedirs(self.raw_folder, exist_ok=True)
+            os.makedirs(self.processed_folder, exist_ok=True)
+
+            # download files
+            for filename, md5 in self.resources:
+                for mirror in self.mirrors:
+                    url = "{}{}".format(mirror, filename)
+                    try:
+                        print("Downloading {}".format(url))
+                        download_and_extract_archive(
+                            url, download_root=self.raw_folder,
+                            filename=filename,
+                            md5=md5
+                        )
+                    except URLError as error:
+                        print(
+                            "Failed to download (trying next):\n{}".format(error)
+                        )
+                        continue
+                    finally:
+                        print()
+                    break
+                else:
+                    raise RuntimeError("Error downloading {}".format(filename))
+
         if self._check_exists():
             return
 
-        os.makedirs(self.raw_folder, exist_ok=True)
-        os.makedirs(self.processed_folder, exist_ok=True)
-
-        # download files
-        for filename, md5 in self.resources:
-            for mirror in self.mirrors:
-                url = "{}{}".format(mirror, filename)
-                try:
-                    print("Downloading {}".format(url))
-                    download_and_extract_archive(
-                        url, download_root=self.raw_folder,
-                        filename=filename,
-                        md5=md5
-                    )
-                except URLError as error:
-                    print(
-                        "Failed to download (trying next):\n{}".format(error)
-                    )
-                    continue
-                finally:
-                    print()
-                break
-            else:
-                raise RuntimeError("Error downloading {}".format(filename))
-
         # process and save as torch files
         print('Processing...')

Alternatives

Additional context

cc @pmeier

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions