Skip to content

Commit 927e462

Browse files
committed
with urllib.
1 parent 236b587 commit 927e462

File tree

3 files changed

+38
-69
lines changed

3 files changed

+38
-69
lines changed

requirements.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,3 @@ pytest
1818

1919
# Testing only Py3 compat
2020
backports.tempfile
21-
22-
requests

test/test_datasets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33

44
from torchaudio.datasets.commonvoice import COMMONVOICE
55
from torchaudio.datasets.librispeech import LIBRISPEECH
6-
from torchaudio.datasets.utils import DiskCache
76
from torchaudio.datasets.vctk import VCTK
87
from torchaudio.datasets.yesno import YESNO
8+
from torchaudio.datasets.utils import DiskCache, download_url
99

1010

1111
class TestDatasets(unittest.TestCase):

torchaudio/datasets/utils.py

Lines changed: 37 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import tarfile
99
import zipfile
1010

11-
import requests
1211
import six
1312
import torch
1413
import torchaudio
@@ -66,44 +65,6 @@ def makedir_exist_ok(dirpath):
6665
raise
6766

6867

69-
def download_url_resume(url, download_folder, resume_byte_pos=None):
70-
"""Download url to disk with possible resumption.
71-
72-
Args:
73-
url (str): Url.
74-
download_folder (str): Folder to download file.
75-
resume_byte_pos (int): Position of byte from where to resume the download.
76-
"""
77-
# Get size of file
78-
r = requests.head(url)
79-
file_size = int(r.headers.get("content-length", 0))
80-
81-
# Append information to resume download at specific byte position to header
82-
resume_header = (
83-
{"Range": "bytes={}-".format(resume_byte_pos)} if resume_byte_pos else None
84-
)
85-
86-
# Establish connection
87-
r = requests.get(url, stream=True, headers=resume_header)
88-
89-
# Set configuration
90-
n_block = 32
91-
block_size = 1024
92-
initial_pos = resume_byte_pos if resume_byte_pos else 0
93-
mode = "ab" if resume_byte_pos else "wb"
94-
95-
filename = os.path.basename(url)
96-
filepath = os.path.join(download_folder, os.path.basename(url))
97-
98-
with open(filepath, mode) as f:
99-
with tqdm(
100-
unit="B", unit_scale=True, unit_divisor=1024, total=file_size
101-
) as pbar:
102-
for chunk in r.iter_content(n_block * block_size):
103-
f.write(chunk)
104-
pbar.update(len(chunk))
105-
106-
10768
def download_url(url, download_folder, hash_value=None, hash_type="sha256"):
10869
"""Execute the correct download operation.
10970
Depending on the size of the file online and offline, resume the
@@ -115,48 +76,58 @@ def download_url(url, download_folder, hash_value=None, hash_type="sha256"):
11576
hash_value (str): Hash for url.
11677
hash_type (str): Hash type.
11778
"""
118-
# Establish connection to header of file
119-
r = requests.head(url)
12079

121-
# Get filesize of online and offline file
122-
file_size_online = int(r.headers.get("content-length", 0))
12380
filepath = os.path.join(download_folder, os.path.basename(url))
12481

82+
req = urllib.request.Request(url)
12583
if os.path.exists(filepath):
126-
file_size_offline = os.path.getsize(filepath)
127-
128-
if file_size_online != file_size_offline:
129-
# Resume download
130-
print("File {} is incomplete. Resume download.".format(filepath))
131-
download_url_resume(url, download_folder, file_size_offline)
132-
elif hash_value:
133-
if validate_download_url(url, download_folder, hash_value, hash_type):
134-
print("File {} is validated. Skip download.".format(filepath))
135-
else:
136-
print(
137-
"File {} is corrupt. Delete it manually and retry.".format(filepath)
138-
)
139-
else:
140-
# Skip download
141-
print("File {} is complete. Skip download.".format(filepath))
84+
mode = "ab"
85+
local_size = os.path.getsize(filepath)
86+
87+
# If the file exists, then download only the remainder
88+
req.headers["Range"] = "bytes={}-".format(local_size)
14289
else:
143-
# Start download
144-
print("File {} has not been downloaded. Start download.".format(filepath))
145-
download_url_resume(url, download_folder)
90+
mode = "wb"
91+
local_size = 0
92+
93+
# If we already have the whole file, there is no need to download it again
94+
url_size = int(urllib.request.urlopen(url).info().get("Content-Length", -1))
95+
if url_size == local_size:
96+
if hash_value and not validate_download_url(filepath, hash_value, hash_type):
97+
raise RuntimeError(
98+
"The hash of {} does not match. Delete the file manually and retry.".format(
99+
filepath
100+
)
101+
)
102+
103+
return
146104

105+
with open(filepath, mode) as fpointer, urllib.request.urlopen(
106+
req
107+
) as upointer, tqdm(
108+
unit="B", unit_scale=True, unit_divisor=1024, total=url_size
109+
) as pbar:
147110

148-
def validate_download_url(url, download_folder, hash_value, hash_type="sha256"):
111+
num_bytes = 0
112+
block_size = 32 * 1024
113+
while True:
114+
chunk = upointer.read(block_size)
115+
if not chunk:
116+
break
117+
fpointer.write(chunk)
118+
num_bytes += len(chunk)
119+
pbar.update(len(chunk))
120+
121+
122+
def validate_download_url(filepath, hash_value, hash_type="sha256"):
149123
"""Validate a given file with its hash.
150-
The downloaded file is hashed and compared to a pre-registered
151-
hash value to validate the download procedure.
152124
153125
Args:
154126
filepath (str): Path to the file to validate.
155127
156128
hash_value (str): Hash for url.
157129
hash_type (str): Hash type.
158130
"""
159-
filepath = os.path.join(download_folder, os.path.basename(url))
160131

161132
if hash_type == "sha256":
162133
sha = hashlib.sha256()

0 commit comments

Comments
 (0)