From 1d6360133b2e01b26441662087475740f479c579 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 29 Oct 2019 12:52:34 -0400 Subject: [PATCH 01/12] resume download, validate with md5 or sha256. --- requirements.txt | 2 + torchaudio/datasets/utils.py | 140 +++++++++++++++++++++++++---------- 2 files changed, 103 insertions(+), 39 deletions(-) diff --git a/requirements.txt b/requirements.txt index 686fa2d996..76e4acd48a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,5 @@ pytest # Testing only Py3 compat backports.tempfile + +requests diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index f3daebcd12..c0ac326258 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -10,9 +10,11 @@ import zipfile from queue import Queue +import requests import six import torch import torchaudio +from six.moves import urllib from torch.utils.data import Dataset from torch.utils.model_zoo import tqdm @@ -53,18 +55,6 @@ def unicode_csv_reader(unicode_csv_data, **kwargs): yield line -def gen_bar_updater(): - pbar = tqdm(total=None) - - def bar_update(count, block_size, total_size): - if pbar.total is None and total_size: - pbar.total = total_size - progress_bytes = count * block_size - pbar.update(progress_bytes - pbar.n) - - return bar_update - - def makedir_exist_ok(dirpath): """ Python2 support for os.makedirs(.., exist_ok=True) @@ -78,41 +68,113 @@ def makedir_exist_ok(dirpath): raise -def download_url(url, root, filename=None, md5=None): - """Download a file from a url and place it in root. +def download_url_resume(url, download_folder, resume_byte_pos=None): + """Download url to disk with possible resumption. Args: - url (str): URL to download file from - root (str): Directory to place downloaded file in - filename (str, optional): Name to save the file under. If None, use the basename of the URL - md5 (str, optional): MD5 checksum of the download. If None, do not check + url (str): Url. 
+ download_folder (str): Folder to download file. + resume_byte_pos (int): Position of byte from where to resume the download. """ - from six.moves import urllib + # Get size of file + r = requests.head(url) + file_size = int(r.headers.get("content-length", 0)) - root = os.path.expanduser(root) - if not filename: - filename = os.path.basename(url) - fpath = os.path.join(root, filename) + # Append information to resume download at specific byte position to header + resume_header = ( + {"Range": "bytes={}-".format(resume_byte_pos)} if resume_byte_pos else None + ) - makedir_exist_ok(root) + # Establish connection + r = requests.get(url, stream=True, headers=resume_header) - # downloads file - if os.path.isfile(fpath): - print("Using downloaded file: " + fpath) - else: - try: - print("Downloading " + url + " to " + fpath) - urllib.request.urlretrieve(url, fpath, reporthook=gen_bar_updater()) - except (urllib.error.URLError, IOError) as e: - if url[:5] == "https": - url = url.replace("https:", "http:") + # Set configuration + n_block = 32 + block_size = 1024 + initial_pos = resume_byte_pos if resume_byte_pos else 0 + mode = "ab" if resume_byte_pos else "wb" + + filename = os.path.basename(url) + filepath = os.path.join(download_folder, os.path.basename(url)) + + with open(filepath, mode) as f: + with tqdm( + unit="B", unit_scale=True, unit_divisor=1024, total=file_size + ) as pbar: + for chunk in r.iter_content(n_block * block_size): + f.write(chunk) + pbar.update(len(chunk)) + + +def download_url(url, download_folder, hash_value=None, hash_type="sha256"): + """Execute the correct download operation. + Depending on the size of the file online and offline, resume the + download if the file offline is smaller than online. + + Args: + url (str): Url. + download_folder (str): Folder to download file. + hash_value (str): Hash for url. + hash_type (str): Hash type. 
+ """ + # Establish connection to header of file + r = requests.head(url) + + # Get filesize of online and offline file + file_size_online = int(r.headers.get("content-length", 0)) + filepath = os.path.join(download_folder, os.path.basename(url)) + + if os.path.exists(filepath): + file_size_offline = os.path.getsize(filepath) + + if file_size_online != file_size_offline: + # Resume download + print("File {} is incomplete. Resume download.".format(filepath)) + download_url_resume(url, download_folder, file_size_offline) + elif hash_value: + if validate_download_url(url, download_folder, hash_value, hash_type): + print("File {} is validated. Skip download.".format(filepath)) + else: print( - "Failed download. Trying https -> http instead." - " Downloading " + url + " to " + fpath + "File {} is corrupt. Delete it manually and retry.".format(filepath) ) - urllib.request.urlretrieve(url, fpath, reporthook=gen_bar_updater()) - else: - raise e + else: + # Skip download + print("File {} is complete. Skip download.".format(filepath)) + else: + # Start download + print("File {} has not been downloaded. Start download.".format(filepath)) + download_url_resume(url, download_folder) + + +def validate_download_url(url, download_folder, hash_value, hash_type="sha256"): + """Validate a given file with its hash. + The downloaded file is hashed and compared to a pre-registered + has value to validate the download procedure. + + Args: + url (str): Url. + download_folder (str): Folder to download file. + hash_value (str): Hash for url. + hash_type (str): Hash type. 
+ """ + filepath = os.path.join(download_folder, os.path.basename(url)) + + if hash_type == "sha256": + sha = hashlib.sha256() + elif hash_type == "md5": + sha = hashlib.md5() + else: + raise ValueError + + with open(filepath, "rb") as f: + while True: + chunk = f.read(1000 * 1000) # 1MB so that memory is not exhausted + if not chunk: + break + sha.update(chunk) + + return sha.hexdigest() == hash_value def extract_archive(from_path, to_path=None, overwrite=False): From 9e7d84391fbfec7daf2ff6f040476b08922870ee Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 29 Oct 2019 16:01:21 -0400 Subject: [PATCH 02/12] with urllib. --- requirements.txt | 2 - torchaudio/datasets/utils.py | 103 +++++++++++++---------------------- 2 files changed, 37 insertions(+), 68 deletions(-) diff --git a/requirements.txt b/requirements.txt index 76e4acd48a..686fa2d996 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,5 +18,3 @@ pytest # Testing only Py3 compat backports.tempfile - -requests diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index c0ac326258..d27d1f9129 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -10,7 +10,6 @@ import zipfile from queue import Queue -import requests import six import torch import torchaudio @@ -68,44 +67,6 @@ def makedir_exist_ok(dirpath): raise -def download_url_resume(url, download_folder, resume_byte_pos=None): - """Download url to disk with possible resumption. - - Args: - url (str): Url. - download_folder (str): Folder to download file. - resume_byte_pos (int): Position of byte from where to resume the download. 
- """ - # Get size of file - r = requests.head(url) - file_size = int(r.headers.get("content-length", 0)) - - # Append information to resume download at specific byte position to header - resume_header = ( - {"Range": "bytes={}-".format(resume_byte_pos)} if resume_byte_pos else None - ) - - # Establish connection - r = requests.get(url, stream=True, headers=resume_header) - - # Set configuration - n_block = 32 - block_size = 1024 - initial_pos = resume_byte_pos if resume_byte_pos else 0 - mode = "ab" if resume_byte_pos else "wb" - - filename = os.path.basename(url) - filepath = os.path.join(download_folder, os.path.basename(url)) - - with open(filepath, mode) as f: - with tqdm( - unit="B", unit_scale=True, unit_divisor=1024, total=file_size - ) as pbar: - for chunk in r.iter_content(n_block * block_size): - f.write(chunk) - pbar.update(len(chunk)) - - def download_url(url, download_folder, hash_value=None, hash_type="sha256"): """Execute the correct download operation. Depending on the size of the file online and offline, resume the @@ -117,40 +78,51 @@ def download_url(url, download_folder, hash_value=None, hash_type="sha256"): hash_value (str): Hash for url. hash_type (str): Hash type. """ - # Establish connection to header of file - r = requests.head(url) - # Get filesize of online and offline file - file_size_online = int(r.headers.get("content-length", 0)) filepath = os.path.join(download_folder, os.path.basename(url)) + req = urllib.request.Request(url) if os.path.exists(filepath): - file_size_offline = os.path.getsize(filepath) - - if file_size_online != file_size_offline: - # Resume download - print("File {} is incomplete. Resume download.".format(filepath)) - download_url_resume(url, download_folder, file_size_offline) - elif hash_value: - if validate_download_url(url, download_folder, hash_value, hash_type): - print("File {} is validated. Skip download.".format(filepath)) - else: - print( - "File {} is corrupt. 
Delete it manually and retry.".format(filepath) - ) - else: - # Skip download - print("File {} is complete. Skip download.".format(filepath)) + mode = "ab" + local_size = os.path.getsize(filepath) + + # If the file exists, then download only the remainder + req.headers["Range"] = "bytes={}-".format(local_size) else: - # Start download - print("File {} has not been downloaded. Start download.".format(filepath)) - download_url_resume(url, download_folder) + mode = "wb" + local_size = 0 + + # If we already have the whole file, there is no need to download it again + url_size = int(urllib.request.urlopen(url).info().get("Content-Length", -1)) + if url_size == local_size: + if hash_value and not validate_download_url(filepath, hash_value, hash_type): + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath + ) + ) + + return + with open(filepath, mode) as fpointer, urllib.request.urlopen( + req + ) as upointer, tqdm( + unit="B", unit_scale=True, unit_divisor=1024, total=url_size + ) as pbar: -def validate_download_url(url, download_folder, hash_value, hash_type="sha256"): + num_bytes = 0 + block_size = 32 * 1024 + while True: + chunk = upointer.read(block_size) + if not chunk: + break + fpointer.write(chunk) + num_bytes += len(chunk) + pbar.update(len(chunk)) + + +def validate_download_url(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. - The downloaded file is hashed and compared to a pre-registered - has value to validate the download procedure. Args: url (str): Url. @@ -158,7 +130,6 @@ def validate_download_url(url, download_folder, hash_value, hash_type="sha256"): hash_value (str): Hash for url. hash_type (str): Hash type. 
""" - filepath = os.path.join(download_folder, os.path.basename(url)) if hash_type == "sha256": sha = hashlib.sha256() From 46e92fe2633a1f3de1c316ce30a5785de61ea030 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 12:08:57 -0400 Subject: [PATCH 03/12] split stream from saving. detect filename. --- torchaudio/datasets/utils.py | 106 ++++++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index d27d1f9129..4018e87b0e 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -67,58 +67,96 @@ def makedir_exist_ok(dirpath): raise -def download_url(url, download_folder, hash_value=None, hash_type="sha256"): - """Execute the correct download operation. - Depending on the size of the file online and offline, resume the - download if the file offline is smaller than online. +def stream_url(url, start_byte=None, block_size=32 * 1024, progress_bar=True): + """Stream url by chunk + + Args: + url (str): Url. + start_byte (Optional[int]): Start streaming at that point. + block_size (int): Size of chunks to stream. + progress_bar (bool): Display a progress bar. 
+ """ + + # If we already have the whole file, there is no need to download it again + req = urllib.request.Request(url, method="HEAD") + url_size = int(urllib.request.urlopen(req).info().get("Content-Length", -1)) + if url_size == start_byte: + raise StopIteration + + req = urllib.request.Request(url) + if start_byte: + req.headers["Range"] = "bytes={}-".format(start_byte) + + with urllib.request.urlopen(req) as upointer, tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=url_size, + disable=not progress_bar, + ) as pbar: + + num_bytes = 0 + while True: + chunk = upointer.read(block_size) + if not chunk: + break + yield chunk + num_bytes += len(chunk) + pbar.update(len(chunk)) + + +def download_url( + url, + download_folder, + filename=None, + hash_value=None, + hash_type="sha256", + progress_bar=True, + resume=False, +): + """Download file to disk. Args: url (str): Url. download_folder (str): Folder to download file. + filename (str): Name of downloaded file. If None, it is inferred from the url. hash_value (str): Hash for url. hash_type (str): Hash type. + progress_bar (bool): Display a progress bar. + resume (bool): Enable resuming download. """ - filepath = os.path.join(download_folder, os.path.basename(url)) + # Detect filename + if filename is None: + req = urllib.request.Request(url, method="HEAD") + filename = urllib.request.urlopen( + req + ).info().get_filename() or os.path.basename(url) - req = urllib.request.Request(url) - if os.path.exists(filepath): + filepath = os.path.join(download_folder, filename) + + if resume and os.path.exists(filepath): mode = "ab" local_size = os.path.getsize(filepath) - - # If the file exists, then download only the remainder - req.headers["Range"] = "bytes={}-".format(local_size) + elif resume and os.path.exists(filepath): + raise RuntimeError( + "{} already exists. 
Delete the file manually and retry.".format(filepath) + ) else: mode = "wb" - local_size = 0 + local_size = None # If we already have the whole file, there is no need to download it again - url_size = int(urllib.request.urlopen(url).info().get("Content-Length", -1)) - if url_size == local_size: - if hash_value and not validate_download_url(filepath, hash_value, hash_type): - raise RuntimeError( - "The hash of {} does not match. Delete the file manually and retry.".format( - filepath - ) + if hash_value and not validate_file(filepath, hash_value, hash_type): + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath ) + ) - return - - with open(filepath, mode) as fpointer, urllib.request.urlopen( - req - ) as upointer, tqdm( - unit="B", unit_scale=True, unit_divisor=1024, total=url_size - ) as pbar: - - num_bytes = 0 - block_size = 32 * 1024 - while True: - chunk = upointer.read(block_size) - if not chunk: - break + with open(filepath, mode) as fpointer: + for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): fpointer.write(chunk) - num_bytes += len(chunk) - pbar.update(len(chunk)) def validate_download_url(filepath, hash_value, hash_type="sha256"): From 0ad22e28c17f928d80c7a6970fffbd9a367fd89d Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 12:09:15 -0400 Subject: [PATCH 04/12] not specific to url. --- torchaudio/datasets/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 4018e87b0e..58b12f9458 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -159,31 +159,31 @@ def download_url( fpointer.write(chunk) -def validate_download_url(filepath, hash_value, hash_type="sha256"): +def validate_file(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. Args: - url (str): Url. 
- download_folder (str): Folder to download file. + filepath (str): File to read. hash_value (str): Hash for url. hash_type (str): Hash type. """ if hash_type == "sha256": - sha = hashlib.sha256() + hash_func = hashlib.sha256() elif hash_type == "md5": - sha = hashlib.md5() + hash_func = hashlib.md5() else: raise ValueError with open(filepath, "rb") as f: while True: - chunk = f.read(1000 * 1000) # 1MB so that memory is not exhausted + # Read by chunk to avoid filling memory + chunk = f.read(1024 ** 2) if not chunk: break - sha.update(chunk) + hash_func.update(chunk) - return sha.hexdigest() == hash_value + return hash_func.hexdigest() == hash_value def extract_archive(from_path, to_path=None, overwrite=False): From 6d6813b29a8193d3fb23e8507911f1950f58fd2f Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 12:26:01 -0400 Subject: [PATCH 05/12] validate at end too. check file size again. --- torchaudio/datasets/utils.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 58b12f9458..cbce3f9491 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -126,13 +126,11 @@ def download_url( resume (bool): Enable resuming download. 
""" - # Detect filename - if filename is None: - req = urllib.request.Request(url, method="HEAD") - filename = urllib.request.urlopen( - req - ).info().get_filename() or os.path.basename(url) + req = urllib.request.Request(url, method="HEAD") + req_info = urllib.request.urlopen(req).info() + # Detect filename + filename = filename or req_info.get_filename() or os.path.basename(url) filepath = os.path.join(download_folder, filename) if resume and os.path.exists(filepath): @@ -146,7 +144,20 @@ def download_url( mode = "wb" local_size = None - # If we already have the whole file, there is no need to download it again + if hash_value and local_size == int(req_info.get("Content-Length", -1)): + if validate_file(filepath, hash_value, hash_type): + return + else: + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath + ) + ) + + with open(filepath, mode) as fpointer: + for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): + fpointer.write(chunk) + if hash_value and not validate_file(filepath, hash_value, hash_type): raise RuntimeError( "The hash of {} does not match. Delete the file manually and retry.".format( @@ -154,10 +165,6 @@ def download_url( ) ) - with open(filepath, mode) as fpointer: - for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): - fpointer.write(chunk) - def validate_file(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. From fcb5d0a5585b8b3487cc1eaa92a161a6d98d0651 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 16:07:32 -0400 Subject: [PATCH 06/12] futures. 
--- torchaudio/datasets/utils.py | 48 +++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index cbce3f9491..15a20281b8 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -1,6 +1,6 @@ +import concurrent.futures import csv import errno -import gzip import hashlib import logging import os @@ -12,7 +12,6 @@ import six import torch -import torchaudio from six.moves import urllib from torch.utils.data import Dataset from torch.utils.model_zoo import tqdm @@ -81,7 +80,7 @@ def stream_url(url, start_byte=None, block_size=32 * 1024, progress_bar=True): req = urllib.request.Request(url, method="HEAD") url_size = int(urllib.request.urlopen(req).info().get("Content-Length", -1)) if url_size == start_byte: - raise StopIteration + return req = urllib.request.Request(url) if start_byte: @@ -105,7 +104,7 @@ def stream_url(url, start_byte=None, block_size=32 * 1024, progress_bar=True): pbar.update(len(chunk)) -def download_url( +def download_single_url( url, download_folder, filename=None, @@ -147,12 +146,11 @@ def download_url( if hash_value and local_size == int(req_info.get("Content-Length", -1)): if validate_file(filepath, hash_value, hash_type): return - else: - raise RuntimeError( - "The hash of {} does not match. Delete the file manually and retry.".format( - filepath - ) + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath ) + ) with open(filepath, mode) as fpointer: for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): @@ -166,6 +164,38 @@ def download_url( ) +def download_url(urls, *args, max_workers=5, **kwargs): + """Download urls to disk. The other arguments are passed to download_single_url. + + Args: + urls (str or List[str]): List of urls. + max_workers (int): Maximum number of workers. 
+ """ + + if isinstance(urls, str): + return download_single_url(urls, *args, **kwargs) + + # Turn arguments into lists + args = list(args) + for i, item in enumerate(args): + if not isinstance(item, list): + args[i] = [item] * len(urls) + args = list(zip(*args)) + + # Turn keyword arguments into lists + for key, value in kwargs.items(): + if not isinstance(value, list): + kwargs[key] = [value] * len(urls) + kwargs = [dict(zip(kwargs.keys(), values)) for values in zip(*(kwargs.values()))] + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(download_single_url, url, *arg, **kwarg) + for url, arg, kwarg in zip(urls, args, kwargs) + ] + return concurrent.futures.as_completed(futures) + + def validate_file(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. From cb956957cbbacabf19c65fa99ca015eb3b3316d0 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 16:44:40 -0400 Subject: [PATCH 07/12] expose choices of hash. --- torchaudio/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 15a20281b8..027bd0037b 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -120,7 +120,7 @@ def download_single_url( download_folder (str): Folder to download file. filename (str): Name of downloaded file. If None, it is inferred from the url. hash_value (str): Hash for url. - hash_type (str): Hash type. + hash_type (str): Hash type, among "sha256" and "md5". progress_bar (bool): Display a progress bar. resume (bool): Enable resuming download. """ From b301e1cbb39d32335f83df0a6c36bd820703c333 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 1 Nov 2019 14:59:17 -0400 Subject: [PATCH 08/12] update comment. 
--- torchaudio/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 027bd0037b..8073bb2a1f 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -202,7 +202,7 @@ def validate_file(filepath, hash_value, hash_type="sha256"): Args: filepath (str): File to read. hash_value (str): Hash for url. - hash_type (str): Hash type. + hash_type (str): Hash type, among "sha256" and "md5". """ if hash_type == "sha256": From 2ea30188b06127816a74ce2ed6c16d764e638095 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 5 Nov 2019 17:05:43 -0500 Subject: [PATCH 09/12] typo. --- torchaudio/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 8073bb2a1f..269a593fad 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -135,7 +135,7 @@ def download_single_url( if resume and os.path.exists(filepath): mode = "ab" local_size = os.path.getsize(filepath) - elif resume and os.path.exists(filepath): + elif not resume and os.path.exists(filepath): raise RuntimeError( "{} already exists. Delete the file manually and retry.".format(filepath) ) From edef3ca9d4e5b33dcf2c4e394b82d45d06890cd5 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 5 Nov 2019 17:06:14 -0500 Subject: [PATCH 10/12] remove parallel download. 
--- torchaudio/datasets/utils.py | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 269a593fad..f7b0df9b94 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -104,7 +104,7 @@ def stream_url(url, start_byte=None, block_size=32 * 1024, progress_bar=True): pbar.update(len(chunk)) -def download_single_url( +def download_url( url, download_folder, filename=None, @@ -164,38 +164,6 @@ def download_single_url( ) -def download_url(urls, *args, max_workers=5, **kwargs): - """Download urls to disk. The other arguments are passed to download_single_url. - - Args: - urls (str or List[str]): List of urls. - max_workers (int): Maximum number of workers. - """ - - if isinstance(urls, str): - return download_single_url(urls, *args, **kwargs) - - # Turn arguments into lists - args = list(args) - for i, item in enumerate(args): - if not isinstance(item, list): - args[i] = [item] * len(urls) - args = list(zip(*args)) - - # Turn keyword arguments into lists - for key, value in kwargs.items(): - if not isinstance(value, list): - kwargs[key] = [value] * len(urls) - kwargs = [dict(zip(kwargs.keys(), values)) for values in zip(*(kwargs.values()))] - - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [ - executor.submit(download_single_url, url, *arg, **kwarg) - for url, arg, kwarg in zip(urls, args, kwargs) - ] - return concurrent.futures.as_completed(futures) - - def validate_file(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. From a3ed7d6a3b9da17c01eebd658a84b0a53c7de282 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 11 Nov 2019 16:03:11 -0800 Subject: [PATCH 11/12] extra library. 
--- torchaudio/datasets/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index f7b0df9b94..1273274d75 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -1,4 +1,3 @@ -import concurrent.futures import csv import errno import hashlib From f8abbcf30819a58074709f5ad7ae648f2b402f16 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 12 Nov 2019 18:25:18 -0800 Subject: [PATCH 12/12] validate now operates on file object. --- torchaudio/datasets/utils.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 1273274d75..545a12bc41 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -143,8 +143,9 @@ def download_url( local_size = None if hash_value and local_size == int(req_info.get("Content-Length", -1)): - if validate_file(filepath, hash_value, hash_type): - return + with open(filepath, "rb") as file_obj: + if validate_file(file_obj, hash_value, hash_type): + return raise RuntimeError( "The hash of {} does not match. Delete the file manually and retry.".format( filepath @@ -155,19 +156,20 @@ def download_url( for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): fpointer.write(chunk) - if hash_value and not validate_file(filepath, hash_value, hash_type): - raise RuntimeError( - "The hash of {} does not match. Delete the file manually and retry.".format( - filepath + with open(filepath, "rb") as file_obj: + if hash_value and not validate_file(file_obj, hash_value, hash_type): + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath + ) ) - ) -def validate_file(filepath, hash_value, hash_type="sha256"): - """Validate a given file with its hash. 
+def validate_file(file_obj, hash_value, hash_type="sha256"):
+    """Validate a given file object with its hash.
 
     Args:
-    filepath (str): File to read.
+    file_obj: File object to read from.
     hash_value (str): Hash for url.
     hash_type (str): Hash type, among "sha256" and "md5".
     """
@@ -179,13 +181,12 @@ def validate_file(filepath, hash_value, hash_type="sha256"):
     else:
         raise ValueError
 
-    with open(filepath, "rb") as f:
-        while True:
-            # Read by chunk to avoid filling memory
-            chunk = f.read(1024 ** 2)
-            if not chunk:
-                break
-            hash_func.update(chunk)
+    while True:
+        # Read by chunk to avoid filling memory
+        chunk = file_obj.read(1024 ** 2)
+        if not chunk:
+            break
+        hash_func.update(chunk)
 
     return hash_func.hexdigest() == hash_value