From 1d6360133b2e01b26441662087475740f479c579 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 29 Oct 2019 12:52:34 -0400 Subject: [PATCH 01/12] resume download, validate with md5 or sha256. --- requirements.txt | 2 + torchaudio/datasets/utils.py | 140 +++++++++++++++++++++++++---------- 2 files changed, 103 insertions(+), 39 deletions(-) diff --git a/requirements.txt b/requirements.txt index 686fa2d996..76e4acd48a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,5 @@ pytest # Testing only Py3 compat backports.tempfile + +requests diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index f3daebcd12..c0ac326258 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -10,9 +10,11 @@ import zipfile from queue import Queue +import requests import six import torch import torchaudio +from six.moves import urllib from torch.utils.data import Dataset from torch.utils.model_zoo import tqdm @@ -53,18 +55,6 @@ def unicode_csv_reader(unicode_csv_data, **kwargs): yield line -def gen_bar_updater(): - pbar = tqdm(total=None) - - def bar_update(count, block_size, total_size): - if pbar.total is None and total_size: - pbar.total = total_size - progress_bytes = count * block_size - pbar.update(progress_bytes - pbar.n) - - return bar_update - - def makedir_exist_ok(dirpath): """ Python2 support for os.makedirs(.., exist_ok=True) @@ -78,41 +68,113 @@ def makedir_exist_ok(dirpath): raise -def download_url(url, root, filename=None, md5=None): - """Download a file from a url and place it in root. +def download_url_resume(url, download_folder, resume_byte_pos=None): + """Download url to disk with possible resumption. Args: - url (str): URL to download file from - root (str): Directory to place downloaded file in - filename (str, optional): Name to save the file under. If None, use the basename of the URL - md5 (str, optional): MD5 checksum of the download. If None, do not check + url (str): Url. 
+ download_folder (str): Folder to download file. + resume_byte_pos (int): Position of byte from where to resume the download. """ - from six.moves import urllib + # Get size of file + r = requests.head(url) + file_size = int(r.headers.get("content-length", 0)) - root = os.path.expanduser(root) - if not filename: - filename = os.path.basename(url) - fpath = os.path.join(root, filename) + # Append information to resume download at specific byte position to header + resume_header = ( + {"Range": "bytes={}-".format(resume_byte_pos)} if resume_byte_pos else None + ) - makedir_exist_ok(root) + # Establish connection + r = requests.get(url, stream=True, headers=resume_header) - # downloads file - if os.path.isfile(fpath): - print("Using downloaded file: " + fpath) - else: - try: - print("Downloading " + url + " to " + fpath) - urllib.request.urlretrieve(url, fpath, reporthook=gen_bar_updater()) - except (urllib.error.URLError, IOError) as e: - if url[:5] == "https": - url = url.replace("https:", "http:") + # Set configuration + n_block = 32 + block_size = 1024 + initial_pos = resume_byte_pos if resume_byte_pos else 0 + mode = "ab" if resume_byte_pos else "wb" + + filename = os.path.basename(url) + filepath = os.path.join(download_folder, os.path.basename(url)) + + with open(filepath, mode) as f: + with tqdm( + unit="B", unit_scale=True, unit_divisor=1024, total=file_size + ) as pbar: + for chunk in r.iter_content(n_block * block_size): + f.write(chunk) + pbar.update(len(chunk)) + + +def download_url(url, download_folder, hash_value=None, hash_type="sha256"): + """Execute the correct download operation. + Depending on the size of the file online and offline, resume the + download if the file offline is smaller than online. + + Args: + url (str): Url. + download_folder (str): Folder to download file. + hash_value (str): Hash for url. + hash_type (str): Hash type. 
+ """ + # Establish connection to header of file + r = requests.head(url) + + # Get filesize of online and offline file + file_size_online = int(r.headers.get("content-length", 0)) + filepath = os.path.join(download_folder, os.path.basename(url)) + + if os.path.exists(filepath): + file_size_offline = os.path.getsize(filepath) + + if file_size_online != file_size_offline: + # Resume download + print("File {} is incomplete. Resume download.".format(filepath)) + download_url_resume(url, download_folder, file_size_offline) + elif hash_value: + if validate_download_url(url, download_folder, hash_value, hash_type): + print("File {} is validated. Skip download.".format(filepath)) + else: print( - "Failed download. Trying https -> http instead." - " Downloading " + url + " to " + fpath + "File {} is corrupt. Delete it manually and retry.".format(filepath) ) - urllib.request.urlretrieve(url, fpath, reporthook=gen_bar_updater()) - else: - raise e + else: + # Skip download + print("File {} is complete. Skip download.".format(filepath)) + else: + # Start download + print("File {} has not been downloaded. Start download.".format(filepath)) + download_url_resume(url, download_folder) + + +def validate_download_url(url, download_folder, hash_value, hash_type="sha256"): + """Validate a given file with its hash. + The downloaded file is hashed and compared to a pre-registered + has value to validate the download procedure. + + Args: + url (str): Url. + download_folder (str): Folder to download file. + hash_value (str): Hash for url. + hash_type (str): Hash type. 
+ """ + filepath = os.path.join(download_folder, os.path.basename(url)) + + if hash_type == "sha256": + sha = hashlib.sha256() + elif hash_type == "md5": + sha = hashlib.md5() + else: + raise ValueError + + with open(filepath, "rb") as f: + while True: + chunk = f.read(1000 * 1000) # 1MB so that memory is not exhausted + if not chunk: + break + sha.update(chunk) + + return sha.hexdigest() == hash_value def extract_archive(from_path, to_path=None, overwrite=False): From 9e7d84391fbfec7daf2ff6f040476b08922870ee Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 29 Oct 2019 16:01:21 -0400 Subject: [PATCH 02/12] with urllib. --- requirements.txt | 2 - torchaudio/datasets/utils.py | 103 +++++++++++++---------------------- 2 files changed, 37 insertions(+), 68 deletions(-) diff --git a/requirements.txt b/requirements.txt index 76e4acd48a..686fa2d996 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,5 +18,3 @@ pytest # Testing only Py3 compat backports.tempfile - -requests diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index c0ac326258..d27d1f9129 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -10,7 +10,6 @@ import zipfile from queue import Queue -import requests import six import torch import torchaudio @@ -68,44 +67,6 @@ def makedir_exist_ok(dirpath): raise -def download_url_resume(url, download_folder, resume_byte_pos=None): - """Download url to disk with possible resumption. - - Args: - url (str): Url. - download_folder (str): Folder to download file. - resume_byte_pos (int): Position of byte from where to resume the download. 
- """ - # Get size of file - r = requests.head(url) - file_size = int(r.headers.get("content-length", 0)) - - # Append information to resume download at specific byte position to header - resume_header = ( - {"Range": "bytes={}-".format(resume_byte_pos)} if resume_byte_pos else None - ) - - # Establish connection - r = requests.get(url, stream=True, headers=resume_header) - - # Set configuration - n_block = 32 - block_size = 1024 - initial_pos = resume_byte_pos if resume_byte_pos else 0 - mode = "ab" if resume_byte_pos else "wb" - - filename = os.path.basename(url) - filepath = os.path.join(download_folder, os.path.basename(url)) - - with open(filepath, mode) as f: - with tqdm( - unit="B", unit_scale=True, unit_divisor=1024, total=file_size - ) as pbar: - for chunk in r.iter_content(n_block * block_size): - f.write(chunk) - pbar.update(len(chunk)) - - def download_url(url, download_folder, hash_value=None, hash_type="sha256"): """Execute the correct download operation. Depending on the size of the file online and offline, resume the @@ -117,40 +78,51 @@ def download_url(url, download_folder, hash_value=None, hash_type="sha256"): hash_value (str): Hash for url. hash_type (str): Hash type. """ - # Establish connection to header of file - r = requests.head(url) - # Get filesize of online and offline file - file_size_online = int(r.headers.get("content-length", 0)) filepath = os.path.join(download_folder, os.path.basename(url)) + req = urllib.request.Request(url) if os.path.exists(filepath): - file_size_offline = os.path.getsize(filepath) - - if file_size_online != file_size_offline: - # Resume download - print("File {} is incomplete. Resume download.".format(filepath)) - download_url_resume(url, download_folder, file_size_offline) - elif hash_value: - if validate_download_url(url, download_folder, hash_value, hash_type): - print("File {} is validated. Skip download.".format(filepath)) - else: - print( - "File {} is corrupt. 
Delete it manually and retry.".format(filepath) - ) - else: - # Skip download - print("File {} is complete. Skip download.".format(filepath)) + mode = "ab" + local_size = os.path.getsize(filepath) + + # If the file exists, then download only the remainder + req.headers["Range"] = "bytes={}-".format(local_size) else: - # Start download - print("File {} has not been downloaded. Start download.".format(filepath)) - download_url_resume(url, download_folder) + mode = "wb" + local_size = 0 + + # If we already have the whole file, there is no need to download it again + url_size = int(urllib.request.urlopen(url).info().get("Content-Length", -1)) + if url_size == local_size: + if hash_value and not validate_download_url(filepath, hash_value, hash_type): + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath + ) + ) + + return + with open(filepath, mode) as fpointer, urllib.request.urlopen( + req + ) as upointer, tqdm( + unit="B", unit_scale=True, unit_divisor=1024, total=url_size + ) as pbar: -def validate_download_url(url, download_folder, hash_value, hash_type="sha256"): + num_bytes = 0 + block_size = 32 * 1024 + while True: + chunk = upointer.read(block_size) + if not chunk: + break + fpointer.write(chunk) + num_bytes += len(chunk) + pbar.update(len(chunk)) + + +def validate_download_url(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. - The downloaded file is hashed and compared to a pre-registered - has value to validate the download procedure. Args: url (str): Url. @@ -158,7 +130,6 @@ def validate_download_url(url, download_folder, hash_value, hash_type="sha256"): hash_value (str): Hash for url. hash_type (str): Hash type. 
""" - filepath = os.path.join(download_folder, os.path.basename(url)) if hash_type == "sha256": sha = hashlib.sha256() From 46e92fe2633a1f3de1c316ce30a5785de61ea030 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 12:08:57 -0400 Subject: [PATCH 03/12] split stream from saving. detect filename. --- torchaudio/datasets/utils.py | 106 ++++++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index d27d1f9129..4018e87b0e 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -67,58 +67,96 @@ def makedir_exist_ok(dirpath): raise -def download_url(url, download_folder, hash_value=None, hash_type="sha256"): - """Execute the correct download operation. - Depending on the size of the file online and offline, resume the - download if the file offline is smaller than online. +def stream_url(url, start_byte=None, block_size=32 * 1024, progress_bar=True): + """Stream url by chunk + + Args: + url (str): Url. + start_byte (Optional[int]): Start streaming at that point. + block_size (int): Size of chunks to stream. + progress_bar (bool): Display a progress bar. 
+ """ + + # If we already have the whole file, there is no need to download it again + req = urllib.request.Request(url, method="HEAD") + url_size = int(urllib.request.urlopen(req).info().get("Content-Length", -1)) + if url_size == start_byte: + raise StopIteration + + req = urllib.request.Request(url) + if start_byte: + req.headers["Range"] = "bytes={}-".format(start_byte) + + with urllib.request.urlopen(req) as upointer, tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=url_size, + disable=not progress_bar, + ) as pbar: + + num_bytes = 0 + while True: + chunk = upointer.read(block_size) + if not chunk: + break + yield chunk + num_bytes += len(chunk) + pbar.update(len(chunk)) + + +def download_url( + url, + download_folder, + filename=None, + hash_value=None, + hash_type="sha256", + progress_bar=True, + resume=False, +): + """Download file to disk. Args: url (str): Url. download_folder (str): Folder to download file. + filename (str): Name of downloaded file. If None, it is inferred from the url. hash_value (str): Hash for url. hash_type (str): Hash type. + progress_bar (bool): Display a progress bar. + resume (bool): Enable resuming download. """ - filepath = os.path.join(download_folder, os.path.basename(url)) + # Detect filename + if filename is None: + req = urllib.request.Request(url, method="HEAD") + filename = urllib.request.urlopen( + req + ).info().get_filename() or os.path.basename(url) - req = urllib.request.Request(url) - if os.path.exists(filepath): + filepath = os.path.join(download_folder, filename) + + if resume and os.path.exists(filepath): mode = "ab" local_size = os.path.getsize(filepath) - - # If the file exists, then download only the remainder - req.headers["Range"] = "bytes={}-".format(local_size) + elif resume and os.path.exists(filepath): + raise RuntimeError( + "{} already exists. 
Delete the file manually and retry.".format(filepath) + ) else: mode = "wb" - local_size = 0 + local_size = None # If we already have the whole file, there is no need to download it again - url_size = int(urllib.request.urlopen(url).info().get("Content-Length", -1)) - if url_size == local_size: - if hash_value and not validate_download_url(filepath, hash_value, hash_type): - raise RuntimeError( - "The hash of {} does not match. Delete the file manually and retry.".format( - filepath - ) + if hash_value and not validate_file(filepath, hash_value, hash_type): + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath ) + ) - return - - with open(filepath, mode) as fpointer, urllib.request.urlopen( - req - ) as upointer, tqdm( - unit="B", unit_scale=True, unit_divisor=1024, total=url_size - ) as pbar: - - num_bytes = 0 - block_size = 32 * 1024 - while True: - chunk = upointer.read(block_size) - if not chunk: - break + with open(filepath, mode) as fpointer: + for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): fpointer.write(chunk) - num_bytes += len(chunk) - pbar.update(len(chunk)) def validate_download_url(filepath, hash_value, hash_type="sha256"): From 0ad22e28c17f928d80c7a6970fffbd9a367fd89d Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 12:09:15 -0400 Subject: [PATCH 04/12] not specific to url. --- torchaudio/datasets/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 4018e87b0e..58b12f9458 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -159,31 +159,31 @@ def download_url( fpointer.write(chunk) -def validate_download_url(filepath, hash_value, hash_type="sha256"): +def validate_file(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. Args: - url (str): Url. 
- download_folder (str): Folder to download file. + filepath (str): File to read. hash_value (str): Hash for url. hash_type (str): Hash type. """ if hash_type == "sha256": - sha = hashlib.sha256() + hash_func = hashlib.sha256() elif hash_type == "md5": - sha = hashlib.md5() + hash_func = hashlib.md5() else: raise ValueError with open(filepath, "rb") as f: while True: - chunk = f.read(1000 * 1000) # 1MB so that memory is not exhausted + # Read by chunk to avoid filling memory + chunk = f.read(1024 ** 2) if not chunk: break - sha.update(chunk) + hash_func.update(chunk) - return sha.hexdigest() == hash_value + return hash_func.hexdigest() == hash_value def extract_archive(from_path, to_path=None, overwrite=False): From 6d6813b29a8193d3fb23e8507911f1950f58fd2f Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 12:26:01 -0400 Subject: [PATCH 05/12] validate at end too. check file size again. --- torchaudio/datasets/utils.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 58b12f9458..cbce3f9491 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -126,13 +126,11 @@ def download_url( resume (bool): Enable resuming download. 
""" - # Detect filename - if filename is None: - req = urllib.request.Request(url, method="HEAD") - filename = urllib.request.urlopen( - req - ).info().get_filename() or os.path.basename(url) + req = urllib.request.Request(url, method="HEAD") + req_info = urllib.request.urlopen(req).info() + # Detect filename + filename = filename or req_info.get_filename() or os.path.basename(url) filepath = os.path.join(download_folder, filename) if resume and os.path.exists(filepath): @@ -146,7 +144,20 @@ def download_url( mode = "wb" local_size = None - # If we already have the whole file, there is no need to download it again + if hash_value and local_size == int(req_info.get("Content-Length", -1)): + if validate_file(filepath, hash_value, hash_type): + return + else: + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath + ) + ) + + with open(filepath, mode) as fpointer: + for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): + fpointer.write(chunk) + if hash_value and not validate_file(filepath, hash_value, hash_type): raise RuntimeError( "The hash of {} does not match. Delete the file manually and retry.".format( @@ -154,10 +165,6 @@ def download_url( ) ) - with open(filepath, mode) as fpointer: - for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): - fpointer.write(chunk) - def validate_file(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. From fcb5d0a5585b8b3487cc1eaa92a161a6d98d0651 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 16:07:32 -0400 Subject: [PATCH 06/12] futures. 
--- torchaudio/datasets/utils.py | 48 +++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index cbce3f9491..15a20281b8 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -1,6 +1,6 @@ +import concurrent.futures import csv import errno -import gzip import hashlib import logging import os @@ -12,7 +12,6 @@ import six import torch -import torchaudio from six.moves import urllib from torch.utils.data import Dataset from torch.utils.model_zoo import tqdm @@ -81,7 +80,7 @@ def stream_url(url, start_byte=None, block_size=32 * 1024, progress_bar=True): req = urllib.request.Request(url, method="HEAD") url_size = int(urllib.request.urlopen(req).info().get("Content-Length", -1)) if url_size == start_byte: - raise StopIteration + return req = urllib.request.Request(url) if start_byte: @@ -105,7 +104,7 @@ def stream_url(url, start_byte=None, block_size=32 * 1024, progress_bar=True): pbar.update(len(chunk)) -def download_url( +def download_single_url( url, download_folder, filename=None, @@ -147,12 +146,11 @@ def download_url( if hash_value and local_size == int(req_info.get("Content-Length", -1)): if validate_file(filepath, hash_value, hash_type): return - else: - raise RuntimeError( - "The hash of {} does not match. Delete the file manually and retry.".format( - filepath - ) + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath ) + ) with open(filepath, mode) as fpointer: for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): @@ -166,6 +164,38 @@ def download_url( ) +def download_url(urls, *args, max_workers=5, **kwargs): + """Download urls to disk. The other arguments are passed to download_single_url. + + Args: + urls (str or List[str]): List of urls. + max_workers (int): Maximum number of workers. 
+ """ + + if isinstance(urls, str): + return download_single_url(urls, *args, **kwargs) + + # Turn arguments into lists + args = list(args) + for i, item in enumerate(args): + if not isinstance(item, list): + args[i] = [item] * len(urls) + args = list(zip(*args)) + + # Turn keyword arguments into lists + for key, value in kwargs.items(): + if not isinstance(value, list): + kwargs[key] = [value] * len(urls) + kwargs = [dict(zip(kwargs.keys(), values)) for values in zip(*(kwargs.values()))] + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(download_single_url, url, *arg, **kwarg) + for url, arg, kwarg in zip(urls, args, kwargs) + ] + return concurrent.futures.as_completed(futures) + + def validate_file(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. From cb956957cbbacabf19c65fa99ca015eb3b3316d0 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 30 Oct 2019 16:44:40 -0400 Subject: [PATCH 07/12] expose choices of hash. --- torchaudio/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 15a20281b8..027bd0037b 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -120,7 +120,7 @@ def download_single_url( download_folder (str): Folder to download file. filename (str): Name of downloaded file. If None, it is inferred from the url. hash_value (str): Hash for url. - hash_type (str): Hash type. + hash_type (str): Hash type, among "sha256" and "md5". progress_bar (bool): Display a progress bar. resume (bool): Enable resuming download. """ From b301e1cbb39d32335f83df0a6c36bd820703c333 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 1 Nov 2019 14:59:17 -0400 Subject: [PATCH 08/12] update comment. 
--- torchaudio/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 027bd0037b..8073bb2a1f 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -202,7 +202,7 @@ def validate_file(filepath, hash_value, hash_type="sha256"): Args: filepath (str): File to read. hash_value (str): Hash for url. - hash_type (str): Hash type. + hash_type (str): Hash type, among "sha256" and "md5". """ if hash_type == "sha256": From 2ea30188b06127816a74ce2ed6c16d764e638095 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 5 Nov 2019 17:05:43 -0500 Subject: [PATCH 09/12] typo. --- torchaudio/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 8073bb2a1f..269a593fad 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -135,7 +135,7 @@ def download_single_url( if resume and os.path.exists(filepath): mode = "ab" local_size = os.path.getsize(filepath) - elif resume and os.path.exists(filepath): + elif not resume and os.path.exists(filepath): raise RuntimeError( "{} already exists. Delete the file manually and retry.".format(filepath) ) From edef3ca9d4e5b33dcf2c4e394b82d45d06890cd5 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 5 Nov 2019 17:06:14 -0500 Subject: [PATCH 10/12] remove parallel download. 
--- torchaudio/datasets/utils.py | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 269a593fad..f7b0df9b94 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -104,7 +104,7 @@ def stream_url(url, start_byte=None, block_size=32 * 1024, progress_bar=True): pbar.update(len(chunk)) -def download_single_url( +def download_url( url, download_folder, filename=None, @@ -164,38 +164,6 @@ def download_single_url( ) -def download_url(urls, *args, max_workers=5, **kwargs): - """Download urls to disk. The other arguments are passed to download_single_url. - - Args: - urls (str or List[str]): List of urls. - max_workers (int): Maximum number of workers. - """ - - if isinstance(urls, str): - return download_single_url(urls, *args, **kwargs) - - # Turn arguments into lists - args = list(args) - for i, item in enumerate(args): - if not isinstance(item, list): - args[i] = [item] * len(urls) - args = list(zip(*args)) - - # Turn keyword arguments into lists - for key, value in kwargs.items(): - if not isinstance(value, list): - kwargs[key] = [value] * len(urls) - kwargs = [dict(zip(kwargs.keys(), values)) for values in zip(*(kwargs.values()))] - - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [ - executor.submit(download_single_url, url, *arg, **kwarg) - for url, arg, kwarg in zip(urls, args, kwargs) - ] - return concurrent.futures.as_completed(futures) - - def validate_file(filepath, hash_value, hash_type="sha256"): """Validate a given file with its hash. From a3ed7d6a3b9da17c01eebd658a84b0a53c7de282 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 11 Nov 2019 16:03:11 -0800 Subject: [PATCH 11/12] extra library. 
--- torchaudio/datasets/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index f7b0df9b94..1273274d75 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -1,4 +1,3 @@ -import concurrent.futures import csv import errno import hashlib From f8abbcf30819a58074709f5ad7ae648f2b402f16 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 12 Nov 2019 18:25:18 -0800 Subject: [PATCH 12/12] validate now operates on file object. --- torchaudio/datasets/utils.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/torchaudio/datasets/utils.py b/torchaudio/datasets/utils.py index 1273274d75..545a12bc41 100644 --- a/torchaudio/datasets/utils.py +++ b/torchaudio/datasets/utils.py @@ -143,8 +143,9 @@ def download_url( local_size = None if hash_value and local_size == int(req_info.get("Content-Length", -1)): - if validate_file(filepath, hash_value, hash_type): - return + with open(filepath, "rb") as file_obj: + if validate_file(file_obj, hash_value, hash_type): + return raise RuntimeError( "The hash of {} does not match. Delete the file manually and retry.".format( filepath @@ -155,19 +156,20 @@ def download_url( for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): fpointer.write(chunk) - if hash_value and not validate_file(filepath, hash_value, hash_type): - raise RuntimeError( - "The hash of {} does not match. Delete the file manually and retry.".format( - filepath + with open(filepath, "rb") as file_obj: + if hash_value and not validate_file(file_obj, hash_value, hash_type): + raise RuntimeError( + "The hash of {} does not match. Delete the file manually and retry.".format( + filepath + ) ) - ) -def validate_file(filepath, hash_value, hash_type="sha256"): - """Validate a given file with its hash. 
+def validate_file(file_obj, hash_value, hash_type="sha256"):
+    """Validate a given file object with its hash.
 
     Args:
-    filepath (str): File to read.
+    file_obj: File object to read from.
     hash_value (str): Hash for url.
     hash_type (str): Hash type, among "sha256" and "md5".
     """
@@ -179,13 +181,12 @@ def validate_file(filepath, hash_value, hash_type="sha256"):
     else:
         raise ValueError
 
-    with open(filepath, "rb") as f:
-        while True:
-            # Read by chunk to avoid filling memory
-            chunk = f.read(1024 ** 2)
-            if not chunk:
-                break
-            hash_func.update(chunk)
+    while True:
+        # Read by chunk to avoid filling memory
+        chunk = file_obj.read(1024 ** 2)
+        if not chunk:
+            break
+        hash_func.update(chunk)
 
     return hash_func.hexdigest() == hash_value