Skip to content

Commit 927e462

Browse files
committed
with urllib.
1 parent 236b587 commit 927e462

File tree

3 files changed

+38
-69
lines changed

3 files changed

+38
-69
lines changed

requirements.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,3 @@ pytest
1818

1919
# Testing only Py3 compat
2020
backports.tempfile
21-
22-
requests

test/test_datasets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33

44
from torchaudio.datasets.commonvoice import COMMONVOICE
55
from torchaudio.datasets.librispeech import LIBRISPEECH
6-
from torchaudio.datasets.utils import DiskCache
76
from torchaudio.datasets.vctk import VCTK
87
from torchaudio.datasets.yesno import YESNO
8+
from torchaudio.datasets.utils import DiskCache, download_url
99

1010

1111
class TestDatasets(unittest.TestCase):

torchaudio/datasets/utils.py

Lines changed: 37 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import tarfile
99
import zipfile
1010

11-
import requests
1211
import six
1312
import torch
1413
import torchaudio
@@ -66,44 +65,6 @@ def makedir_exist_ok(dirpath):
6665
raise
6766

6867

69-
def download_url_resume(url, download_folder, resume_byte_pos=None):
70-
"""Download url to disk with possible resumption.
71-
72-
Args:
73-
url (str): Url.
74-
download_folder (str): Folder to download file.
75-
resume_byte_pos (int): Position of byte from where to resume the download.
76-
"""
77-
# Get size of file
78-
r = requests.head(url)
79-
file_size = int(r.headers.get("content-length", 0))
80-
81-
# Append information to resume download at specific byte position to header
82-
resume_header = (
83-
{"Range": "bytes={}-".format(resume_byte_pos)} if resume_byte_pos else None
84-
)
85-
86-
# Establish connection
87-
r = requests.get(url, stream=True, headers=resume_header)
88-
89-
# Set configuration
90-
n_block = 32
91-
block_size = 1024
92-
initial_pos = resume_byte_pos if resume_byte_pos else 0
93-
mode = "ab" if resume_byte_pos else "wb"
94-
95-
filename = os.path.basename(url)
96-
filepath = os.path.join(download_folder, os.path.basename(url))
97-
98-
with open(filepath, mode) as f:
99-
with tqdm(
100-
unit="B", unit_scale=True, unit_divisor=1024, total=file_size
101-
) as pbar:
102-
for chunk in r.iter_content(n_block * block_size):
103-
f.write(chunk)
104-
pbar.update(len(chunk))
105-
106-
10768
def download_url(url, download_folder, hash_value=None, hash_type="sha256"):
10869
"""Execute the correct download operation.
10970
Depending on the size of the file online and offline, resume the
@@ -115,48 +76,58 @@ def download_url(url, download_folder, hash_value=None, hash_type="sha256"):
11576
hash_value (str): Hash for url.
11677
hash_type (str): Hash type.
11778
"""
118-
# Establish connection to header of file
119-
r = requests.head(url)
12079

121-
# Get filesize of online and offline file
122-
file_size_online = int(r.headers.get("content-length", 0))
12380
filepath = os.path.join(download_folder, os.path.basename(url))
12481

82+
req = urllib.request.Request(url)
12583
if os.path.exists(filepath):
126-
file_size_offline = os.path.getsize(filepath)
127-
128-
if file_size_online != file_size_offline:
129-
# Resume download
130-
print("File {} is incomplete. Resume download.".format(filepath))
131-
download_url_resume(url, download_folder, file_size_offline)
132-
elif hash_value:
133-
if validate_download_url(url, download_folder, hash_value, hash_type):
134-
print("File {} is validated. Skip download.".format(filepath))
135-
else:
136-
print(
137-
"File {} is corrupt. Delete it manually and retry.".format(filepath)
138-
)
139-
else:
140-
# Skip download
141-
print("File {} is complete. Skip download.".format(filepath))
84+
mode = "ab"
85+
local_size = os.path.getsize(filepath)
86+
87+
# If the file exists, then download only the remainder
88+
req.headers["Range"] = "bytes={}-".format(local_size)
14289
else:
143-
# Start download
144-
print("File {} has not been downloaded. Start download.".format(filepath))
145-
download_url_resume(url, download_folder)
90+
mode = "wb"
91+
local_size = 0
92+
93+
# If we already have the whole file, there is no need to download it again
94+
url_size = int(urllib.request.urlopen(url).info().get("Content-Length", -1))
95+
if url_size == local_size:
96+
if hash_value and not validate_download_url(filepath, hash_value, hash_type):
97+
raise RuntimeError(
98+
"The hash of {} does not match. Delete the file manually and retry.".format(
99+
filepath
100+
)
101+
)
102+
103+
return
146104

105+
with open(filepath, mode) as fpointer, urllib.request.urlopen(
106+
req
107+
) as upointer, tqdm(
108+
unit="B", unit_scale=True, unit_divisor=1024, total=url_size
109+
) as pbar:
147110

148-
def validate_download_url(url, download_folder, hash_value, hash_type="sha256"):
111+
num_bytes = 0
112+
block_size = 32 * 1024
113+
while True:
114+
chunk = upointer.read(block_size)
115+
if not chunk:
116+
break
117+
fpointer.write(chunk)
118+
num_bytes += len(chunk)
119+
pbar.update(len(chunk))
120+
121+
122+
def validate_download_url(filepath, hash_value, hash_type="sha256"):
149123
"""Validate a given file with its hash.
150-
The downloaded file is hashed and compared to a pre-registered
151-
hash value to validate the download procedure.
152124
153125
Args:
154126
filepath (str): Path to the file to validate.
155127
156128
hash_value (str): Hash for url.
157129
hash_type (str): Hash type.
158130
"""
159-
filepath = os.path.join(download_folder, os.path.basename(url))
160131

161132
if hash_type == "sha256":
162133
sha = hashlib.sha256()

0 commit comments

Comments
 (0)