Skip to content

Commit 366cef8

Browse files
authored
Revert "no longer download CommonVoice directly (#1018)" (#1079)
This reverts commit 09a6fca.
1 parent a2085b8 commit 366cef8

File tree

2 files changed

+84
-97
lines changed

2 files changed

+84
-97
lines changed

test/torchaudio_unittest/datasets/utils_test.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -54,15 +54,15 @@ class TestIterator(TorchaudioTestCase):
5454
path = get_asset_path()
5555

5656
def test_disckcache_iterator(self):
57-
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
57+
data = COMMONVOICE(self.path, url="tatar")
5858
data = dataset_utils.diskcache_iterator(data)
5959
# Save
6060
data[0]
6161
# Load
6262
data[0]
6363

6464
def test_bg_iterator(self):
65-
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
65+
data = COMMONVOICE(self.path, url="tatar")
6666
data = dataset_utils.bg_iterator(data, 5)
6767
for _ in data:
6868
pass

torchaudio/datasets/commonvoice.py

Lines changed: 82 additions & 95 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,9 @@
11
import os
2-
import warnings
32
from pathlib import Path
4-
from typing import List, Dict, Tuple, Optional, Union
3+
from typing import List, Dict, Tuple, Union
54

65
import torchaudio
7-
from torchaudio.datasets.utils import extract_archive, unicode_csv_reader, validate_file
6+
from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
87
from torch import Tensor
98
from torch.utils.data import Dataset
109

@@ -17,39 +16,68 @@
1716
# validated.tsv
1817

1918
FOLDER_IN_ARCHIVE = "CommonVoice"
20-
LANGUAGE = "english"
21-
VERSION = "cv-corpus-5.1-2020-06-22"
19+
URL = "english"
20+
VERSION = "cv-corpus-4-2019-12-10"
2221
TSV = "train.tsv"
2322
_CHECKSUMS = {
24-
"cv-corpus-5.1-2020-06-22/tt.tar.gz": None,
25-
"cv-corpus-5.1-2020-06-22/en.tar.gz": None,
26-
"cv-corpus-5.1-2020-06-22/de.tar.gz": None,
27-
"cv-corpus-5.1-2020-06-22/fr.tar.gz": None,
28-
"cv-corpus-5.1-2020-06-22/cy.tar.gz": None,
29-
"cv-corpus-5.1-2020-06-22/br.tar.gz": None,
30-
"cv-corpus-5.1-2020-06-22/cv.tar.gz": None,
31-
"cv-corpus-5.1-2020-06-22/tr.tar.gz": None,
32-
"cv-corpus-5.1-2020-06-22/ky.tar.gz": None,
33-
"cv-corpus-5.1-2020-06-22/ga-IE.tar.gz": None,
34-
"cv-corpus-5.1-2020-06-22/kab.tar.gz": None,
35-
"cv-corpus-5.1-2020-06-22/ca.tar.gz": None,
36-
"cv-corpus-5.1-2020-06-22/zh-TW.tar.gz": None,
37-
"cv-corpus-5.1-2020-06-22/sl.tar.gz": None,
38-
"cv-corpus-5.1-2020-06-22/it.tar.gz": None,
39-
"cv-corpus-5.1-2020-06-22/nl.tar.gz": None,
40-
"cv-corpus-5.1-2020-06-22/cnh.tar.gz": None,
41-
"cv-corpus-5.1-2020-06-22/eo.tar.gz": None,
42-
"cv-corpus-5.1-2020-06-22/et.tar.gz": None,
43-
"cv-corpus-5.1-2020-06-22/fa.tar.gz": None,
44-
"cv-corpus-5.1-2020-06-22/eu.tar.gz": None,
45-
"cv-corpus-5.1-2020-06-22/es.tar.gz": None,
46-
"cv-corpus-5.1-2020-06-22/zh-CN.tar.gz": None,
47-
"cv-corpus-5.1-2020-06-22/mn.tar.gz": None,
48-
"cv-corpus-5.1-2020-06-22/sah.tar.gz": None,
49-
"cv-corpus-5.1-2020-06-22/dv.tar.gz": None,
50-
"cv-corpus-5.1-2020-06-22/rw.tar.gz": None,
51-
"cv-corpus-5.1-2020-06-22/sv-SE.tar.gz": None,
52-
"cv-corpus-5.1-2020-06-22/ru.tar.gz": None,
23+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
24+
None,
25+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
26+
None,
27+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
28+
None,
29+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
30+
None,
31+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
32+
None,
33+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
34+
None,
35+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
36+
None,
37+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
38+
None,
39+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
40+
None,
41+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
42+
None,
43+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
44+
None,
45+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
46+
None,
47+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
48+
None,
49+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
50+
None,
51+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
52+
None,
53+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
54+
None,
55+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
56+
None,
57+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
58+
None,
59+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
60+
None,
61+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
62+
None,
63+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
64+
None,
65+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
66+
None,
67+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
68+
None,
69+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
70+
None,
71+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
72+
None,
73+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
74+
None,
75+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
76+
None,
77+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
78+
None,
79+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
80+
None
5381
}
5482

5583

@@ -73,18 +101,15 @@ def load_commonvoice_item(line: List[str],
73101

74102

75103
class COMMONVOICE(Dataset):
76-
"""Create a Dataset for `CommonVoice <https://commonvoice.mozilla.org/>`_.
104+
"""Create a Dataset for CommonVoice.
77105
78106
Args:
79107
root (str or Path): Path to the directory where the dataset is found or downloaded.
80108
tsv (str, optional): The name of the tsv file used to construct the metadata.
81109
(default: ``"train.tsv"``)
82-
url (str, optional): Deprecated.
83-
folder_in_archive (str, optional): The top-level directory of the dataset.
84-
version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``)
85-
language (str, optional): Language of the dataset. (default: None)
86-
The following values are mapped to their corresponding shortened version:
87-
``"tatar"``, ``"english"``, ``"german"``,
110+
url (str, optional): The URL to download the dataset from, or the language of
111+
the dataset to download. (default: ``"english"``).
112+
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
88113
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
89114
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
90115
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
@@ -93,8 +118,11 @@ class COMMONVOICE(Dataset):
93118
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
94119
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
95120
``"romansh sursilvan"``.
121+
folder_in_archive (str, optional): The top-level directory of the dataset.
122+
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
96123
For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
97-
download (bool, optional): Deprecated.
124+
download (bool, optional):
125+
Whether to download the dataset if it is not found at root path. (default: ``False``).
98126
"""
99127

100128
_ext_txt = ".txt"
@@ -104,30 +132,10 @@ class COMMONVOICE(Dataset):
104132
def __init__(self,
105133
root: Union[str, Path],
106134
tsv: str = TSV,
107-
url: Optional[str] = None,
135+
url: str = URL,
108136
folder_in_archive: str = FOLDER_IN_ARCHIVE,
109137
version: str = VERSION,
110-
language: str = LANGUAGE,
111-
download: Optional[bool] = False) -> None:
112-
113-
if download is True:
114-
raise RuntimeError(
115-
"The dataset is no longer publicly accessible. You need to "
116-
"download the archives externally and place them in the root "
117-
"directory."
118-
)
119-
elif download is False:
120-
warnings.warn(
121-
"The use of the download flag is deprecated, since the dataset "
122-
"is no longer directly accessible.", RuntimeWarning
123-
)
124-
125-
if url is not None:
126-
warnings.warn(
127-
"The use of the url flag is deprecated, since the dataset "
128-
"is no longer publicly accessible. To specify the language of the dataset, "
129-
"please use the language parameter instead.", RuntimeWarning
130-
)
138+
download: bool = False) -> None:
131139

132140
languages = {
133141
"tatar": "tt",
@@ -172,22 +180,12 @@ def __init__(self,
172180
"romansh sursilvan": "rm-sursilv"
173181
}
174182

175-
if language in languages:
183+
if url in languages:
176184
ext_archive = ".tar.gz"
177-
language = languages[language]
178-
url = os.path.join(version, language + ext_archive)
179-
else:
180-
raise ValueError(
181-
'Allowed language values are "tatar", "english", "german",'
182-
'"french", "welsh", "breton", "chuvash", "turkish", "kyrgyz",'
183-
'"irish", "kabyle", "catalan", "taiwanese", "slovenian",'
184-
'"italian", "dutch", "hakha chin", "esperanto", "estonian",'
185-
'"persian", "portuguese", "basque", "spanish", "chinese",'
186-
'"mongolian", "sakha", "dhivehi", "kinyarwanda", "swedish",'
187-
'"russian", "indonesian", "arabic", "tamil", "interlingua",'
188-
'"latvian", "japanese", "votic", "abkhaz", "cantonese" and'
189-
'"romansh sursilvan".'
190-
)
185+
language = languages[url]
186+
187+
base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
188+
url = os.path.join(base_url, version, language + ext_archive)
191189

192190
# Get string representation of 'root' in case Path object is passed
193191
root = os.fspath(root)
@@ -200,23 +198,12 @@ def __init__(self,
200198

201199
self._path = os.path.join(root, folder_in_archive)
202200

203-
if not os.path.isdir(self._path):
204-
if os.path.isfile(archive):
205-
checksum = _CHECKSUMS.get(url, None)
206-
if checksum:
207-
filepath = os.path.basename(url)
208-
with open(filepath, "rb") as file_obj:
209-
if not validate_file(file_obj, checksum, "sha256"):
210-
raise RuntimeError(
211-
f"The hash of {filepath} does not match. Delete the file manually and retry."
212-
)
201+
if download:
202+
if not os.path.isdir(self._path):
203+
if not os.path.isfile(archive):
204+
checksum = _CHECKSUMS.get(url, None)
205+
download_url(url, root, hash_value=checksum)
213206
extract_archive(archive)
214-
else:
215-
raise RuntimeError(
216-
"The dataset is no longer publicly accessible. You need to "
217-
"download the archives externally and place them in the root "
218-
"directory."
219-
)
220207

221208
self._tsv = os.path.join(root, folder_in_archive, tsv)
222209

0 commit comments

Comments
 (0)