Skip to content

Commit d61d42b

Browse files
committed
Deprecate download/url args and add language arg in CommonVoice
1 parent 4062f35 commit d61d42b

File tree

1 file changed

+36
-14
lines changed

1 file changed

+36
-14
lines changed

torchaudio/datasets/commonvoice.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
2+
import warnings
23
from pathlib import Path
3-
from typing import List, Dict, Tuple, Union
4+
from typing import List, Dict, Tuple, Union, Optional
45

56
import torchaudio
67
from torchaudio.datasets.utils import unicode_csv_reader
@@ -16,7 +17,6 @@
1617
# validated.tsv
1718

1819
FOLDER_IN_ARCHIVE = "CommonVoice"
19-
URL = "english"
2020
VERSION = "cv-corpus-4-2019-12-10"
2121
TSV = "train.tsv"
2222

@@ -90,8 +90,15 @@ class COMMONVOICE(Dataset):
9090
root (str or Path): Path to the directory where the dataset is found or downloaded.
9191
tsv (str, optional): The name of the tsv file used to construct the metadata.
9292
(default: ``"train.tsv"``)
93-
url (str, optional): The URL to download the dataset from, or the language of
94-
the dataset to download. (default: ``"english"``).
93+
url (str, optional): Language of dataset. Deprecated. Please use ``language``.
94+
folder_in_archive (str, optional): The top-level directory of the dataset.
95+
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
96+
For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
97+
download (bool, optional):
98+
Deprecated. CommonVoice requires user agreement on the usage terms and torchaudio no longer
99+
provides download functionality. Providing ``True`` results in an error.
100+
language (str, optional):
101+
The language of the dataset. (default: ``"english"``).
95102
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
96103
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
97104
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
@@ -101,11 +108,6 @@ class COMMONVOICE(Dataset):
101108
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
102109
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
103110
``"romansh sursilvan"``.
104-
folder_in_archive (str, optional): The top-level directory of the dataset.
105-
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
106-
For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
107-
download (bool, optional):
108-
Whether to download the dataset if it is not found at root path. (default: ``False``).
109111
"""
110112

111113
_ext_txt = ".txt"
@@ -115,23 +117,43 @@ class COMMONVOICE(Dataset):
115117
def __init__(self,
116118
root: Union[str, Path],
117119
tsv: str = TSV,
118-
url: str = URL,
120+
url: Optional[str] = None,
119121
folder_in_archive: str = FOLDER_IN_ARCHIVE,
120122
version: str = VERSION,
121-
download: bool = False) -> None:
123+
download: Optional[bool] = None,
124+
language: Optional[str] = None) -> None:
122125
if download:
123126
raise RuntimeError(
124127
"Common Voice dataset requires user agreement on the usage term, "
125128
"and torchaudio no longer provides the download feature. "
126129
"Please download the dataset manually and extract it in the root directory, "
127130
"then provide the target language to `url` argument.")
128-
if url not in _LANG_CODE:
129-
raise ValueError(f"`url` must be one of available languages: {_LANG_CODE.keys()}")
131+
if download is not None: # download = False, which has no impact on functionality
132+
warnings.warn(
133+
"`download` argument is deprecated and will be removed in 0.9.0. "
134+
"Please remove the argument.")
135+
if url is not None and language is not None:
136+
raise ValueError(
137+
"`url` and `language` arguments can not be provided at the same time. "
138+
"Please use `language`."
139+
)
140+
if url is not None:
141+
warnings.warn(
142+
"`url` argument is deprecated and will be removed in 0.9.0."
143+
"Please use `language`.")
144+
if url not in _LANG_CODE:
145+
raise ValueError(f"`url` must be one of available languages: {_LANG_CODE.keys()}")
146+
language = url
147+
else:
148+
language = language or 'english'
149+
if language not in _LANG_CODE:
150+
raise ValueError(
151+
f"`language` must be one of available languages: {_LANG_CODE.keys()}")
130152

131153
# Get string representation of 'root' in case Path object is passed
132154
root = os.fspath(root)
133155

134-
lang_code = _LANG_CODE[url]
156+
lang_code = _LANG_CODE[language]
135157
folder_in_archive = os.path.join(folder_in_archive, version, lang_code)
136158

137159
self._path = os.path.join(root, folder_in_archive)

0 commit comments

Comments
 (0)