11import os
2+ import warnings
23from pathlib import Path
3- from typing import List , Dict , Tuple , Union
4+ from typing import List , Dict , Tuple , Union , Optional
45
56import torchaudio
67from torchaudio .datasets .utils import unicode_csv_reader
1617# validated.tsv
1718
1819FOLDER_IN_ARCHIVE = "CommonVoice"
19- URL = "english"
2020VERSION = "cv-corpus-4-2019-12-10"
2121TSV = "train.tsv"
2222
@@ -90,8 +90,15 @@ class COMMONVOICE(Dataset):
9090 root (str or Path): Path to the directory where the dataset is found or downloaded.
9191 tsv (str, optional): The name of the tsv file used to construct the metadata.
9292 (default: ``"train.tsv"``)
93- url (str, optional): The URL to download the dataset from, or the language of
94- the dataset to download. (default: ``"english"``).
93+ url (str, optional): Language of dataset. Deprecated. Please use ``language``.
94+ folder_in_archive (str, optional): The top-level directory of the dataset.
95+ version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
96+ For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
97+ download (bool, optional):
98+ Deprecated. CommonVoice requires user agreement on the usage terms, and torchaudio no longer
99+ provides download functionality. Providing ``True`` results in an error.
100+ language (str, optional):
101+ The language of the dataset. (default: ``"english"``).
95102 Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
96103 ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
97104 ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
@@ -101,11 +108,6 @@ class COMMONVOICE(Dataset):
101108 ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
102109 ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
103110 ``"romansh sursilvan"``.
104- folder_in_archive (str, optional): The top-level directory of the dataset.
105- version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
106- For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
107- download (bool, optional):
108- Whether to download the dataset if it is not found at root path. (default: ``False``).
109111 """
110112
111113 _ext_txt = ".txt"
@@ -115,23 +117,43 @@ class COMMONVOICE(Dataset):
115117 def __init__ (self ,
116118 root : Union [str , Path ],
117119 tsv : str = TSV ,
118- url : str = URL ,
120+ url : Optional [ str ] = None ,
119121 folder_in_archive : str = FOLDER_IN_ARCHIVE ,
120122 version : str = VERSION ,
121- download : bool = False ) -> None :
123+ download : Optional [bool ] = None ,
124+ language : Optional [str ] = None ) -> None :
122125 if download :
123126 raise RuntimeError (
124127 "Common Voice dataset requires user agreement on the usage term, "
125128 "and torchaudio no longer provides the download feature. "
126129 "Please download the dataset manually and extract it in the root directory, "
127130 "then provide the target language to `url` argument." )
128- if url not in _LANG_CODE :
129- raise ValueError (f"`url` must be one of available languages: { _LANG_CODE .keys ()} " )
131+ if download is not None : # download = False, which has no impact on functionality
132+ warnings .warn (
133+ "`download` argument is deprecated and will be removed in 0.9.0. "
134+ "Please remove the argument." )
135+ if url is not None and language is not None :
136+ raise ValueError (
137+ "`url` and `language` arguments can not be provided at the same time. "
138+ "Please use `language`."
139+ )
140+ if url is not None :
141+ warnings .warn (
142+ "`url` argument is deprecated and will be removed in 0.9.0."
143+ "Please use `language`." )
144+ if url not in _LANG_CODE :
145+ raise ValueError (f"`url` must be one of available languages: { _LANG_CODE .keys ()} " )
146+ language = url
147+ else :
148+ language = language or 'english'
149+ if language not in _LANG_CODE :
150+ raise ValueError (
151+ f"`language` must be one of available languages: { _LANG_CODE .keys ()} " )
130152
131153 # Get string representation of 'root' in case Path object is passed
132154 root = os .fspath (root )
133155
134- lang_code = _LANG_CODE [url ]
156+ lang_code = _LANG_CODE [language ]
135157 folder_in_archive = os .path .join (folder_in_archive , version , lang_code )
136158
137159 self ._path = os .path .join (root , folder_in_archive )
0 commit comments