11import  os 
2- import  warnings 
32from  pathlib  import  Path 
4- from  typing  import  List , Dict , Tuple , Optional ,  Union 
3+ from  typing  import  List , Dict , Tuple , Union 
54
65import  torchaudio 
7- from  torchaudio .datasets .utils  import  extract_archive ,  unicode_csv_reader ,  validate_file 
6+ from  torchaudio .datasets .utils  import  download_url ,  extract_archive ,  unicode_csv_reader 
87from  torch  import  Tensor 
98from  torch .utils .data  import  Dataset 
109
1716# validated.tsv 
1817
1918FOLDER_IN_ARCHIVE  =  "CommonVoice" 
20- LANGUAGE  =  "english" 
21- VERSION  =  "cv-corpus-5.1-2020-06-22 " 
19+ URL  =  "english" 
20+ VERSION  =  "cv-corpus-4-2019-12-10 " 
2221TSV  =  "train.tsv" 
2322_CHECKSUMS  =  {
24-     "cv-corpus-5.1-2020-06-22/tt.tar.gz" : None ,
25-     "cv-corpus-5.1-2020-06-22/en.tar.gz" : None ,
26-     "cv-corpus-5.1-2020-06-22/de.tar.gz" : None ,
27-     "cv-corpus-5.1-2020-06-22/fr.tar.gz" : None ,
28-     "cv-corpus-5.1-2020-06-22/cy.tar.gz" : None ,
29-     "cv-corpus-5.1-2020-06-22/br.tar.gz" : None ,
30-     "cv-corpus-5.1-2020-06-22/cv.tar.gz" : None ,
31-     "cv-corpus-5.1-2020-06-22/tr.tar.gz" : None ,
32-     "cv-corpus-5.1-2020-06-22/ky.tar.gz" : None ,
33-     "cv-corpus-5.1-2020-06-22/ga-IE.tar.gz" : None ,
34-     "cv-corpus-5.1-2020-06-22/kab.tar.gz" : None ,
35-     "cv-corpus-5.1-2020-06-22/ca.tar.gz" : None ,
36-     "cv-corpus-5.1-2020-06-22/zh-TW.tar.gz" : None ,
37-     "cv-corpus-5.1-2020-06-22/sl.tar.gz" : None ,
38-     "cv-corpus-5.1-2020-06-22/it.tar.gz" : None ,
39-     "cv-corpus-5.1-2020-06-22/nl.tar.gz" : None ,
40-     "cv-corpus-5.1-2020-06-22/cnh.tar.gz" : None ,
41-     "cv-corpus-5.1-2020-06-22/eo.tar.gz" : None ,
42-     "cv-corpus-5.1-2020-06-22/et.tar.gz" : None ,
43-     "cv-corpus-5.1-2020-06-22/fa.tar.gz" : None ,
44-     "cv-corpus-5.1-2020-06-22/eu.tar.gz" : None ,
45-     "cv-corpus-5.1-2020-06-22/es.tar.gz" : None ,
46-     "cv-corpus-5.1-2020-06-22/zh-CN.tar.gz" : None ,
47-     "cv-corpus-5.1-2020-06-22/mn.tar.gz" : None ,
48-     "cv-corpus-5.1-2020-06-22/sah.tar.gz" : None ,
49-     "cv-corpus-5.1-2020-06-22/dv.tar.gz" : None ,
50-     "cv-corpus-5.1-2020-06-22/rw.tar.gz" : None ,
51-     "cv-corpus-5.1-2020-06-22/sv-SE.tar.gz" : None ,
52-     "cv-corpus-5.1-2020-06-22/ru.tar.gz" : None ,
23+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz" :
24+     None ,
25+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz" :
26+     None ,
27+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz" :
28+     None ,
29+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz" :
30+     None ,
31+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz" :
32+     None ,
33+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz" :
34+     None ,
35+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz" :
36+     None ,
37+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz" :
38+     None ,
39+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz" :
40+     None ,
41+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz" :
42+     None ,
43+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz" :
44+     None ,
45+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz" :
46+     None ,
47+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz" :
48+     None ,
49+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz" :
50+     None ,
51+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz" :
52+     None ,
53+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz" :
54+     None ,
55+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz" :
56+     None ,
57+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz" :
58+     None ,
59+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz" :
60+     None ,
61+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz" :
62+     None ,
63+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz" :
64+     None ,
65+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz" :
66+     None ,
67+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz" :
68+     None ,
69+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz" :
70+     None ,
71+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz" :
72+     None ,
73+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz" :
74+     None ,
75+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz" :
76+     None ,
77+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz" :
78+     None ,
79+     "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz" :
80+     None 
5381}
5482
5583
@@ -73,18 +101,15 @@ def load_commonvoice_item(line: List[str],
73101
74102
75103class  COMMONVOICE (Dataset ):
76-     """Create a Dataset for ` CommonVoice <https://commonvoice.mozilla.org/>`_ . 
104+     """Create a Dataset for CommonVoice. 
77105
78106    Args: 
79107        root (str or Path): Path to the directory where the dataset is found or downloaded. 
80108        tsv (str, optional): The name of the tsv file used to construct the metadata. 
81109            (default: ``"train.tsv"``) 
82-         url (str, optional): Deprecated. 
83-         folder_in_archive (str, optional): The top-level directory of the dataset. 
84-         version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``) 
85-         language (str, optional): Language of the dataset. (default: None) 
86-             The following values are mapped to their corresponding shortened version: 
87-             ``"tatar"``, ``"english"``, ``"german"``, 
110+         url (str, optional): The URL to download the dataset from, or the language of 
111+             the dataset to download. (default: ``"english"``). 
112+             Allowed language values are ``"tatar"``, ``"english"``, ``"german"``, 
88113            ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``, 
89114            ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``, 
90115            ``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``, 
@@ -93,8 +118,11 @@ class COMMONVOICE(Dataset):
93118            ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``, 
94119            ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and 
95120            ``"romansh sursilvan"``. 
121+         folder_in_archive (str, optional): The top-level directory of the dataset. 
122+         version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``) 
96123            For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
97-         download (bool, optional): Deprecated. 
124+         download (bool, optional): 
125+             Whether to download the dataset if it is not found at root path. (default: ``False``). 
98126    """ 
99127
100128    _ext_txt  =  ".txt" 
@@ -104,30 +132,10 @@ class COMMONVOICE(Dataset):
104132    def  __init__ (self ,
105133                 root : Union [str , Path ],
106134                 tsv : str  =  TSV ,
107-                  url : Optional [ str ]  =  None ,
135+                  url : str  =  URL ,
108136                 folder_in_archive : str  =  FOLDER_IN_ARCHIVE ,
109137                 version : str  =  VERSION ,
110-                  language : str  =  LANGUAGE ,
111-                  download : Optional [bool ] =  False ) ->  None :
112- 
113-         if  download  is  True :
114-             raise  RuntimeError (
115-                 "The dataset is no longer publicly accessible. You need to " 
116-                 "download the archives externally and place them in the root " 
117-                 "directory." 
118-             )
119-         elif  download  is  False :
120-             warnings .warn (
121-                 "The use of the download flag is deprecated, since the dataset " 
122-                 "is no longer directly accessible." , RuntimeWarning 
123-             )
124- 
125-         if  url  is  not None :
126-             warnings .warn (
127-                 "The use of the url flag is deprecated, since the dataset " 
128-                 "is no longer publicly accessible. To specify the language of the dataset, " 
129-                 "please use the language parameter instead." , RuntimeWarning 
130-             )
138+                  download : bool  =  False ) ->  None :
131139
132140        languages  =  {
133141            "tatar" : "tt" ,
@@ -172,22 +180,12 @@ def __init__(self,
172180            "romansh sursilvan" : "rm-sursilv" 
173181        }
174182
175-         if  language  in  languages :
183+         if  url  in  languages :
176184            ext_archive  =  ".tar.gz" 
177-             language  =  languages [language ]
178-             url  =  os .path .join (version , language  +  ext_archive )
179-         else :
180-             raise  ValueError (
181-                 'Allowed language values are "tatar", "english", "german",' 
182-                 '"french", "welsh", "breton", "chuvash", "turkish", "kyrgyz",' 
183-                 '"irish", "kabyle", "catalan", "taiwanese", "slovenian",' 
184-                 '"italian", "dutch", "hakha chin", "esperanto", "estonian",' 
185-                 '"persian", "portuguese", "basque", "spanish", "chinese",' 
186-                 '"mongolian", "sakha", "dhivehi", "kinyarwanda", "swedish",' 
187-                 '"russian", "indonesian", "arabic", "tamil", "interlingua",' 
188-                 '"latvian", "japanese", "votic", "abkhaz", "cantonese" and' 
189-                 '"romansh sursilvan".' 
190-             )
185+             language  =  languages [url ]
186+ 
187+             base_url  =  "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com" 
188+             url  =  os .path .join (base_url , version , language  +  ext_archive )
191189
192190        # Get string representation of 'root' in case Path object is passed 
193191        root  =  os .fspath (root )
@@ -200,23 +198,12 @@ def __init__(self,
200198
201199        self ._path  =  os .path .join (root , folder_in_archive )
202200
203-         if  not  os .path .isdir (self ._path ):
204-             if  os .path .isfile (archive ):
205-                 checksum  =  _CHECKSUMS .get (url , None )
206-                 if  checksum :
207-                     filepath  =  os .path .basename (url )
208-                     with  open (filepath , "rb" ) as  file_obj :
209-                         if  not  validate_file (file_obj , checksum , "sha256" ):
210-                             raise  RuntimeError (
211-                                 f"The hash of { filepath }  
212-                             )
201+         if  download :
202+             if  not  os .path .isdir (self ._path ):
203+                 if  not  os .path .isfile (archive ):
204+                     checksum  =  _CHECKSUMS .get (url , None )
205+                     download_url (url , root , hash_value = checksum )
213206                extract_archive (archive )
214-             else :
215-                 raise  RuntimeError (
216-                     "The dataset is no longer publicly accessible. You need to " 
217-                     "download the archives externally and place them in the root " 
218-                     "directory." 
219-                 )
220207
221208        self ._tsv  =  os .path .join (root , folder_in_archive , tsv )
222209
0 commit comments