Skip to content

Commit aa56d30

Browse files
authored
Fix CommonVoice for French (#1126)
Resolves #1125 where dataset metadata does not contain an extension.
1 parent 9c48402 commit aa56d30

File tree

3 files changed

+133
-42
lines changed

3 files changed

+133
-42
lines changed
Lines changed: 112 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
import os
21
import csv
2+
import os
33
from pathlib import Path
4+
from typing import Dict, List, Tuple
45

5-
from torchaudio.datasets import COMMONVOICE
6+
from torch import Tensor
67
from torchaudio_unittest.common_utils import (
78
TempDirMixin,
89
TorchaudioTestCase,
@@ -11,55 +12,104 @@
1112
normalize_wav,
1213
)
1314

15+
from torchaudio.datasets import COMMONVOICE
1416

15-
class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
16-
backend = 'default'
17+
# Audio extension COMMONVOICE is configured with when this module is imported;
# the test classes restore it after temporarily overriding it.
_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio

# Sample rate used for every generated mock clip.
_SAMPLE_RATE = 48000

# Column names written as the first row of the mock TSV file.
_HEADERS = [
    u"client_ids",
    u"path",
    u"sentence",
    u"up_votes",
    u"down_votes",
    u"age",
    u"gender",
    u"accent",
]
1720

18-
root_dir = None
19-
data = []
20-
_headers = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
21+
22+
def get_mock_dataset_en(root_dir) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """Create a mock English CommonVoice dataset under ``root_dir``.

    Writes ``train.tsv`` (header row plus one row per sample) and one
    white-noise audio file per row into ``clips/``.

    Args:
        root_dir: Directory in which the TSV file and ``clips`` folder are created.

    Returns:
        One ``(waveform, sample_rate, metadata)`` tuple per TSV row, in row
        order, where ``waveform`` is the normalized generated audio and
        ``metadata`` maps each header to the row's value.
    """
    mocked_data = []
    # Note: extension is changed to wav for the sake of test
    # Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
    _en_train_csv_contents = [
        ["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
         "common_voice_en_18885784.wav",
         "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.", "2", "0", "", "",
         ""],
        ["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
         "common_voice_en_556542.wav", "Once more into the breach", "2", "0", "thirties", "male", "us"],
        ["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
         "common_voice_en_18607573.wav",
         "Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
    ]
    # Tsv file name difference does not mean different subset, testing as a whole dataset here
    tsv_filename = os.path.join(root_dir, "train.tsv")
    audio_base_path = os.path.join(root_dir, "clips")
    os.makedirs(audio_base_path, exist_ok=True)
    with open(tsv_filename, "w", newline='') as tsv:
        writer = csv.writer(tsv, delimiter='\t')
        writer.writerow(_HEADERS)
        for i, content in enumerate(_en_train_csv_contents):
            writer.writerow(content)
            # Generate and store audio; the row index seeds the noise so each clip differs
            audio_path = os.path.join(audio_base_path, content[1])
            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
            save_wav(audio_path, data, _SAMPLE_RATE)
            # Append data entry
            mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
    return mocked_data
53+
54+
55+
def get_mock_dataset_fr(root_dir) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """Create a mock French CommonVoice dataset under ``root_dir``.

    Unlike the English mock data, the ``path`` column here carries no file
    extension, so each audio file is stored as ``path + _ORIGINAL_EXT_AUDIO``.

    Args:
        root_dir: Directory in which the TSV file and ``clips`` folder are created.

    Returns:
        One ``(waveform, sample_rate, metadata)`` tuple per TSV row, in row
        order, where ``waveform`` is the normalized generated audio and
        ``metadata`` maps each header to the row's value as written to the TSV.
    """
    mocked_data = []
    _fr_train_csv_contents = [
        [
            "a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
            "18343441c601cae0597a4b0d3144",
            "89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
            "16cebac98ee5349e3e8262cb9329",
            "Or sur ce point nous n’avons aucune réponse de votre part.", "2", "0", "twenties", "male", "france"],
        [
            "a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
            "343441c601cae0597a4b0d3144",
            "87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
            "cbd395acbdfcfa9d76a6e199bbd",
            "Monsieur de La Verpillière, laissez parler le ministre", "2", "0", "twenties", "male", "france"],
    ]
    # Tsv file name difference does not mean different subset, testing as a whole dataset here
    tsv_filename = os.path.join(root_dir, "train.tsv")
    audio_base_path = os.path.join(root_dir, "clips")
    os.makedirs(audio_base_path, exist_ok=True)
    with open(tsv_filename, "w", newline='') as tsv:
        writer = csv.writer(tsv, delimiter='\t')
        writer.writerow(_HEADERS)
        for i, content in enumerate(_fr_train_csv_contents):
            # The sentence is replaced by the repr of its UTF-8 bytes ("b'...'"), so the
            # same text lands in both the TSV row and the expected metadata.
            # NOTE(review): presumably done to keep the file ASCII-only under any
            # platform default encoding -- confirm before changing.
            content[2] = str(content[2].encode("utf-8"))
            writer.writerow(content)
            # Generate and store audio; the extension is appended because `path` has none
            audio_path = os.path.join(audio_base_path, content[1] + _ORIGINAL_EXT_AUDIO)
            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
            save_wav(audio_path, data, _SAMPLE_RATE)

            # Append data entry
            mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
    return mocked_data
90+
91+
92+
class TestCommonVoiceEN(TempDirMixin, TorchaudioTestCase):
93+
backend = 'default'
94+
root_dir = None
3495

3596
@classmethod
def setUpClass(cls):
    """Build the English mock dataset once and point COMMONVOICE at ``.wav`` files."""
    # Chain to the parent classes so any class-level setup they define still runs.
    super().setUpClass()
    cls.root_dir = cls.get_base_temp_dir()
    cls.data = get_mock_dataset_en(cls.root_dir)
    # The mock clips are stored as wav, so override the dataset's expected extension.
    COMMONVOICE._ext_audio = ".wav"
101+
102+
@classmethod
def tearDownClass(cls):
    """Restore the extension COMMONVOICE had before this test class ran."""
    COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO
    # Chain to the parent classes so any class-level cleanup they define
    # (e.g. by the temp-dir mixin) is not skipped by this override.
    super().tearDownClass()
55105

56106
def _test_commonvoice(self, dataset):
    """Check every sample of ``dataset`` against the mock entries in ``self.data``."""
    seen = 0
    for idx, sample in enumerate(dataset):
        waveform, sample_rate, dictionary = sample
        expected = self.data[idx]
        # Entry layout: (waveform, sample_rate, metadata dict)
        self.assertEqual(expected[0], waveform, atol=5e-5, rtol=1e-8)
        assert sample_rate == _SAMPLE_RATE
        assert dictionary == expected[2]
        seen += 1
    # The dataset must yield exactly as many samples as were mocked.
    assert seen == len(self.data)
@@ -71,3 +121,33 @@ def test_commonvoice_str(self):
71121
def test_commonvoice_path(self):
    """The dataset accepts its root directory as a ``pathlib.Path``."""
    root = Path(self.root_dir)
    self._test_commonvoice(COMMONVOICE(root))
124+
125+
126+
class TestCommonVoiceFR(TempDirMixin, TorchaudioTestCase):
    """Tests COMMONVOICE on mock French data whose ``path`` column has no extension."""
    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        """Build the French mock dataset once and set the expected extension to ``.mp3``."""
        # Chain to the parent classes so any class-level setup they define still runs.
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        cls.data = get_mock_dataset_fr(cls.root_dir)
        COMMONVOICE._ext_audio = ".mp3"

    @classmethod
    def tearDownClass(cls):
        """Restore the extension COMMONVOICE had before this test class ran."""
        COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO
        # Chain to the parent classes so any class-level cleanup they define
        # (e.g. by the temp-dir mixin) is not skipped by this override.
        super().tearDownClass()

    def _test_commonvoice(self, dataset):
        """Check every sample of ``dataset`` against the mock entries in ``self.data``."""
        n_ite = 0
        for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
            expected_dictionary = self.data[i][2]
            expected_data = self.data[i][0]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == _SAMPLE_RATE
            assert dictionary == expected_dictionary
            n_ite += 1
        # The dataset must yield exactly as many samples as were mocked.
        assert n_ite == len(self.data)

    def test_commonvoice_str(self):
        """The dataset accepts its root directory as a plain string."""
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)

test/torchaudio_unittest/datasets/utils_test.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,23 @@
1-
from torchaudio.datasets import utils as dataset_utils
2-
from torchaudio.datasets.commonvoice import COMMONVOICE
3-
41
from torchaudio_unittest.common_utils import (
5-
TempDirMixin,
62
TorchaudioTestCase,
73
get_asset_path,
84
)
95

6+
from torchaudio.datasets import utils as dataset_utils
7+
from torchaudio.datasets.commonvoice import COMMONVOICE
8+
9+
original_ext_audio = COMMONVOICE._ext_audio
10+
1011

1112
class TestIterator(TorchaudioTestCase):
13+
@classmethod
def setUpClass(cls):
    """Point COMMONVOICE at ``.wav`` files for the duration of this test class."""
    # Chain to the parent classes so any class-level setup they define still runs.
    super().setUpClass()
    COMMONVOICE._ext_audio = ".wav"
16+
17+
@classmethod
def tearDownClass(cls):
    """Restore the extension COMMONVOICE had when this module was imported."""
    COMMONVOICE._ext_audio = original_ext_audio
    # Chain to the parent classes so any class-level cleanup they define is not skipped.
    super().tearDownClass()
20+
1221
backend = 'default'
1322
path = get_asset_path('CommonVoice', 'cv-corpus-4-2019-12-10', 'tt')
1423

torchaudio/datasets/commonvoice.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,28 @@
1-
import os
21
import csv
2+
import os
33
import warnings
44
from pathlib import Path
55
from typing import List, Dict, Tuple, Union, Optional
66

7-
import torchaudio
87
from torch import Tensor
98
from torch.utils.data import Dataset
109

10+
import torchaudio
11+
1112

1213
def load_commonvoice_item(line: List[str],
1314
header: List[str],
1415
path: str,
15-
folder_audio: str) -> Tuple[Tensor, int, Dict[str, str]]:
16+
folder_audio: str,
17+
ext_audio: str) -> Tuple[Tensor, int, Dict[str, str]]:
1618
# Each line has the following data:
1719
# client_id, path, sentence, up_votes, down_votes, age, gender, accent
1820

1921
assert header[1] == "path"
2022
fileid = line[1]
21-
2223
filename = os.path.join(path, folder_audio, fileid)
23-
24+
if not filename.endswith(ext_audio):
25+
filename += ext_audio
2426
waveform, sample_rate = torchaudio.load(filename)
2527

2628
dic = dict(zip(header, line))
@@ -95,7 +97,7 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, Dict[str, str]]:
9597
``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``.
9698
"""
9799
line = self._walker[n]
98-
return load_commonvoice_item(line, self._header, self._path, self._folder_audio)
100+
return load_commonvoice_item(line, self._header, self._path, self._folder_audio, self._ext_audio)
99101

100102
def __len__(self) -> int:
    """Return the number of entries held in ``self._walker``."""
    entries = self._walker
    return len(entries)

0 commit comments

Comments
 (0)