From 564b80b95f3b65d1500fd9f9392d1b8fccf21f2f Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 5 Nov 2019 16:06:50 -0500 Subject: [PATCH 1/4] close file. --- torchaudio/datasets/librispeech.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/torchaudio/datasets/librispeech.py b/torchaudio/datasets/librispeech.py index e9919ba1a3..61178249e3 100644 --- a/torchaudio/datasets/librispeech.py +++ b/torchaudio/datasets/librispeech.py @@ -29,13 +29,14 @@ def load_librispeech_item(fileid, path, ext_audio, ext_txt): waveform, sample_rate = torchaudio.load(file_audio) # Load text - for line in open(file_text): - fileid_text, content = line.strip().split(" ", 1) - if fileid_audio == fileid_text: - break - else: - # Translation not found - raise FileNotFoundError("Translation not found for " + fileid_audio) + with open(file_text) as ft: + for line in ft: + fileid_text, content = line.strip().split(" ", 1) + if fileid_audio == fileid_text: + break + else: + # Translation not found + raise FileNotFoundError("Translation not found for " + fileid_audio) return { "speaker_id": speaker, From 2492ed448b9cf9bfb22e418e839058b10bc4651c Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 5 Nov 2019 16:08:28 -0500 Subject: [PATCH 2/4] staying with datapoints as tuples until further notice. --- test/test_datasets.py | 4 +-- torchaudio/datasets/commonvoice.py | 21 ++++++++----- torchaudio/datasets/librispeech.py | 35 ++++++++++++---------- torchaudio/datasets/vctk.py | 48 +++++++++--------------------- torchaudio/datasets/yesno.py | 35 ++++++++-------------- 5 files changed, 60 insertions(+), 83 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 8f35a27b56..54b611244c 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -15,11 +15,11 @@ class TestDatasets(unittest.TestCase): path = os.path.join(test_dirpath, "assets") def test_yesno(self): - data = YESNO(self.path, return_dict=True) + data = YESNO(self.path) data[0] def test_vctk(self): - data = VCTK(self.path, return_dict=True) + data = VCTK(self.path) data[0] def test_librispeech(self): diff --git a/torchaudio/datasets/commonvoice.py b/torchaudio/datasets/commonvoice.py index 94f19cb72a..a1676e0c7a 100644 --- a/torchaudio/datasets/commonvoice.py +++ b/torchaudio/datasets/commonvoice.py @@ -2,7 +2,8 @@ import torchaudio from torch.utils.data import Dataset -from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader +from torchaudio.datasets.utils import (download_url, extract_archive, + unicode_csv_reader) # Default TSV should be one of # dev.tsv @@ -16,21 +17,25 @@ TSV = "train.tsv" -def load_commonvoice_item(line, header, path, folder_audio): +def load_commonvoice_item(line, path, folder_audio): + # Each line as the following data: + # client_id, path, sentence, up_votes, down_votes, age, gender, accent fileid = line[1] filename = os.path.join(path, folder_audio, fileid) waveform, sample_rate = torchaudio.load(filename) - dic = dict(zip(header, line)) - dic["waveform"] = waveform - dic["sample_rate"] = sample_rate - - return dic + return(waveform, sample_rate, *line) class COMMONVOICE(Dataset): + """ + Create a Dataset for CommonVoice. Each item is a tuple of the form: + (waveform, sample_rate, client_id, path, sentence, + up_votes, down_votes, age, gender, accent) + following the format of the tsv-files. + """ _ext_txt = ".txt" _ext_audio = ".mp3" @@ -99,7 +104,7 @@ def __init__(self, root, tsv=TSV, url=URL, download=False): def __getitem__(self, n): line = self._walker[n] - return load_commonvoice_item(line, self._header, self._path, self._folder_audio) + return load_commonvoice_item(line, self._path, self._folder_audio) def __len__(self): return len(self._walker) diff --git a/torchaudio/datasets/librispeech.py b/torchaudio/datasets/librispeech.py index 61178249e3..f3d92467f8 100644 --- a/torchaudio/datasets/librispeech.py +++ b/torchaudio/datasets/librispeech.py @@ -1,8 +1,7 @@ import os -from torch.utils.data import Dataset - import torchaudio +from torch.utils.data import Dataset from torchaudio.datasets.utils import ( download_url, extract_archive, @@ -16,14 +15,14 @@ def load_librispeech_item(fileid, path, ext_audio, ext_txt): - speaker, chapter, utterance = fileid.split("-") + speaker_id, chapter_id, utterance_id = fileid.split("-") - file_text = speaker + "-" + chapter + ext_txt - file_text = os.path.join(path, speaker, chapter, file_text) + file_text = speaker_id + "-" + chapter_id + ext_txt + file_text = os.path.join(path, speaker_id, chapter_id, file_text) - fileid_audio = speaker + "-" + chapter + "-" + utterance + fileid_audio = speaker_id + "-" + chapter_id + "-" + utterance_id file_audio = fileid_audio + ext_audio - file_audio = os.path.join(path, speaker, chapter, file_audio) + file_audio = os.path.join(path, speaker_id, chapter_id, file_audio) # Load audio waveform, sample_rate = torchaudio.load(file_audio) @@ -31,24 +30,28 @@ def load_librispeech_item(fileid, path, ext_audio, ext_txt): # Load text with open(file_text) as ft: for line in ft: - fileid_text, content = line.strip().split(" ", 1) + fileid_text, utterance = line.strip().split(" ", 1) if fileid_audio == fileid_text: break else: # Translation not found raise FileNotFoundError("Translation not found for " + fileid_audio) - return { - "speaker_id": speaker, - "chapter_id": chapter, - "utterance_id": utterance, - "utterance": content, - "waveform": waveform, - "sample_rate": sample_rate, - } + return ( + waveform, + sample_rate, + utterance, + int(speaker_id), + int(chapter_id), + int(utterance_id), + ) class LIBRISPEECH(Dataset): + """ + Create a Dataset for LibriSpeech. Each item is a tuple of the form: + waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id + """ _ext_txt = ".trans.txt" _ext_audio = ".flac" diff --git a/torchaudio/datasets/vctk.py b/torchaudio/datasets/vctk.py index 279e83fa1d..813a9df62a 100644 --- a/torchaudio/datasets/vctk.py +++ b/torchaudio/datasets/vctk.py @@ -12,15 +12,15 @@ def load_vctk_item( fileid, path, ext_audio, ext_txt, folder_audio, folder_txt, downsample=False ): - speaker, utterance = fileid.split("_") + speaker_id, utterance_id = fileid.split("_") # Read text - file_txt = os.path.join(path, folder_txt, speaker, fileid + ext_txt) + file_txt = os.path.join(path, folder_txt, speaker_id, fileid + ext_txt) with open(file_txt) as file_text: - content = file_text.readlines()[0] + utterance = file_text.readlines()[0] # Read wav - file_audio = os.path.join(path, folder_audio, speaker, fileid + ext_audio) + file_audio = os.path.join(path, folder_audio, speaker_id, fileid + ext_audio) if downsample: # Legacy E = torchaudio.sox_effects.SoxEffectsChain() @@ -34,16 +34,14 @@ def load_vctk_item( else: waveform, sample_rate = torchaudio.load(file_audio) - return { - "speaker_id": speaker, - "utterance_id": utterance, - "utterance": content, - "waveform": waveform, - "sample_rate": sample_rate, - } + return waveform, sample_rate, utterance, speaker_id, utterance_id class VCTK(Dataset): + """ + Create a Dataset for VCTK. Each item is a tuple of the form: + (waveform, sample_rate, utterance, speaker_id, utterance_id) + """ _folder_txt = "txt" _folder_audio = "wav48" @@ -59,17 +57,8 @@ def __init__( downsample=False, transform=None, target_transform=None, - return_dict=False, ): - if not return_dict: - warnings.warn( - "In the next version, the item returned will be a dictionary. " - "Please use `return_dict=True` to enable this behavior now, " - "and suppress this warning.", - DeprecationWarning, - ) - if downsample: warnings.warn( "In the next version, transforms will not be part of the dataset. " @@ -89,7 +78,6 @@ def __init__( self.downsample = downsample self.transform = transform self.target_transform = target_transform - self.return_dict = return_dict archive = os.path.basename(url) archive = os.path.join(root, archive) @@ -122,23 +110,15 @@ def __getitem__(self, n): self._folder_txt, ) - # Legacy - waveform = item["waveform"] + # TODO Upon deprecation, uncomment line below and remove following code + # return item + + waveform, sample_rate, utterance, speaker_id, utterance_id = item if self.transform is not None: waveform = self.transform(waveform) - item["waveform"] = waveform - - # Legacy - utterance = item["utterance"] if self.target_transform is not None: utterance = self.target_transform(utterance) - item["utterance"] = utterance - - if self.return_dict: - return item - - # Legacy - return item["waveform"], item["utterance"] + return waveform, sample_rate, utterance, speaker_id, utterance_id def __len__(self): return len(self._walker) diff --git a/torchaudio/datasets/yesno.py b/torchaudio/datasets/yesno.py index dd1b9e180e..01bf8a3d15 100644 --- a/torchaudio/datasets/yesno.py +++ b/torchaudio/datasets/yesno.py @@ -11,16 +11,20 @@ def load_yesno_item(fileid, path, ext_audio): # Read label - label = fileid.split("_") + labels = [int(c) for c in fileid.split("_")] # Read wav file_audio = os.path.join(path, fileid + ext_audio) waveform, sample_rate = torchaudio.load(file_audio) - return {"label": label, "waveform": waveform, "sample_rate": sample_rate} + return waveform, sample_rate, labels class YESNO(Dataset): + """ + Create a Dataset for YesNo. Each item is a tuple of the form: + (waveform, sample_rate, labels) + """ _ext_audio = ".wav" @@ -32,17 +36,8 @@ def __init__( download=False, transform=None, target_transform=None, - return_dict=False, ): - if not return_dict: - warnings.warn( - "In the next version, the item returned will be a dictionary. " - "Please use `return_dict=True` to enable this behavior now, " - "and suppress this warning.", - DeprecationWarning, - ) - if transform is not None or target_transform is not None: warnings.warn( "In the next version, transforms will not be part of the dataset. " @@ -53,7 +48,6 @@ def __init__( self.transform = transform self.target_transform = target_transform - self.return_dict = return_dict archive = os.path.basename(url) archive = os.path.join(root, archive) @@ -79,20 +73,15 @@ def __getitem__(self, n): fileid = self._walker[n] item = load_yesno_item(fileid, self._path, self._ext_audio) - waveform = item["waveform"] + # TODO Upon deprecation, uncomment line below and remove following code + # return item + + waveform, sample_rate, labels = item if self.transform is not None: waveform = self.transform(waveform) - item["waveform"] = waveform - - label = item["label"] if self.target_transform is not None: - label = self.target_transform(label) - item["label"] = label - - if self.return_dict: - return item - - return item["waveform"], item["label"] + labels = self.target_transform(labels) + return waveform, sample_rate, labels def __len__(self): return len(self._walker) From e71004015e679adbbd1d9a9f45218c07c9159cbd Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 6 Nov 2019 11:49:45 -0500 Subject: [PATCH 3/4] loading tsv as dict instead. --- torchaudio/datasets/commonvoice.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/torchaudio/datasets/commonvoice.py b/torchaudio/datasets/commonvoice.py index a1676e0c7a..fbc099e75a 100644 --- a/torchaudio/datasets/commonvoice.py +++ b/torchaudio/datasets/commonvoice.py @@ -1,9 +1,9 @@ import os -import torchaudio from torch.utils.data import Dataset -from torchaudio.datasets.utils import (download_url, extract_archive, - unicode_csv_reader) + +import torchaudio +from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader # Default TSV should be one of # dev.tsv @@ -17,24 +17,28 @@ TSV = "train.tsv" -def load_commonvoice_item(line, path, folder_audio): +def load_commonvoice_item(line, header, path, folder_audio): # Each line as the following data: # client_id, path, sentence, up_votes, down_votes, age, gender, accent + + assert header[1] == "path" fileid = line[1] filename = os.path.join(path, folder_audio, fileid) waveform, sample_rate = torchaudio.load(filename) - return(waveform, sample_rate, *line) + dic = dict(zip(header, line)) + + return waveform, sample_rate, dic class COMMONVOICE(Dataset): """ Create a Dataset for CommonVoice. Each item is a tuple of the form: - (waveform, sample_rate, client_id, path, sentence, - up_votes, down_votes, age, gender, accent) - following the format of the tsv-files. + (waveform, sample_rate, dict) + where dict is a dictionary built from the tsv file with the following keys: + client_id, path, sentence, up_votes, down_votes, age, gender, accent. """ _ext_txt = ".txt" @@ -104,7 +108,7 @@ def __init__(self, root, tsv=TSV, url=URL, download=False): def __getitem__(self, n): line = self._walker[n] - return load_commonvoice_item(line, self._path, self._folder_audio) + return load_commonvoice_item(line, self._header, self._path, self._folder_audio) def __len__(self): return len(self._walker) From 5b2fd93121e7ec06daf87f85ab389484e07898eb Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 6 Nov 2019 11:52:30 -0500 Subject: [PATCH 4/4] change var name. --- torchaudio/datasets/commonvoice.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchaudio/datasets/commonvoice.py b/torchaudio/datasets/commonvoice.py index fbc099e75a..6caf3f6e5f 100644 --- a/torchaudio/datasets/commonvoice.py +++ b/torchaudio/datasets/commonvoice.py @@ -36,8 +36,8 @@ def load_commonvoice_item(line, header, path, folder_audio): class COMMONVOICE(Dataset): """ Create a Dataset for CommonVoice. Each item is a tuple of the form: - (waveform, sample_rate, dict) - where dict is a dictionary built from the tsv file with the following keys: + (waveform, sample_rate, dictionary) + where dictionary is a dictionary built from the tsv file with the following keys: client_id, path, sentence, up_votes, down_votes, age, gender, accent. """