-
Notifications
You must be signed in to change notification settings - Fork 739
Add LibriTTS dataset #790
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add LibriTTS dataset #790
Changes from all commits
8f4abf1
52e9dd4
128673a
eb3fab4
c43a53b
1537621
d046701
cf90566
f9f48cc
9d711d9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ | |
| from torchaudio.datasets.ljspeech import LJSPEECH | ||
| from torchaudio.datasets.gtzan import GTZAN | ||
| from torchaudio.datasets.cmuarctic import CMUARCTIC | ||
| from torchaudio.datasets.libritts import LIBRITTS | ||
|
|
||
| from .common_utils import ( | ||
| TempDirMixin, | ||
|
|
@@ -110,5 +111,67 @@ def test_yesno(self): | |
| assert label == expected_label | ||
|
|
||
|
|
||
class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
    """Smoke test for the LIBRITTS dataset against a synthetic on-disk fixture.

    ``setUpClass`` builds a miniature ``LibriTTS/train-clean-100`` directory
    tree containing seeded white-noise wav files plus matching
    ``.original.txt`` / ``.normalized.txt`` transcripts; ``test_libritts``
    then checks that iterating ``LIBRITTS`` yields them back unchanged.
    """

    backend = 'default'

    root_dir = None
    # Normalized copies of the generated waveforms, appended in fixture order.
    data = []
    # Each entry: [speaker_id, chapter_id, segment_id, utterance_id].
    utterance_ids = [
        [19, 198, '000000', '000000'],
        [26, 495, '000004', '000000'],
    ]
    original_text = 'this is the original text.'
    normalized_text = 'this is the normalized text.'

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        base_dir = os.path.join(cls.root_dir, 'LibriTTS', 'train-clean-100')
        for i, utterance_id in enumerate(cls.utterance_ids):
            # File stem shared by the audio and both transcript files.
            file_stem = '_'.join(str(u) for u in utterance_id)
            file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
            os.makedirs(file_dir, exist_ok=True)

            # Deterministic white noise (seeded per utterance) as audio content.
            path = os.path.join(file_dir, f'{file_stem}.wav')
            data = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=i)
            save_wav(path, data, 8000)
            cls.data.append(normalize_wav(data))

            # Context managers guarantee the transcript files are closed even
            # if a write fails (the original used bare open()/close()).
            with open(os.path.join(file_dir, f'{file_stem}.original.txt'), 'w') as f:
                f.write(cls.original_text)
            with open(os.path.join(file_dir, f'{file_stem}.normalized.txt'), 'w') as f:
                f.write(cls.normalized_text)

    def test_libritts(self):
        dataset = LIBRITTS(self.root_dir)
        samples = list(dataset)
        # Filesystem traversal order is platform-dependent; sort by speaker_id
        # (tuple index 4) so samples line up with cls.utterance_ids.
        samples.sort(key=lambda s: s[4])

        for i, (waveform,
                sample_rate,
                original_text,
                normalized_text,
                speaker_id,
                chapter_id,
                utterance_id) in enumerate(samples):
            expected_ids = self.utterance_ids[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 8000
            assert speaker_id == expected_ids[0]
            assert chapter_id == expected_ids[1]
            assert original_text == self.original_text
            assert normalized_text == self.normalized_text
            assert utterance_id == '_'.join(str(u) for u in expected_ids[-4:])


if __name__ == "__main__":
    unittest.main()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,131 @@ | ||
| import os | ||
| from typing import Tuple | ||
|
|
||
| import torchaudio | ||
| from torch import Tensor | ||
| from torch.utils.data import Dataset | ||
| from torchaudio.datasets.utils import ( | ||
| download_url, | ||
| extract_archive, | ||
| walk_files, | ||
| ) | ||
|
|
||
| URL = "train-clean-100" | ||
| FOLDER_IN_ARCHIVE = "LibriTTS" | ||
| _CHECKSUMS = { | ||
| "http://www.openslr.org/60/dev-clean.tar.gz": "0c3076c1e5245bb3f0af7d82087ee207", | ||
| "http://www.openslr.org/60/dev-other.tar.gz": "815555d8d75995782ac3ccd7f047213d", | ||
| "http://www.openslr.org/60/test-clean.tar.gz": "7bed3bdb047c4c197f1ad3bc412db59f", | ||
| "http://www.openslr.org/60/test-other.tar.gz": "ae3258249472a13b5abef2a816f733e4", | ||
| "http://www.openslr.org/60/train-clean-100.tar.gz": "4a8c202b78fe1bc0c47916a98f3a2ea8", | ||
| "http://www.openslr.org/60/train-clean-360.tar.gz": "a84ef10ddade5fd25df69596a2767b2d", | ||
| "http://www.openslr.org/60/train-other-500.tar.gz": "7b181dd5ace343a5f38427999684aa6f", | ||
| } | ||
|
|
||
|
|
||
def load_libritts_item(
    fileid: str,
    path: str,
    ext_audio: str,
    ext_original_txt: str,
    ext_normalized_txt: str,
) -> Tuple[Tensor, int, str, str, int, int, str]:
    """Load one LibriTTS example given its file id.

    Args:
        fileid: ``"<speaker>_<chapter>_<segment>_<utterance>"`` stem shared by
            the audio file and both transcript files.
        path: Root of the subset (e.g. ``.../LibriTTS/train-clean-100``).
        ext_audio: Audio file extension (``".wav"``).
        ext_original_txt: Original-transcript extension (``".original.txt"``).
        ext_normalized_txt: Normalized-transcript extension
            (``".normalized.txt"``).

    Returns:
        Tuple of ``(waveform, sample_rate, original_text, normalized_text,
        speaker_id, chapter_id, utterance_id)``.
    """
    # The file id encodes the directory layout: <speaker>/<chapter>/<fileid>.*
    speaker_id, chapter_id, _segment_id, _ = fileid.split("_")
    utterance_id = fileid

    # Distinct names for paths vs. loaded contents (the original rebound the
    # same variable from path string to file content, which obscured intent).
    file_dir = os.path.join(path, speaker_id, chapter_id)
    original_text_path = os.path.join(file_dir, utterance_id + ext_original_txt)
    normalized_text_path = os.path.join(file_dir, utterance_id + ext_normalized_txt)
    file_audio = os.path.join(file_dir, utterance_id + ext_audio)

    # Load audio
    waveform, sample_rate = torchaudio.load(file_audio)

    # Load transcripts. LibriTTS ships UTF-8 text; be explicit instead of
    # relying on the platform default encoding (e.g. cp1252 on Windows).
    with open(original_text_path, encoding="utf-8") as ft:
        original_text = ft.readline()

    with open(normalized_text_path, encoding="utf-8") as ft:
        normalized_text = ft.readline()

    return (
        waveform,
        sample_rate,
        original_text,
        normalized_text,
        int(speaker_id),
        int(chapter_id),
        utterance_id,
    )
|
|
||
|
|
||
class LIBRITTS(Dataset):
    """Create a Dataset for LibriTTS.

    Each item is a tuple of the form:
    ``(waveform, sample_rate, original_text, normalized_text,
    speaker_id, chapter_id, utterance_id)``

    Args:
        root: Directory where the dataset is found or downloaded to.
        url: Subset name (one of ``"dev-clean"``, ``"dev-other"``,
            ``"test-clean"``, ``"test-other"``, ``"train-clean-100"``,
            ``"train-clean-360"``, ``"train-other-500"``) or a full archive URL.
        folder_in_archive: Top-level folder name inside the archive.
        download: Whether to download the archive if it is not found locally.
    """

    _ext_original_txt = ".original.txt"
    _ext_normalized_txt = ".normalized.txt"
    _ext_audio = ".wav"

    def __init__(
        self,
        root: str,
        url: str = URL,
        folder_in_archive: str = FOLDER_IN_ARCHIVE,
        download: bool = False,
    ) -> None:

        if url in [
            "dev-clean",
            "dev-other",
            "test-clean",
            "test-other",
            "train-clean-100",
            "train-clean-360",
            "train-other-500",
        ]:
            ext_archive = ".tar.gz"
            base_url = "http://www.openslr.org/resources/60/"

            # Plain string concatenation, NOT os.path.join: URLs always use
            # forward slashes, while os.path.join would insert a backslash on
            # Windows and produce an invalid URL.
            url = base_url + url + ext_archive

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        # Strip the archive extension to get the subset folder name.
        basename = basename.split(".")[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    # NOTE(review): _CHECKSUMS keys omit the "resources/" path
                    # segment, so this lookup likely returns None and the hash
                    # check is skipped — verify the key URLs match.
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        walker = walk_files(
            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
        )
        self._walker = list(walker)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]:
        """Load and return the n-th example (see the class docstring)."""
        fileid = self._walker[n]
        return load_libritts_item(
            fileid,
            self._path,
            self._ext_audio,
            self._ext_original_txt,
            self._ext_normalized_txt,
        )

    def __len__(self) -> int:
        """Number of audio files found under the subset directory."""
        return len(self._walker)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jimchen90 I am checking Librispeech
train-clean-100, but I do not see a file with 8 kHz. Most of them seem to be 16000 Hz. Can you confirm? Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mthrok You are right, I checked the published papers of LibriSpeech and LibriTTS linked from their website. They are 16 kHz for LibriSpeech and 24 kHz for LibriTTS.
In abstract of LibriSpeech paper, it mentions ' The LibriSpeech corpus is derived from audiobooks that are part of the LibriVox project, and contains 1000 hours of speech sampled at 16 kHz.'
In abstract of LibriTTS paper, it mentions 'The released corpus consists of 585 hours of speech data at 24kHz sampling rate from 2,456 speakers and the corresponding texts.'
I will open a pull request to update the sample rate used here from 8000 Hz to 24 kHz.