Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/source/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ LIBRISPEECH
:special-members:


LIBRITTS
~~~~~~~~

.. autoclass:: LIBRITTS
:members: __getitem__
:special-members:


LJSPEECH
~~~~~~~~

Expand Down
63 changes: 63 additions & 0 deletions test/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from torchaudio.datasets.ljspeech import LJSPEECH
from torchaudio.datasets.gtzan import GTZAN
from torchaudio.datasets.cmuarctic import CMUARCTIC
from torchaudio.datasets.libritts import LIBRITTS

from .common_utils import (
TempDirMixin,
Expand Down Expand Up @@ -110,5 +111,67 @@ def test_yesno(self):
assert label == expected_label


class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
    """Tests LIBRITTS against a synthetic on-disk replica of the dataset layout."""
    backend = 'default'

    root_dir = None
    data = []
    # [speaker_id, chapter_id, segment_id, sub-segment id] per utterance.
    # Joined with '_' they form both the file stem and the utterance_id.
    utterance_ids = [
        [19, 198, '000000', '000000'],
        [26, 495, '000004', '000000'],
    ]
    original_text = 'this is the original text.'
    normalized_text = 'this is the normalized text.'

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        base_dir = os.path.join(cls.root_dir, 'LibriTTS', 'train-clean-100')
        for i, utterance_id in enumerate(cls.utterance_ids):
            stem = '_'.join(str(u) for u in utterance_id)
            file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
            os.makedirs(file_dir, exist_ok=True)
            path = os.path.join(file_dir, f'{stem}.wav')

            # NOTE(review): real LibriTTS audio is 24 kHz; 8 kHz is used here
            # only to keep the fixture small — confirm the dataset class does
            # not hard-code a sample rate.
            data = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=i)
            save_wav(path, data, 8000)
            cls.data.append(normalize_wav(data))

            # Write the paired transcript files the dataset expects next to
            # the audio, using context managers so handles are always closed.
            path_original = os.path.join(file_dir, f'{stem}.original.txt')
            with open(path_original, 'w') as f:
                f.write(cls.original_text)

            path_normalized = os.path.join(file_dir, f'{stem}.normalized.txt')
            with open(path_normalized, 'w') as f:
                f.write(cls.normalized_text)

    def test_libritts(self):
        dataset = LIBRITTS(self.root_dir)
        # FIXME: walk_files yields files in unspecified order (see #792), so
        # sort by speaker_id (index 4 of each sample tuple) before comparing
        # against the fixtures, which are listed in speaker_id order.
        samples = list(dataset)
        samples.sort(key=lambda s: s[4])

        for i, (waveform,
                sample_rate,
                original_text,
                normalized_text,
                speaker_id,
                chapter_id,
                utterance_id) in enumerate(samples):

            expected_ids = self.utterance_ids[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 8000
            assert speaker_id == expected_ids[0]
            assert chapter_id == expected_ids[1]
            assert original_text == self.original_text
            assert normalized_text == self.normalized_text
            # utterance_id is the full file stem (all four id components).
            assert utterance_id == '_'.join(str(u) for u in expected_ids)


# Allow running this test module directly with `python`.
if __name__ == "__main__":
    unittest.main()
2 changes: 2 additions & 0 deletions torchaudio/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .yesno import YESNO
from .ljspeech import LJSPEECH
from .cmuarctic import CMUARCTIC
from .libritts import LIBRITTS

__all__ = (
"COMMONVOICE",
Expand All @@ -17,6 +18,7 @@
"LJSPEECH",
"GTZAN",
"CMUARCTIC",
    "LIBRITTS",
"diskcache_iterator",
"bg_iterator",
)
131 changes: 131 additions & 0 deletions torchaudio/datasets/libritts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import os
from typing import Tuple

import torchaudio
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio.datasets.utils import (
download_url,
extract_archive,
walk_files,
)

# Default subset downloaded when none is specified.
URL = "train-clean-100"
# Directory name at the top of the extracted archive.
FOLDER_IN_ARCHIVE = "LibriTTS"
# MD5 checksums keyed by download URL.
# NOTE(review): LIBRITTS.__init__ builds URLs under
# "http://www.openslr.org/resources/60/", but these keys omit "resources/",
# so `_CHECKSUMS.get(url)` will always return None — verify and align the keys.
_CHECKSUMS = {
    "http://www.openslr.org/60/dev-clean.tar.gz": "0c3076c1e5245bb3f0af7d82087ee207",
    "http://www.openslr.org/60/dev-other.tar.gz": "815555d8d75995782ac3ccd7f047213d",
    "http://www.openslr.org/60/test-clean.tar.gz": "7bed3bdb047c4c197f1ad3bc412db59f",
    "http://www.openslr.org/60/test-other.tar.gz": "ae3258249472a13b5abef2a816f733e4",
    "http://www.openslr.org/60/train-clean-100.tar.gz": "4a8c202b78fe1bc0c47916a98f3a2ea8",
    "http://www.openslr.org/60/train-clean-360.tar.gz": "a84ef10ddade5fd25df69596a2767b2d",
    "http://www.openslr.org/60/train-other-500.tar.gz": "7b181dd5ace343a5f38427999684aa6f",
}


def load_libritts_item(
    fileid: str,
    path: str,
    ext_audio: str,
    ext_original_txt: str,
    ext_normalized_txt: str,
) -> Tuple[Tensor, int, str, str, int, int, str]:
    """Load one LibriTTS sample given its file stem.

    ``fileid`` has the form ``<speaker>_<chapter>_<segment>_<sub-segment>``
    and is also the stem of the audio and transcript files located under
    ``path/<speaker>/<chapter>/``.

    Returns:
        (waveform, sample_rate, original_text, normalized_text,
         speaker_id, chapter_id, utterance_id)
    """
    speaker_id, chapter_id, _segment_id, _ = fileid.split("_")
    utterance_id = fileid

    # All three files live in the same per-chapter directory.
    file_dir = os.path.join(path, speaker_id, chapter_id)
    audio_path = os.path.join(file_dir, utterance_id + ext_audio)
    original_path = os.path.join(file_dir, utterance_id + ext_original_txt)
    normalized_path = os.path.join(file_dir, utterance_id + ext_normalized_txt)

    waveform, sample_rate = torchaudio.load(audio_path)

    # Transcripts are single-line files; only the first line is read.
    with open(original_path) as ft:
        original_text = ft.readline()
    with open(normalized_path) as ft:
        normalized_text = ft.readline()

    return (
        waveform,
        sample_rate,
        original_text,
        normalized_text,
        int(speaker_id),
        int(chapter_id),
        utterance_id,
    )


class LIBRITTS(Dataset):
    """Create a Dataset for LibriTTS.

    Each item is a tuple of the form:
    (waveform, sample_rate, original_text, normalized_text, speaker_id,
     chapter_id, utterance_id)

    Args:
        root: Root directory where the dataset is found or downloaded to.
        url: Subset name (e.g. ``"train-clean-100"``) or a full archive URL.
        folder_in_archive: Top-level directory name inside the archive.
        download: If True, download and extract the dataset when it is not
            already present under ``root``.
    """

    _ext_original_txt = ".original.txt"
    _ext_normalized_txt = ".normalized.txt"
    _ext_audio = ".wav"

    def __init__(
        self,
        root: str,
        url: str = URL,
        folder_in_archive: str = FOLDER_IN_ARCHIVE,
        download: bool = False,
    ) -> None:

        if url in [
            "dev-clean",
            "dev-other",
            "test-clean",
            "test-other",
            "train-clean-100",
            "train-clean-360",
            "train-other-500",
        ]:

            ext_archive = ".tar.gz"
            base_url = "http://www.openslr.org/resources/60/"

            # Build the URL by string concatenation: os.path.join would use
            # the platform path separator (backslash on Windows) and corrupt
            # the URL.
            url = base_url + url + ext_archive

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        # Strip the archive extension to get the subset directory name.
        basename = basename.split(".")[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        # FIXME: walk_files yields files in unspecified order (#792), so the
        # mapping from index to sample is not deterministic across platforms.
        walker = walk_files(
            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
        )
        self._walker = list(walker)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]:
        """Load the n-th sample from the dataset.

        Returns:
            (waveform, sample_rate, original_text, normalized_text,
             speaker_id, chapter_id, utterance_id)
        """
        fileid = self._walker[n]
        return load_libritts_item(
            fileid,
            self._path,
            self._ext_audio,
            self._ext_original_txt,
            self._ext_normalized_txt,
        )

    def __len__(self) -> int:
        """Number of audio files found under the dataset directory."""
        return len(self._walker)