Skip to content

Commit d5e44f8

Browse files
committed
Add TTS bundle/pipelines (#1872)
Future work items: (1) length computation for GriffinLim; (2) a better way to make InverseMelScale work in inference_mode.
1 parent 217fb68 commit d5e44f8

File tree

8 files changed

+964
-0
lines changed

8 files changed

+964
-0
lines changed

docs/source/pipelines.rst

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,69 @@ HUBERT_ASR_XLARGE
167167
.. container:: py attribute
168168

169169
.. autodata:: HUBERT_ASR_XLARGE
170+
171+
172+
Tacotron2 Text-To-Speech
173+
------------------------
174+
175+
Tacotron2TTSBundle
176+
~~~~~~~~~~~~~~~~~~
177+
178+
.. autoclass:: Tacotron2TTSBundle
179+
180+
.. automethod:: get_text_processor
181+
182+
.. automethod:: get_tacotron2
183+
184+
.. automethod:: get_vocoder
185+
186+
Tacotron2TTSBundle - TextProcessor
187+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
188+
189+
.. autoclass:: torchaudio.pipelines::Tacotron2TTSBundle.TextProcessor
190+
:members: tokens
191+
:special-members: __call__
192+
193+
194+
Tacotron2TTSBundle - Vocoder
195+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
196+
197+
.. autoclass:: torchaudio.pipelines::Tacotron2TTSBundle.Vocoder
198+
:members: sample_rate
199+
:special-members: __call__
200+
201+
202+
TACOTRON2_WAVERNN_PHONE_LJSPEECH
203+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
204+
205+
.. container:: py attribute
206+
207+
.. autodata:: TACOTRON2_WAVERNN_PHONE_LJSPEECH
208+
:no-value:
209+
210+
211+
TACOTRON2_WAVERNN_CHAR_LJSPEECH
212+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
213+
214+
.. container:: py attribute
215+
216+
.. autodata:: TACOTRON2_WAVERNN_CHAR_LJSPEECH
217+
:no-value:
218+
219+
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
220+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
221+
222+
.. container:: py attribute
223+
224+
.. autodata:: TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
225+
:no-value:
226+
227+
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH
228+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
229+
230+
.. container:: py attribute
231+
232+
.. autodata:: TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH
170233
:no-value:
171234

172235
References

docs/source/refs.bib

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
@misc{ljspeech17,
2+
author = {Keith Ito and Linda Johnson},
3+
title = {The LJ Speech Dataset},
4+
howpublished = {\url{https://keithito.com/LJ-Speech-Dataset/}},
5+
year = {2017}
6+
}
17
@misc{conneau2020unsupervised,
28
title={Unsupervised Cross-lingual Representation Learning for Speech Recognition},
39
author={Alexis Conneau and Alexei Baevski and Ronan Collobert and Abdelrahman Mohamed and Michael Auli},
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from torchaudio.pipelines import (
2+
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH,
3+
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH,
4+
TACOTRON2_WAVERNN_CHAR_LJSPEECH,
5+
TACOTRON2_WAVERNN_PHONE_LJSPEECH,
6+
)
7+
import pytest
8+
9+
10+
@pytest.mark.parametrize(
    "bundle",
    [
        TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH,
        TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH,
        TACOTRON2_WAVERNN_CHAR_LJSPEECH,
        TACOTRON2_WAVERNN_PHONE_LJSPEECH,
    ],
)
def test_tts_models(bundle):
    """Run one sentence through every stage of a TTS bundle.

    This is a smoke test: it only checks that the text processor,
    Tacotron2 and the vocoder can be chained without raising, and that
    each stage returns the number of values unpacked below. The
    generated audio is not inspected.
    """
    # Instantiate the three stages the bundle exposes.
    text_processor = bundle.get_text_processor()
    synthesizer = bundle.get_tacotron2()
    vocoder = bundle.get_vocoder()

    # text -> tokens -> spectrogram -> waveform
    tokens, token_lengths = text_processor("Hello world! Text to Speech!")
    spectrogram, spec_lengths, _ = synthesizer.infer(tokens, token_lengths)
    _waveforms, _wave_lengths = vocoder(spectrogram, spec_lengths)

torchaudio/pipelines/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@
2020
HUBERT_ASR_LARGE,
2121
HUBERT_ASR_XLARGE,
2222
)
23+
from ._tts import (
24+
Tacotron2TTSBundle,
25+
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH,
26+
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH,
27+
TACOTRON2_WAVERNN_CHAR_LJSPEECH,
28+
TACOTRON2_WAVERNN_PHONE_LJSPEECH,
29+
)
2330

2431
__all__ = [
2532
'Wav2Vec2Bundle',
@@ -42,4 +49,9 @@
4249
'HUBERT_XLARGE',
4350
'HUBERT_ASR_LARGE',
4451
'HUBERT_ASR_XLARGE',
52+
'Tacotron2TTSBundle',
53+
'TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH',
54+
'TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH',
55+
'TACOTRON2_WAVERNN_CHAR_LJSPEECH',
56+
'TACOTRON2_WAVERNN_PHONE_LJSPEECH',
4557
]
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from .interface import Tacotron2TTSBundle
2+
from .impl import (
3+
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH,
4+
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH,
5+
TACOTRON2_WAVERNN_CHAR_LJSPEECH,
6+
TACOTRON2_WAVERNN_PHONE_LJSPEECH,
7+
)
8+
9+
10+
# Names re-exported by this package: the bundle interface from
# ``.interface`` followed by the bundle instances from ``.impl``.
__all__ = [
    "Tacotron2TTSBundle",
    "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH",
    "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH",
    "TACOTRON2_WAVERNN_CHAR_LJSPEECH",
    "TACOTRON2_WAVERNN_PHONE_LJSPEECH",
]

0 commit comments

Comments
 (0)