Skip to content

Commit 9d8f3d0

Browse files
committed
Add text preprocessing utilities for TTS pipeline
1 parent aa0dd03 commit 9d8f3d0

File tree

4 files changed

+153
-0
lines changed

4 files changed

+153
-0
lines changed

examples/pipeline_tacotron2/text/__init__.py

Whitespace-only changes.
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
""" Modified from https://github.com/keithito/tacotron """
2+
3+
import inflect
4+
import re
5+
6+
7+
# Shared inflect engine used to spell numbers out as words.
_inflect = inflect.engine()

# Digits with embedded commas, e.g. "1,000"; the whole number is group 1.
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits on both sides of a period, e.g. "3.14"; the whole number is group 1.
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')

# Currency amounts; group 1 is the text following the currency sign.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')

# Digits followed by an ordinal suffix, e.g. "2nd".
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any remaining run of digits.
_number_re = re.compile(r'[0-9]+')
14+
15+
16+
def _remove_commas(m: re.Match) -> str:
    """Return the first captured group of ``m`` with every comma removed.

    Args:
        m (re.Match): Match whose group 1 holds the text to clean.

    Returns:
        str: Group 1 without any ``,`` characters.
    """
    captured = m.group(1)
    return ''.join(captured.split(','))
18+
19+
20+
def _expand_decimal_point(m: re.Match) -> str:
    """Return group 1 of ``m`` with each period replaced by ``' point '``.

    Args:
        m (re.Match): Match whose group 1 holds the decimal number text.

    Returns:
        str: Group 1 with every ``.`` swapped for the word ``point``
        surrounded by single spaces.
    """
    pieces = m.group(1).split('.')
    return ' point '.join(pieces)
22+
23+
24+
def _expand_dollars(m: re.Match) -> str:
    """Rewrite a matched dollar amount as counts of dollars and cents.

    Args:
        m (re.Match): Match whose group 1 holds the text that followed ``$``
            (digits, optionally mixed with commas and periods).

    Returns:
        str: ``'<d> dollar(s), <c> cent(s)'``, either half on its own when the
        other is zero, ``'zero dollars'`` when both are zero, or the matched
        text followed by ``' dollars'`` when it contains more than one period.
        Counts are emitted as digits, not as words.
    """
    amount = m.group(1)
    # Commas are only thousands separators. Strip them before parsing so that
    # int() cannot raise ValueError on amounts such as '1,000.20' or ',5'.
    parts = amount.replace(',', '').split('.')
    if len(parts) > 2:
        return amount + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0

    spoken = []
    if dollars:
        spoken.append('%d %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
    if cents:
        spoken.append('%d %s' % (cents, 'cent' if cents == 1 else 'cents'))
    return ', '.join(spoken) if spoken else 'zero dollars'
43+
44+
45+
def _expand_ordinal(m: re.Match) -> str:
    """Spell out the matched ordinal text using the shared inflect engine.

    Args:
        m (re.Match): Match whose full text (group 0) is handed to
            ``number_to_words``.

    Returns:
        str: Whatever ``_inflect.number_to_words`` returns for the matched text.
    """
    ordinal_text = m.group(0)
    return _inflect.number_to_words(ordinal_text)
47+
48+
49+
def _expand_number(m: re.Match) -> str:
    """Spell out a matched run of digits as words.

    Values strictly between 1000 and 3000 get special handling: 2000 becomes
    ``'two thousand'``, 2001-2009 become ``'two thousand <n>'``, exact
    multiples of 100 become ``'<n> hundred'``, and the rest are spelled in
    two-digit groups. Every other value goes through ``number_to_words``
    without the word "and".

    Args:
        m (re.Match): Match whose full text (group 0) is a run of digits.

    Returns:
        str: The spelled-out number.
    """
    value = int(m.group(0))

    # Outside the special range: plain spelling.
    if not (1000 < value < 3000):
        return _inflect.number_to_words(value, andword='')

    if value == 2000:
        return 'two thousand'
    if 2000 < value < 2010:
        return 'two thousand ' + _inflect.number_to_words(value % 100)

    hundreds, remainder = divmod(value, 100)
    if remainder == 0:
        return _inflect.number_to_words(hundreds) + ' hundred'

    # Spell in two-digit groups, then drop the commas between the groups.
    grouped = _inflect.number_to_words(value, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
62+
63+
64+
def normalize_numbers(text: str) -> str:
    """Rewrite the numeric expressions in ``text`` as words.

    The substitutions run in a fixed order: strip commas inside numbers,
    expand pounds, expand dollars, expand decimal points, expand ordinals,
    and finally expand any remaining digit runs.

    Args:
        text (str): The text to normalize.

    Returns:
        str: ``text`` after all substitutions have been applied.
    """
    # Order matters: later passes work on the output of earlier ones.
    passes = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    return text
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import unittest
2+
3+
from parameterized import parameterized
4+
5+
from .text_preprocessing import text_to_sequence
6+
7+
8+
class TestTextPreprocessor(unittest.TestCase):
    """Checks the symbol-ID sequences produced by ``text_to_sequence``."""

    @parameterized.expand(
        [
            ["dr. Strange?", [15, 26, 14, 31, 26, 29, 11, 30, 31, 29, 12, 25, 18, 16, 10]],
            ["ML, is fun.", [24, 23, 6, 11, 20, 30, 11, 17, 32, 25, 7]],
            ["I love torchaudio!", [20, 11, 23, 26, 33, 16, 11, 31, 26, 29, 14, 19, 12, 32, 15, 20, 26, 2]],
            # 'one thousand dollars, twenty cents'
            ["$1,000.20", [26, 25, 16, 11, 31, 19, 26, 32, 30, 12, 25, 15, 11, 15, 26, 23, 23,
                           12, 29, 30, 6, 11, 31, 34, 16, 25, 31, 36, 11, 14, 16, 25, 31, 30]],
        ]
    )
    def test_text_to_sequence(self, sent, seq):
        """``text_to_sequence(sent)`` must equal the expected ID list ``seq``."""
        # assertEqual still runs under ``python -O`` (bare asserts are stripped)
        # and reports a diff of the two lists on failure.
        self.assertEqual(text_to_sequence(sent), seq)
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""Modified from https://github.com/keithito/tacotron"""
2+
3+
from typing import List
4+
import re
5+
from unidecode import unidecode
6+
7+
from .numbers import normalize_numbers
8+
9+
10+
# Matches any run of whitespace characters.
_whitespace_re = re.compile(r'\s+')

# (compiled pattern, replacement) pairs for abbreviations. Each pattern matches
# the abbreviation at a word boundary, followed by a period, ignoring case.
_abbreviations = [
    (re.compile(rf'\b{abbreviation}\.', re.IGNORECASE), expansion)
    for abbreviation, expansion in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
34+
35+
# Character inventory, split into groups.
_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'abcdefghijklmnopqrstuvwxyz'

# Ordered symbol list: pad first, then special, punctuation and letters.
symbols = [_pad, *_special, *_punctuation, *_letters]
# Maps each symbol to its position in ``symbols``.
_symbol_to_id = {symbol: index for index, symbol in enumerate(symbols)}
42+
43+
44+
def text_to_sequence(sent: str) -> List[int]:
    r'''Turn a sentence into the list of symbol IDs for its characters.

    The sentence is transliterated to ASCII, lower-cased, has its numbers and
    abbreviations expanded and its whitespace runs collapsed to single spaces.
    Characters that are not in the symbol table are dropped.

    Args:
        sent (str): The input sentence to convert to a sequence.

    Returns:
        List of integers corresponding to the symbols in the sentence.
    '''
    # ASCII transliteration, lower-casing, then number expansion.
    cleaned = normalize_numbers(unidecode(sent).lower())

    # Abbreviation expansion.
    for pattern, expansion in _abbreviations:
        cleaned = pattern.sub(expansion, cleaned)

    # Whitespace collapsing.
    cleaned = _whitespace_re.sub(' ', cleaned)

    ids = []
    for char in cleaned:
        symbol_id = _symbol_to_id.get(char)
        # Compare against None: ID 0 is a valid symbol.
        if symbol_id is not None:
            ids.append(symbol_id)
    return ids

0 commit comments

Comments
 (0)