Add text preprocessing utilities for TTS pipeline

yangarbiter · yangarbiter · commit 9d8f3d02716a · 2021-07-22T16:27:57.000Z
diff --git a/examples/pipeline_tacotron2/text/__init__.py b/examples/pipeline_tacotron2/text/__init__.py
diff --git a/examples/pipeline_tacotron2/text/numbers.py b/examples/pipeline_tacotron2/text/numbers.py
@@ -0,0 +1,71 @@
+""" Modified from https://github.com/keithito/tacotron """
+
+import inflect
+import re
+
+
+# Shared inflect engine used by the helpers below to spell numbers out in words.
+_inflect = inflect.engine()
+# Digits with embedded commas, e.g. "1,000"; group 1 is the whole number.
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+# Decimal numbers such as "3.14"; group 1 is the whole number.
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+# Pound amounts; group 1 is the amount without the currency sign.
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+# Dollar amounts; group 1 is the amount without the currency sign.
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+# Digits immediately followed by an ordinal suffix, e.g. "1st" or "22nd".
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+# Any run of digits.
+_number_re = re.compile(r'[0-9]+')
+
+
+def _remove_commas(m: re.Match) -> str:
+    """Return the first captured group of ``m`` with every comma removed."""
+    digits_with_commas = m.group(1)
+    return ''.join(digits_with_commas.split(','))
+
+
+def _expand_decimal_point(m: re.Match) -> str:
+    """Return the first captured group of ``m`` with each '.' replaced by ' point '."""
+    pieces = m.group(1).split('.')
+    return ' point '.join(pieces)
+
+
+def _expand_dollars(m: re.Match) -> str:
+    """Rewrite a matched dollar amount with its units written out.
+
+    Args:
+        m (re.Match): Match whose first group is the amount without the
+            leading ``$``, e.g. ``'1000.20'``.
+
+    Returns:
+        str: The amount followed by its units, e.g. ``'1000 dollars, 20 cents'``.
+        The digits themselves are left as digits. An amount with more than
+        one decimal point is returned unchanged with ``' dollars'`` appended.
+    """
+    raw = m.group(1)
+    # The pattern admits commas anywhere (e.g. "$,5"); int() would raise on them.
+    parts = raw.replace(',', '').split('.')
+    if len(parts) > 2:
+        return raw + ' dollars'  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    fraction = parts[1] if len(parts) > 1 else ''
+    # A single fractional digit is tenths of a dollar: "$1.5" means 50 cents, not 5.
+    cents = int(fraction.ljust(2, '0')) if fraction else 0
+    spoken = []
+    if dollars:
+        spoken.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
+    if cents:
+        spoken.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
+    return ', '.join(spoken) if spoken else 'zero dollars'
+
+
+def _expand_ordinal(m: re.Match) -> str:
+    """Return the full matched text (digits plus ordinal suffix) converted to words by inflect."""
+    ordinal_text = m.group(0)
+    return _inflect.number_to_words(ordinal_text)
+
+
+def _expand_number(m: re.Match) -> str:
+    """Spell out a matched run of digits.
+
+    Values strictly between 1000 and 3000 get special readings (presumably so
+    they sound like years): 2000 is "two thousand", 2001-2009 are
+    "two thousand <n>", whole hundreds are "<n> hundred", and anything else is
+    read in groups of two digits. All other values are read as plain numbers.
+
+    Args:
+        m (re.Match): Match whose full text is a run of digits.
+
+    Returns:
+        str: The number written out in words.
+    """
+    value = int(m.group(0))
+    if not 1000 < value < 3000:
+        return _inflect.number_to_words(value, andword='')
+    if value == 2000:
+        return 'two thousand'
+    if 2000 < value < 2010:
+        return 'two thousand ' + _inflect.number_to_words(value % 100)
+    if value % 100 == 0:
+        return _inflect.number_to_words(value // 100) + ' hundred'
+    # Grouped output separates the digit groups with ', '; join them with a space.
+    grouped = _inflect.number_to_words(value, andword='', zero='oh', group=2)
+    return grouped.replace(', ', ' ')
+
+
+def normalize_numbers(text: str) -> str:
+    """Rewrite the numeric expressions in ``text`` as words.
+
+    The substitutions run in a fixed order, each on the output of the previous
+    one: comma removal, pounds, dollars, decimals, ordinals, then any
+    remaining runs of digits.
+
+    Args:
+        text (str): The text to normalize.
+
+    Returns:
+        str: The text after all substitutions have been applied.
+    """
+    substitutions = (
+        (_comma_number_re, _remove_commas),
+        (_pounds_re, r'\1 pounds'),
+        (_dollars_re, _expand_dollars),
+        (_decimal_number_re, _expand_decimal_point),
+        (_ordinal_re, _expand_ordinal),
+        (_number_re, _expand_number),
+    )
+    for pattern, replacement in substitutions:
+        text = pattern.sub(replacement, text)
+    return text
diff --git a/examples/pipeline_tacotron2/text/test_text.py b/examples/pipeline_tacotron2/text/test_text.py
@@ -0,0 +1,22 @@
+import unittest
+
+from parameterized import parameterized
+
+from .text_preprocessing import text_to_sequence
+
+
+class TestTextPreprocessor(unittest.TestCase):
+    # Each case pairs an input sentence with the symbol ID sequence that
+    # text_to_sequence is expected to return for it.
+
+    @parameterized.expand(
+        [
+            ["dr.  Strange?", [15, 26, 14, 31, 26, 29, 11, 30, 31, 29, 12, 25, 18, 16, 10]],
+            ["ML, is        fun.", [24, 23, 6, 11, 20, 30, 11, 17, 32, 25, 7]],
+            ["I love torchaudio!", [20, 11, 23, 26, 33, 16, 11, 31, 26, 29, 14, 19, 12, 32, 15, 20, 26, 2]],
+            # 'one thousand dollars, twenty cents'
+            ["$1,000.20", [26, 25, 16, 11, 31, 19, 26, 32, 30, 12, 25, 15, 11, 15, 26, 23, 23,
+                           12, 29, 30, 6, 11, 31, 34, 16, 25, 31, 36, 11, 14, 16, 25, 31, 30]],
+        ]
+    )
+    def test_text_to_sequence(self, sent, seq):
+        # assertEqual shows both lists on failure and, unlike a bare assert,
+        # is not stripped when Python runs with -O.
+        self.assertEqual(text_to_sequence(sent), seq)
diff --git a/examples/pipeline_tacotron2/text/text_preprocessing.py b/examples/pipeline_tacotron2/text/text_preprocessing.py
@@ -0,0 +1,60 @@
+"""Modified from https://github.com/keithito/tacotron"""
+
+from typing import List
+import re
+from unidecode import unidecode
+
+from .numbers import normalize_numbers
+
+
+# Matches one or more consecutive whitespace characters.
+_whitespace_re = re.compile(r'\s+')
+
+# (compiled pattern, expansion) pairs. Each pattern matches the abbreviation at a
+# word boundary followed by a literal period, ignoring case.
+_abbreviations = [
+    (re.compile('\\b%s\\.' % abbreviation, re.IGNORECASE), expansion)
+    for abbreviation, expansion in (
+        ('mrs', 'misess'),
+        ('mr', 'mister'),
+        ('dr', 'doctor'),
+        ('st', 'saint'),
+        ('co', 'company'),
+        ('jr', 'junior'),
+        ('maj', 'major'),
+        ('gen', 'general'),
+        ('drs', 'doctors'),
+        ('rev', 'reverend'),
+        ('lt', 'lieutenant'),
+        ('hon', 'honorable'),
+        ('sgt', 'sergeant'),
+        ('capt', 'captain'),
+        ('esq', 'esquire'),
+        ('ltd', 'limited'),
+        ('col', 'colonel'),
+        ('ft', 'fort'),
+    )
+]
+
+# Symbol inventory: the padding symbol first, then the special symbol, the
+# punctuation marks (including the space character) and the lowercase letters.
+_pad = '_'
+_special = '-'
+_punctuation = '!\'(),.:;? '
+_letters = 'abcdefghijklmnopqrstuvwxyz'
+
+symbols = [_pad, *_special, *_punctuation, *_letters]
+# The ID of a symbol is its position in ``symbols``.
+_symbol_to_id = {symbol: index for index, symbol in enumerate(symbols)}
+
+
+def text_to_sequence(sent: str) -> List[int]:
+    r'''Map a sentence to the IDs of the symbols it is made of.
+
+      The sentence is converted to ASCII and lowercased, its numbers and
+      abbreviations are expanded, and every run of whitespace is collapsed to a
+      single space. Characters without an entry in the symbol table are skipped.
+
+      Args:
+        sent (str): The input sentence to convert to a sequence.
+
+      Returns:
+        List of integers corresponding to the symbols in the sentence.
+    '''
+    normalized = normalize_numbers(unidecode(sent).lower())
+    for pattern, expansion in _abbreviations:
+        normalized = pattern.sub(expansion, normalized)
+    normalized = _whitespace_re.sub(' ', normalized)
+
+    ids = []
+    for char in normalized:
+        if char in _symbol_to_id:
+            ids.append(_symbol_to_id[char])
+    return ids