diff --git a/docs/api/util.rst b/docs/api/util.rst index 4d41da6d1..9a90e1c55 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -24,6 +24,10 @@ Modules .. autofunction:: num_to_thaiword .. autofunction:: rank .. autofunction:: reign_year_to_ad +.. autofunction:: remove_dangling +.. autofunction:: remove_dup_spaces +.. autofunction:: remove_tonemark +.. autofunction:: remove_zw .. autofunction:: thai_time .. autofunction:: text_to_arabic_digit .. autofunction:: text_to_thai_digit diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index d52a6e1e7..d91ea38fa 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -22,6 +22,10 @@ "num_to_thaiword", "rank", "reign_year_to_ad", + "remove_dangling", + "remove_dup_spaces", + "remove_tonemark", + "remove_zw", "text_to_arabic_digit", "text_to_thai_digit", "thai_digit_to_arabic_digit", @@ -49,7 +53,14 @@ ) from pythainlp.util.keyboard import eng_to_thai, thai_to_eng from pythainlp.util.keywords import find_keyword, rank -from pythainlp.util.normalize import delete_tone, normalize +from pythainlp.util.normalize import ( + delete_tone, + normalize, + remove_dangling, + remove_dup_spaces, + remove_tonemark, + remove_zw, +) from pythainlp.util.numtoword import bahttext, num_to_thaiword from pythainlp.util.thai import countthai, isthai, isthaichar from pythainlp.util.thaiwordcheck import is_native_thai diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 86e267c05..92e27fee4 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -12,75 +12,99 @@ from pythainlp import thai_tonemarks as tonemarks -# VOWELS + Phinthu,Thanthakhat, Nikhahit, Yamakkan -_NO_REPEAT_CHARS = ( - f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" -) -_NORMALIZE_REPETITION = list( - zip([ch + "+" for ch in _NO_REPEAT_CHARS], _NO_REPEAT_CHARS) -) +_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" +_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+") -_NORMALIZE_REORDER = [ +_ZERO_WIDTH_CHARS = "\u200b\u200c" # ZWSP, ZWNJ + +_REORDER_PAIRS = [ ("\u0e40\u0e40", "\u0e41"), # Sara E + Sara E -> Sara Ae ( f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)", "\\2\\1", - ), # TONE/Thanthakhat+ + A/BVOWELV+ -> A/BVOWEL+ + TONE/Thanthakhat+ + ), # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat ( f"\u0e4d([{tonemarks}]*)\u0e32", "\\1\u0e33", - ), # Nikhahit + TONEMARK* + Sara Aa -> TONEMARK* + Sara Am + ), # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am ( f"([{follow_v}]+)([{tonemarks}]+)", "\\2\\1", - ), # FOLLOWVOWEL+ + TONEMARK+ -> TONEMARK+ + FOLLOWVOWEL+ + ), # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL ] +# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan +_NOREPEAT_CHARS = ( + f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e" +) +_NOREPEAT_PAIRS = list( + zip([f"({ch}[ ]*)+" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS) +) + +_RE_TONEMARKS = re.compile(f"[{tonemarks}]+") -def normalize(text: str) -> str: +_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*") + + +def _last_char(matchobj): # to be used with _RE_NOREPEAT_TONEMARKS + return matchobj.group(0)[-1] + + +def remove_dangling(text: str) -> str: """ - This function normalize thai text with normalizing rules as follows: + Remove Thai non-base characters at the beginning of text. - * Remove redundant vowels and tonemarks - * Subsitute "เ" + "เ" with "แ" + This is a common "typo", especially for input field in a form, + as these non-base characters can be visually hidden from user + who may accidentally typed them in. - :param str text: thai text to be normalized - :return: normalized Thai text according to the fules - :rtype: str + A character to be removed should be both: - :Example: - :: + * tone mark, above vowel, below vowel, or non-base sign AND + * located at the beginning of the text - from pythainlp.util import normalize + :param str text: input text + :return: text without dangling Thai characters at the beginning + :rtype: str + """ + return _RE_REMOVE_DANGLINGS.sub("", text) - normalize('สระะน้ำ') - # output: สระน้ำ - normalize('เเปลก') - # output: แปลก +def remove_dup_spaces(text: str) -> str: + """ + Remove duplicate spaces. Replace multiple spaces with one space. - normalize('นานาาา') - # output: นานา + Multiple newline characters and empty lines will be replaced + with one newline character. + + :param str text: input text + :return: text without duplicated spaces and newlines + :rtype: str """ - for data in _NORMALIZE_REORDER: - text = re.sub(data[0], data[1], text) - for data in _NORMALIZE_REPETITION: - text = re.sub(data[0], data[1], text) + while " " in text: + text = text.replace(" ", " ") + text = _RE_REMOVE_NEWLINES.sub("\n", text) + text = text.strip() return text -def delete_tone(text: str) -> str: +def remove_tonemark(text: str) -> str: """ - This function removes Thai tonemarks from the text. - There are 4 tonemarks indicating 4 tones as follows: + Remove all Thai tone marks from the text. + + Thai script has four tone marks indicating four tones as follows: * Down tone (Thai: ไม้เอก _่ ) * Falling tone (Thai: ไม้โท _้ ) * High tone (Thai: ไม้ตรี ​_๊ ) * Rising tone (Thai: ไม้จัตวา _๋ ) - :param str text: text in Thai language - :return: text without Thai tonemarks + Putting wrong tone mark is a common mistake in Thai writing. + By removing tone marks from the string, it could be used to + for a approximate string matching + + :param str text: input text + :return: text without Thai tone marks :rtype: str :Example: @@ -91,5 +115,123 @@ def delete_tone(text: str) -> str: delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด') # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด """ - chars = [ch for ch in text if ch not in tonemarks] - return "".join(chars) + for ch in tonemarks: + while ch in text: + text = text.replace(ch, "") + return text + + +def remove_zw(text: str) -> str: + """ + Remove zero-width characters. + + These non-visible characters may cause unexpected result from the + user's point of view. Removing them can make string matching more robust. + + Characters to be removed: + + * Zero-width space (ZWSP) + * Zero-with non-joiner (ZWJP) + + :param str text: input text + :return: text without zero-width characters + :rtype: str + """ + for ch in _ZERO_WIDTH_CHARS: + while ch in text: + text = text.replace(ch, "") + + return text + + +def reorder_vowels(text: str) -> str: + """ + Reorder vowels and tone marks to the standard logical order/spelling. + + Characters in input text will be reordered/transformed, + according to these rules: + + * Sara E + Sara E -> Sara Ae + * Nikhahit + Sara Aa -> Sara Am + * tone mark + non-base vowel -> non-base vowel + tone mark + * follow vowel + tone mark -> tone mark + follow vowel + + :param str text: input text + :return: text with vowels and tone marks in the standard logical order + :rtype: str + """ + for pair in _REORDER_PAIRS: + text = re.sub(pair[0], pair[1], text) + + return text + + +def remove_repeat_vowels(text: str) -> str: + """ + Remove repeating vowels, tone marks, and signs. + + This function will call reorder_vowels() first, to make sure that + double Sara E will be converted to Sara Ae and not be removed. + + :param str text: input text + :return: text without repeating Thai vowels, tone marks, and signs + :rtype: str + """ + text = reorder_vowels(text) + for pair in _NOREPEAT_PAIRS: + text = re.sub(pair[0], pair[1], text) + + # remove repeating tone marks, use last tone mark + text = _RE_TONEMARKS.sub(_last_char, text) + + return text + + +def normalize(text: str) -> str: + """ + Normalize and clean Thai text with normalizing rules as follows: + + * Remove zero-width spaces + * Remove duplicate spaces + * Reorder tone marks and vowels to standard order/spelling + * Remove duplicate vowels and signs + * Remove duplicate tone marks + * Remove dangling non-base characters at the beginning of text + + normalize() simply call remove_zw(), remove_dup_spaces(), + remove_repeat_vowels(), and remove_dangling(), in that order. + + If a user wants to customize the selection or the order of rules + to be applied, they can choose to call those functions by themselves. + + :param str text: input text + :return: normalized text according to the fules + :rtype: str + + :Example: + :: + + from pythainlp.util import normalize + + normalize('สระะน้ำ') + # output: สระน้ำ + + normalize('เเปลก') + # output: แปลก + + normalize('นานาาา') + # output: นานา + """ + text = remove_zw(text) + text = remove_dup_spaces(text) + text = remove_repeat_vowels(text) + text = remove_dangling(text) + + return text + + +def delete_tone(text: str) -> str: + """ + DEPRECATED: Please use remove_tonemark(). + """ + return remove_tonemark(text) diff --git a/tests/test_util.py b/tests/test_util.py index 2f02cea6b..cdd020a5c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -15,7 +15,6 @@ bahttext, collate, countthai, - delete_tone, dict_trie, digit_to_text, eng_to_thai, @@ -28,6 +27,10 @@ num_to_thaiword, rank, reign_year_to_ad, + remove_dangling, + remove_dup_spaces, + remove_tonemark, + remove_zw, text_to_arabic_digit, text_to_thai_digit, thai_day2datetime, @@ -272,36 +275,66 @@ def test_trie(self): # ### pythainlp.util.normalize - def test_delete_tone(self): - self.assertEqual(delete_tone("จิ้น"), "จิน") - self.assertEqual(delete_tone("เก๋า"), "เกา") - def test_normalize(self): self.assertIsNotNone(normalize("พรรค์จันทร์ab์")) # sara e + sara e self.assertEqual(normalize("เเปลก"), "แปลก") - # consonant + follow vowel + tonemark + # consonant + follow vowel + tone mark self.assertEqual(normalize("\u0e01\u0e30\u0e48"), "\u0e01\u0e48\u0e30") # consonant + nikhahit + sara aa self.assertEqual(normalize("นํา"), "นำ") self.assertEqual(normalize("\u0e01\u0e4d\u0e32"), "\u0e01\u0e33") - # consonant + nikhahit + tonemark + sara aa + # consonant + nikhahit + tone mark + sara aa self.assertEqual( normalize("\u0e01\u0e4d\u0e48\u0e32"), "\u0e01\u0e48\u0e33" ) - # consonant + tonemark + nikhahit + sara aa + # consonant + tone mark + nikhahit + sara aa self.assertEqual( normalize("\u0e01\u0e48\u0e4d\u0e32"), "\u0e01\u0e48\u0e33" ) - # consonant + follow vowel + tonemark + # consonant + follow vowel + tone mark self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32") + # repeating following vowels + self.assertEqual(normalize("กาา"), "กา") + self.assertEqual(normalize("กา า า า"), "กา") + self.assertEqual(normalize("กา าาะา"), "กาะา") + + # repeating tone marks + self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48") + + # repeating different ton emarks + self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49") + self.assertEqual( + normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49" + ) + + # remove tone mark at the beginning of text + self.assertEqual(remove_dangling("\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_dangling("\u0e48\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01") + self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48") + + # remove duplicate spaces + self.assertEqual(remove_dup_spaces(" ab c d "), "ab c d") + self.assertEqual(remove_dup_spaces("\nab c \n d \n"), "ab c\nd") + + # removing tone marks + self.assertEqual(remove_tonemark("จิ้น"), "จิน") + self.assertEqual(remove_tonemark("เก๋า"), "เกา") + + # removing zero width chars + self.assertEqual(remove_zw("กา\u200b"), "กา") + self.assertEqual(remove_zw("ก\u200cา"), "กา") + self.assertEqual(remove_zw("\u200bกา"), "กา") + self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา") + # ### pythainlp.util.thai def test_countthai(self):