From f2a3ab384c45935b6c4976755b3c4ae4d1c2c74f Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Wed, 6 May 2020 19:29:31 +0100
Subject: [PATCH 01/10] - Remove repetitive tonemarks
 - Remove "phantom chars"

---
 pythainlp/util/normalize.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 86e267c05..35e16737f 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -12,9 +12,9 @@ from pythainlp import thai_tonemarks as tonemarks
 
-# VOWELS + Phinthu,Thanthakhat, Nikhahit, Yamakkan
+# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
 _NO_REPEAT_CHARS = (
-    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
+    f"{follow_v}{lead_v}{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
 )
 _NORMALIZE_REPETITION = list(
     zip([ch + "+" for ch in _NO_REPEAT_CHARS], _NO_REPEAT_CHARS)
 )
@@ -25,17 +25,19 @@
     (
         f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
         "\\2\\1",
-    ),  # TONE/Thanthakhat+ + A/BVOWELV+ -> A/BVOWEL+ + TONE/Thanthakhat+
+    ),  # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat
     (
         f"\u0e4d([{tonemarks}]*)\u0e32",
         "\\1\u0e33",
-    ),  # Nikhahit + TONEMARK* + Sara Aa -> TONEMARK* + Sara Am
+    ),  # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am
     (
         f"([{follow_v}]+)([{tonemarks}]+)",
         "\\2\\1",
-    ),  # FOLLOWVOWEL+ + TONEMARK+ -> TONEMARK+ + FOLLOWVOWEL+
+    ),  # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
 ]
 
+_PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
+
 
 def normalize(text: str) -> str:
     """
@@ -66,6 +68,9 @@ def normalize(text: str) -> str:
         text = re.sub(data[0], data[1], text)
     for data in _NORMALIZE_REPETITION:
         text = re.sub(data[0], data[1], text)
+    # remove a char that may have been accidentally typed in at the beginning
+    if text[0] in _PHANTOM_CHARS:
+        text = text[1:]
     return text

From 088a1f35f82c3a36c6de3ee1f90bcd25d20b18c1 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Wed, 6 May 2020 20:32:49 +0100
Subject: [PATCH 02/10] Removing repeating different tonemarks

---
 pythainlp/util/normalize.py | 39 ++++++++++++++++++++++++-------------
 tests/test_util.py | 17 ++++++++++++++++
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 35e16737f..52554d84f 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -12,15 +12,9 @@ from pythainlp import thai_tonemarks as tonemarks
 
-# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
-_NO_REPEAT_CHARS = (
-    f"{follow_v}{lead_v}{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
-)
-_NORMALIZE_REPETITION = list(
-    zip([ch + "+" for ch in _NO_REPEAT_CHARS], _NO_REPEAT_CHARS)
-)
+_PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
 
-_NORMALIZE_REORDER = [
+_REORDER_PAIRS = [
     ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
     (
         f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
@@ -36,7 +30,19 @@
     ),  # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
 ]
 
-_PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
+# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
+_NOREPEAT_CHARS = (
+    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
+)
+_NOREPEAT_PAIRS = list(
+    zip([f"({ch}[ ]*)+" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS)
+)
+
+_RE_NOREPEAT_TONEMARKS = re.compile(f"[{tonemarks}]+")
+
+# to be used with _RE_NOREPEAT_TONEMARKS
+def last_char(matchobj):
+    return matchobj.group(0)[-1]
 
 
 def normalize(text: str) -> str:
@@ -64,13 +70,18 @@ def normalize(text: str) -> str:
         normalize('นานาาา')
         # output: นานา
     """
-    for data in _NORMALIZE_REORDER:
-        text = re.sub(data[0], data[1], text)
-    for data in _NORMALIZE_REPETITION:
-        text = re.sub(data[0], data[1], text)
+    for pair in _REORDER_PAIRS:
+        text = re.sub(pair[0], pair[1], text)
+    for pair in _NOREPEAT_PAIRS:
+        text = re.sub(pair[0], pair[1], text)
+
+    # remove repeating tonemarks, use last tonemark
+    text = _RE_NOREPEAT_TONEMARKS.sub(last_char, text)
+
     # remove a char that may have been accidentally typed in at the beginning
-    if text[0] in _PHANTOM_CHARS:
+    if text[0] in _PHANTOM_CHARS:
         text = text[1:]
+
     return text
diff --git a/tests/test_util.py b/tests/test_util.py
index 8efa1e0e3..0bd0550c1 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -272,6 +272,23 @@ def test_normalize(self):
         # consonant + follow vowel + tonemark
         self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32")
 
+        # repeating tonemarks
+        self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48")
+
+        # repeating different tonemarks
+        self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49")
+        self.assertEqual(
+            normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49"
+        )
+
+        # tonemark at the beginning of text
+        self.assertEqual(normalize("\u0e48\u0e01"), "\u0e01")
+
+        # repeating following vowels
+        self.assertEqual(normalize("กาา"), "กา")
+        self.assertEqual(normalize("กา า า า"), "กา")
+        self.assertEqual(normalize("กา าาะา"), "กาะา")
+
     # ### pythainlp.util.thai
 
     def test_countthai(self):

From 5abc5f376984f24666481f610c9681b4697871b2 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Wed, 6 May 2020 20:34:55 +0100
Subject: [PATCH 03/10] Fix PEP8

---
 pythainlp/util/normalize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 52554d84f..57a4799a4 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -11,7 +11,6 @@ from pythainlp import thai_lead_vowels as lead_v
 from pythainlp import thai_tonemarks as tonemarks
 
-
 _PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
 
 _REORDER_PAIRS = [
@@ -40,6 +39,7 @@
 _RE_NOREPEAT_TONEMARKS = re.compile(f"[{tonemarks}]+")
 
+
 # to be used with _RE_NOREPEAT_TONEMARKS
 def last_char(matchobj):
     return matchobj.group(0)[-1]

From fa464b8c420da681976bcc0dd66bc2fecfcb3d1e Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Thu, 7 May 2020 11:31:42 +0100
Subject: [PATCH 04/10] Remove zero width chars

---
 pythainlp/util/normalize.py | 7 ++++++-
 tests/test_util.py | 6 ++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 57a4799a4..fcfd1e9ea 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -11,8 +11,12 @@ from pythainlp import thai_lead_vowels as lead_v
 from pythainlp import thai_tonemarks as tonemarks
 
+
 _PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
 
+_ZERO_WIDTH_CHARS = "\u200c\u200b"
+_RE_REMOVE_ZERO_WIDTHS = re.compile(f"[{_ZERO_WIDTH_CHARS}]+")
+
 _REORDER_PAIRS = [
     ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
     (
         f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
@@ -66,7 +73,8 @@ def normalize(text: str) -> str:
         normalize('นานาาา')
         # output: นานา
     """
+    text = _RE_REMOVE_ZERO_WIDTHS.sub("", text)
+
     for pair in _REORDER_PAIRS:
         text = re.sub(pair[0], pair[1], text)
     for pair in _NOREPEAT_PAIRS:
diff --git a/tests/test_util.py b/tests/test_util.py
index 0bd0550c1..13ae31885 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -289,6 +289,12 @@ def test_normalize(self):
         self.assertEqual(normalize("กา า า า"), "กา")
         self.assertEqual(normalize("กา าาะา"), "กาะา")
 
+        # zero width chars
+        self.assertEqual(normalize("กา\u200b"), "กา")
+        self.assertEqual(normalize("ก\u200cา"), "กา")
+        self.assertEqual(normalize("\u200bกา"), "กา")
+        self.assertEqual(normalize("กา\u200b\u200c\u200b"), "กา")
+
     # ### pythainlp.util.thai
 
     def test_countthai(self):

From a705898a5627c142e6c50ba9b9e309f1cf50f892 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Thu, 7 May 2020 11:41:39 +0100
Subject: [PATCH 05/10] Make remove phantom and zero-width chars a function

---
 pythainlp/util/__init__.py | 8 +++++---
 pythainlp/util/normalize.py | 22 ++++++++++++++++++----
 tests/test_util.py | 12 +++++++-----
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index ad2a1538a..59f8c8ba5 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -20,6 +20,8 @@
     "num_to_thaiword",
     "rank",
     "reign_year_to_ad",
+    "remove_phantom",
+    "remove_zw",
     "text_to_arabic_digit",
     "text_to_thai_digit",
     "thai_digit_to_arabic_digit",
@@ -36,8 +38,8 @@
 from .date import (
     now_reign_year,
     reign_year_to_ad,
-    thai_strftime,
     thai_day2datetime,
+    thai_strftime,
 )
 from .digitconv import (
     arabic_digit_to_thai_digit,
@@ -48,9 +50,9 @@
 )
 from .keyboard import eng_to_thai, thai_to_eng
 from .keywords import find_keyword, rank
-from .normalize import delete_tone, normalize
+from .normalize import delete_tone, normalize, remove_phantom, remove_zw
 from .numtoword import bahttext, num_to_thaiword
 from .thai import countthai, isthai, isthaichar
-from .time import thai_time, thai_time2time
 from .thaiwordcheck import is_native_thai
+from .time import thai_time, thai_time2time
 from .wordtonum import thaiword_to_num
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index fcfd1e9ea..6d76ed408 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -48,6 +48,22 @@ def last_char(matchobj):
     return matchobj.group(0)[-1]
 
 
+def remove_phantom(text: str) -> str:
+    """
+    Remove a char that may have been accidentally typed at the text beginning.
+    """
+    if text[0] in _PHANTOM_CHARS:
+        text = text[1:]
+    return text
+
+
+def remove_zw(text: str) -> str:
+    """
+    Remove zero-width characters.
+ """ + return _RE_REMOVE_ZERO_WIDTHS.sub("", text) + + def normalize(text: str) -> str: """ This function normalize thai text with normalizing rules as follows: @@ -73,7 +89,7 @@ def normalize(text: str) -> str: normalize('นานาาา') # output: นานา """ - text = _RE_REMOVE_ZERO_WIDTHS.sub("", text) + text = remove_zw(text) for pair in _REORDER_PAIRS: text = re.sub(pair[0], pair[1], text) @@ -83,9 +99,7 @@ def normalize(text: str) -> str: # remove repeating tonemarks, use last tonemark text = _RE_NOREPEAT_TONEMARKS.sub(last_char, text) - # remove a char that may have been accidentally typed in at the beginning - if text[0] in _PHANTOM_CHARS: - text = text[1:] + text = remove_phantom(text) return text diff --git a/tests/test_util.py b/tests/test_util.py index 13ae31885..176c6684a 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -24,6 +24,8 @@ num_to_thaiword, rank, reign_year_to_ad, + remove_phantom, + remove_zw, text_to_arabic_digit, text_to_thai_digit, thai_digit_to_arabic_digit, @@ -282,7 +284,7 @@ def test_normalize(self): ) # tonemark at the beginning of text - self.assertEqual(normalize("\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_phantom("\u0e48\u0e01"), "\u0e01") # repeating following vowels self.assertEqual(normalize("กาา"), "กา") @@ -290,10 +292,10 @@ def test_normalize(self): self.assertEqual(normalize("กา าาะา"), "กาะา") # zero width chars - self.assertEqual(normalize("กา\u200b"), "กา") - self.assertEqual(normalize("ก\u200cา"), "กา") - self.assertEqual(normalize("\u200bกา"), "กา") - self.assertEqual(normalize("กา\u200b\u200c\u200b"), "กา") + self.assertEqual(remove_zw("กา\u200b"), "กา") + self.assertEqual(remove_zw("ก\u200cา"), "กา") + self.assertEqual(remove_zw("\u200bกา"), "กา") + self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา") # ### pythainlp.util.thai From 83f208c51f0bb1d0716cddd9c2345bff3bcc9537 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 7 May 2020 11:59:36 +0100 Subject: [PATCH 06/10] Rewrite remove_tonemarks() (used to be delete_tone()) --- pythainlp/util/__init__.py | 9 +++++- pythainlp/util/normalize.py | 64 ++++++++++++++++++++----------------- tests/test_util.py | 25 ++++++++------- 3 files changed, 57 insertions(+), 41 deletions(-) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 59f8c8ba5..a202a4edb 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -21,6 +21,7 @@ "rank", "reign_year_to_ad", "remove_phantom", + "remove_tonemarks", "remove_zw", "text_to_arabic_digit", "text_to_thai_digit", @@ -50,7 +51,13 @@ ) from .keyboard import eng_to_thai, thai_to_eng from .keywords import find_keyword, rank -from .normalize import delete_tone, normalize, remove_phantom, remove_zw +from .normalize import ( + delete_tone, + normalize, + remove_phantom, + remove_tonemarks, + remove_zw, +) from .numtoword import bahttext, num_to_thaiword from .thai import countthai, isthai, isthaichar from .thaiwordcheck import is_native_thai diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 6d76ed408..0025aa58d 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -13,6 +13,7 @@ _PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" +_RE_REMOVE_PHANTOMS = re.compile(f"^[{_PHANTOM_CHARS}]+") _ZERO_WIDTH_CHARS = "\u200c\u200b" _RE_REMOVE_ZERO_WIDTHS = re.compile(f"[{_ZERO_WIDTH_CHARS}]+") @@ -41,10 +42,10 @@ zip([f"({ch}[ ]*)+" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS) ) -_RE_NOREPEAT_TONEMARKS = re.compile(f"[{tonemarks}]+") 
+_RE_TONEMARKS = re.compile(f"[{tonemarks}]+")
 
-# to be used with _RE_NOREPEAT_TONEMARKS
-def last_char(matchobj):
+
+def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]
@@ -52,9 +53,33 @@ def remove_phantom(text: str) -> str:
     """
     Remove a char that may have been accidentally typed at the text beginning.
     """
-    if text[0] in _PHANTOM_CHARS:
-        text = text[1:]
-    return text
+    return _RE_REMOVE_PHANTOMS.sub("", text)
+
+
+def remove_tonemarks(text: str) -> str:
+    """
+    Remove Thai tonemarks from the text.
+
+    There are 4 tonemarks indicating 4 tones as follows:
+
+    * Down tone (Thai: ไม้เอก _่ )
+    * Falling tone (Thai: ไม้โท _้ )
+    * High tone (Thai: ไม้ตรี ​_๊ )
+    * Rising tone (Thai: ไม้จัตวา _๋ )
+
+    :param str text: text in Thai language
+    :return: text without Thai tonemarks
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import delete_tone
+
+        delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
+        # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
+    """
+    return _RE_TONEMARKS.sub("", text)
 
 
 def remove_zw(text: str) -> str:
@@ -66,7 +91,7 @@ def remove_zw(text: str) -> str:
 
 def normalize(text: str) -> str:
     """
-    This function normalize thai text with normalizing rules as follows:
+    Normalize Thai text with normalizing rules as follows:
 
     * Remove redundant vowels and tonemarks
     * Subsitute "เ" + "เ" with "แ"
@@ -97,7 +122,7 @@ def normalize(text: str) -> str:
         text = re.sub(pair[0], pair[1], text)
 
     # remove repeating tonemarks, use last tonemark
-    text = _RE_NOREPEAT_TONEMARKS.sub(last_char, text)
+    text = _RE_TONEMARKS.sub(_last_char, text)
 
     text = remove_phantom(text)
@@ -106,25 +131,6 @@ def normalize(text: str) -> str:
 
 def delete_tone(text: str) -> str:
     """
-    This function removes Thai tonemarks from the text.
-    There are 4 tonemarks indicating 4 tones as follows:
-
-    * Down tone (Thai: ไม้เอก _่ )
-    * Falling tone (Thai: ไม้โท _้ )
-    * High tone (Thai: ไม้ตรี ​_๊ )
-    * Rising tone (Thai: ไม้จัตวา _๋ )
-
-    :param str text: text in Thai language
-    :return: text without Thai tonemarks
-    :rtype: str
-
-    :Example:
-    ::
-
-        from pythainlp.util import delete_tone
-
-        delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
-        # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
+    DEPRECATED: Please use remove_tonemarks().
""" - chars = [ch for ch in text if ch not in tonemarks] - return "".join(chars) + return remove_tonemarks(text) diff --git a/tests/test_util.py b/tests/test_util.py index 176c6684a..3c3a22d2c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -12,7 +12,6 @@ bahttext, collate, countthai, - delete_tone, digit_to_text, eng_to_thai, find_keyword, @@ -25,6 +24,7 @@ rank, reign_year_to_ad, remove_phantom, + remove_tonemarks, remove_zw, text_to_arabic_digit, text_to_thai_digit, @@ -244,10 +244,6 @@ def test_thai_time(self): # ### pythainlp.util.normalize - def test_delete_tone(self): - self.assertEqual(delete_tone("จิ้น"), "จิน") - self.assertEqual(delete_tone("เก๋า"), "เกา") - def test_normalize(self): self.assertIsNotNone(normalize("พรรค์จันทร์ab์")) @@ -274,6 +270,11 @@ def test_normalize(self): # consonant + follow vowel + tonemark self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32") + # repeating following vowels + self.assertEqual(normalize("กาา"), "กา") + self.assertEqual(normalize("กา า า า"), "กา") + self.assertEqual(normalize("กา าาะา"), "กาะา") + # repeating tonemarks self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48") @@ -283,15 +284,17 @@ def test_normalize(self): normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49" ) - # tonemark at the beginning of text + # remove tonemark at the beginning of text self.assertEqual(remove_phantom("\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_phantom("\u0e48\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_phantom("\u0e48\u0e49\u0e01"), "\u0e01") + self.assertEqual(remove_phantom("\u0e48\u0e01\u0e48"), "\u0e01\u0e48") - # repeating following vowels - self.assertEqual(normalize("กาา"), "กา") - self.assertEqual(normalize("กา า า า"), "กา") - self.assertEqual(normalize("กา าาะา"), "กาะา") + # removing tonemarks + self.assertEqual(remove_tonemarks("จิ้น"), "จิน") + self.assertEqual(remove_tonemarks("เก๋า"), "เกา") - # zero width chars + # removing zero width chars self.assertEqual(remove_zw("กา\u200b"), "กา") self.assertEqual(remove_zw("ก\u200cา"), "กา") self.assertEqual(remove_zw("\u200bกา"), "กา") From ba3a22e0594a7435d1a2f56d6001e172f3f88d58 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 7 May 2020 19:23:44 +0700 Subject: [PATCH 07/10] Update PyThaiNLP.util Docs --- docs/api/util.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/api/util.rst b/docs/api/util.rst index 4d41da6d1..b8614ce13 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -24,6 +24,9 @@ Modules .. autofunction:: num_to_thaiword .. autofunction:: rank .. autofunction:: reign_year_to_ad +.. autofunction:: remove_phantom +.. autofunction:: remove_tonemarks +.. autofunction:: remove_zw .. autofunction:: thai_time .. autofunction:: text_to_arabic_digit .. autofunction:: text_to_thai_digit From a37202319b581ee023728dc9b4f5a329c681fb9a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 7 May 2020 18:27:18 +0100 Subject: [PATCH 08/10] Replace regex with faster while loop string replace --- docs/api/util.rst | 2 +- pythainlp/util/__init__.py | 4 ++-- pythainlp/util/normalize.py | 19 ++++++++++++------- tests/test_util.py | 6 +++--- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/api/util.rst b/docs/api/util.rst index b8614ce13..1b1cbcdb9 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -25,7 +25,7 @@ Modules .. autofunction:: rank .. autofunction:: reign_year_to_ad .. autofunction:: remove_phantom -.. autofunction:: remove_tonemarks +.. 
 .. autofunction:: remove_zw
 .. autofunction:: thai_time
 .. autofunction:: text_to_arabic_digit
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index d3d46feae..a586ae316 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -23,7 +23,7 @@
     "rank",
     "reign_year_to_ad",
     "remove_phantom",
-    "remove_tonemarks",
+    "remove_tonemark",
     "remove_zw",
     "text_to_arabic_digit",
     "text_to_thai_digit",
@@ -56,7 +56,7 @@
     delete_tone,
     normalize,
     remove_phantom,
-    remove_tonemarks,
+    remove_tonemark,
     remove_zw,
 )
 from pythainlp.util.numtoword import bahttext, num_to_thaiword
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 0025aa58d..a1886ca4b 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -16,7 +16,6 @@
 _RE_REMOVE_PHANTOMS = re.compile(f"^[{_PHANTOM_CHARS}]+")
 
 _ZERO_WIDTH_CHARS = "\u200c\u200b"
-_RE_REMOVE_ZERO_WIDTHS = re.compile(f"[{_ZERO_WIDTH_CHARS}]+")
 
 _REORDER_PAIRS = [
     ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
@@ -56,9 +55,9 @@ def remove_phantom(text: str) -> str:
     return _RE_REMOVE_PHANTOMS.sub("", text)
 
 
-def remove_tonemarks(text: str) -> str:
+def remove_tonemark(text: str) -> str:
     """
-    Remove Thai tonemarks from the text.
+    Remove all Thai tonemarks from the text.
 
     There are 4 tonemarks indicating 4 tones as follows:
 
@@ -79,14 +78,20 @@ def remove_tonemarks(text: str) -> str:
         delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
         # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
     """
-    return _RE_TONEMARKS.sub("", text)
+    for ch in tonemarks:
+        while ch in text:
+            text = text.replace(ch, "")
+    return text
 
 
 def remove_zw(text: str) -> str:
     """
     Remove zero-width characters.
     """
-    return _RE_REMOVE_ZERO_WIDTHS.sub("", text)
+    for ch in _ZERO_WIDTH_CHARS:
+        while ch in text:
+            text = text.replace(ch, "")
+    return text
 
 
 def normalize(text: str) -> str:
@@ -131,6 +136,6 @@ def normalize(text: str) -> str:
 
 def delete_tone(text: str) -> str:
     """
-    DEPRECATED: Please use remove_tonemarks().
+    DEPRECATED: Please use remove_tonemark().
     """
-    return remove_tonemarks(text)
+    return remove_tonemark(text)
diff --git a/tests/test_util.py b/tests/test_util.py
index 8e38cce72..380b4d292 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -28,7 +28,7 @@
     rank,
     reign_year_to_ad,
     remove_phantom,
-    remove_tonemarks,
+    remove_tonemark,
     remove_zw,
     text_to_arabic_digit,
     text_to_thai_digit,
@@ -321,8 +321,8 @@ def test_normalize(self):
         self.assertEqual(remove_phantom("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")
 
         # removing tonemarks
-        self.assertEqual(remove_tonemarks("จิ้น"), "จิน")
-        self.assertEqual(remove_tonemarks("เก๋า"), "เกา")
+        self.assertEqual(remove_tonemark("จิ้น"), "จิน")
+        self.assertEqual(remove_tonemark("เก๋า"), "เกา")
 
         # removing zero width chars
         self.assertEqual(remove_zw("กา\u200b"), "กา")

From 1f8eb5c8999c038b0b26979ed08f57e63a50e8b0 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Thu, 7 May 2020 19:19:08 +0100
Subject: [PATCH 09/10] Add remove_dup_spaces

---
 docs/api/util.rst | 1 +
 pythainlp/util/__init__.py | 2 ++
 pythainlp/util/normalize.py | 17 +++++++++++++++++
 tests/test_util.py | 5 +++++
 4 files changed, 25 insertions(+)

diff --git a/docs/api/util.rst b/docs/api/util.rst
index 1b1cbcdb9..cb02e9e07 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -24,6 +24,7 @@ Modules
 .. autofunction:: num_to_thaiword
 .. autofunction:: rank
 .. autofunction:: reign_year_to_ad
+.. autofunction:: remove_dup_spaces
 .. autofunction:: remove_phantom
 .. autofunction:: remove_tonemark
 .. autofunction:: remove_zw
 .. autofunction:: thai_time
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index a586ae316..1958e7e7f 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -22,6 +22,7 @@
     "num_to_thaiword",
     "rank",
     "reign_year_to_ad",
+    "remove_dup_spaces",
     "remove_phantom",
     "remove_tonemark",
     "remove_zw",
     "text_to_arabic_digit",
@@ -55,6 +56,7 @@
 from pythainlp.util.normalize import (
     delete_tone,
     normalize,
+    remove_dup_spaces,
     remove_phantom,
     remove_tonemark,
     remove_zw,
 )
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index a1886ca4b..02b363aff 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -43,11 +43,27 @@
 _RE_TONEMARKS = re.compile(f"[{tonemarks}]+")
 
+_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
+
 
 def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]
 
 
+def remove_dup_spaces(text: str) -> str:
+    """
+    Remove duplicate spaces. Replace multiple spaces with one space.
+
+    Multiple newline characters and empty lines will be replaced
+    with one newline character.
+    """
+    while "  " in text:
+        text = text.replace("  ", " ")
+    text = _RE_REMOVE_NEWLINES.sub("\n", text)
+    text = text.strip()
+    return text
+
+
 def remove_phantom(text: str) -> str:
     """
     Remove a char that may have been accidentally typed at the text beginning.
@@ -120,6 +136,7 @@ def normalize(text: str) -> str:
         # output: นานา
     """
     text = remove_zw(text)
+    text = remove_dup_spaces(text)
 
     for pair in _REORDER_PAIRS:
         text = re.sub(pair[0], pair[1], text)
diff --git a/tests/test_util.py b/tests/test_util.py
index 380b4d292..7cd6ca8e5 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -27,6 +27,7 @@
     num_to_thaiword,
     rank,
     reign_year_to_ad,
+    remove_dup_spaces,
     remove_phantom,
     remove_tonemark,
     remove_zw,
@@ -320,6 +321,10 @@ def test_normalize(self):
         self.assertEqual(remove_phantom("\u0e48\u0e49\u0e01"), "\u0e01")
         self.assertEqual(remove_phantom("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")
 
+        # remove duplicate spaces
+        self.assertEqual(remove_dup_spaces("  ab  c d  "), "ab c d")
+        self.assertEqual(remove_dup_spaces("\nab  c   \n d \n"), "ab c\nd")
+
         # removing tonemarks
         self.assertEqual(remove_tonemark("จิ้น"), "จิน")
         self.assertEqual(remove_tonemark("เก๋า"), "เกา")

From 0be2ea6da01b4236342442d9878d9848a97db773 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Fri, 8 May 2020 09:32:28 +0100
Subject: [PATCH 10/10] Add docstrings

Rename remove_phantom() to remove_dangling()
---
 docs/api/util.rst | 2 +-
 pythainlp/util/__init__.py | 4 +-
 pythainlp/util/normalize.py | 137 ++++++++++++++++++++++++++++--------
 tests/test_util.py | 26 +++----
 4 files changed, 124 insertions(+), 45 deletions(-)

diff --git a/docs/api/util.rst b/docs/api/util.rst
index cb02e9e07..9a90e1c55 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -24,8 +24,8 @@ Modules
 .. autofunction:: num_to_thaiword
 .. autofunction:: rank
 .. autofunction:: reign_year_to_ad
+.. autofunction:: remove_dangling
 .. autofunction:: remove_dup_spaces
-.. autofunction:: remove_phantom
 .. autofunction:: remove_tonemark
 .. autofunction:: remove_zw
 .. autofunction:: thai_time
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index 1958e7e7f..d91ea38fa 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -22,8 +22,8 @@
     "num_to_thaiword",
     "rank",
     "reign_year_to_ad",
+    "remove_dangling",
     "remove_dup_spaces",
-    "remove_phantom",
     "remove_tonemark",
     "remove_zw",
     "text_to_arabic_digit",
@@ -56,8 +56,8 @@
 from pythainlp.util.normalize import (
     delete_tone,
     normalize,
+    remove_dangling,
     remove_dup_spaces,
-    remove_phantom,
     remove_tonemark,
     remove_zw,
 )
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 02b363aff..92e27fee4 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -12,10 +12,10 @@ from pythainlp import thai_tonemarks as tonemarks
 
-_PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
-_RE_REMOVE_PHANTOMS = re.compile(f"^[{_PHANTOM_CHARS}]+")
+_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
+_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")
 
-_ZERO_WIDTH_CHARS = "\u200c\u200b"
+_ZERO_WIDTH_CHARS = "\u200b\u200c"  # ZWSP, ZWNJ
 
 _REORDER_PAIRS = [
     ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
     (
         f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
@@ -50,12 +50,36 @@ def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]
 
 
+def remove_dangling(text: str) -> str:
+    """
+    Remove Thai non-base characters at the beginning of text.
+
+    This is a common "typo", especially for an input field in a form,
+    as these non-base characters can be visually hidden from the user,
+    who may have accidentally typed them in.
+
+    A character to be removed should be both:
+
+    * tone mark, above vowel, below vowel, or non-base sign AND
+    * located at the beginning of the text
+
+    :param str text: input text
+    :return: text without dangling Thai characters at the beginning
+    :rtype: str
+    """
+    return _RE_REMOVE_DANGLINGS.sub("", text)
+
+
 def remove_dup_spaces(text: str) -> str:
     """
     Remove duplicate spaces. Replace multiple spaces with one space.
 
     Multiple newline characters and empty lines will be replaced
     with one newline character.
+
+    :param str text: input text
+    :return: text without duplicated spaces and newlines
+    :rtype: str
     """
     while "  " in text:
         text = text.replace("  ", " ")
@@ -64,26 +88,23 @@ def remove_dup_spaces(text: str) -> str:
     return text
 
 
-def remove_phantom(text: str) -> str:
-    """
-    Remove a char that may have been accidentally typed at the text beginning.
-    """
-    return _RE_REMOVE_PHANTOMS.sub("", text)
-
-
 def remove_tonemark(text: str) -> str:
     """
-    Remove all Thai tonemarks from the text.
+    Remove all Thai tone marks from the text.
 
-    There are 4 tonemarks indicating 4 tones as follows:
+    Thai script has four tone marks indicating four tones as follows:
 
     * Down tone (Thai: ไม้เอก _่ )
     * Falling tone (Thai: ไม้โท _้ )
     * High tone (Thai: ไม้ตรี ​_๊ )
     * Rising tone (Thai: ไม้จัตวา _๋ )
 
-    :param str text: text in Thai language
-    :return: text without Thai tonemarks
+    Putting a wrong tone mark is a common mistake in Thai writing.
+    By removing tone marks from the string, it could be used
+    for approximate string matching.
+
+    :param str text: input text
+    :return: text without Thai tone marks
     :rtype: str
 
     :Example:
@@ -103,22 +124,88 @@ def remove_tonemark(text: str) -> str:
 
 def remove_zw(text: str) -> str:
     """
     Remove zero-width characters.
+
+    These non-visible characters may cause unexpected results from the
+    user's point of view. Removing them can make string matching more robust.
+
+    Characters to be removed:
+
+    * Zero-width space (ZWSP)
+    * Zero-width non-joiner (ZWNJ)
+
+    :param str text: input text
+    :return: text without zero-width characters
+    :rtype: str
     """
     for ch in _ZERO_WIDTH_CHARS:
         while ch in text:
             text = text.replace(ch, "")
+
     return text
+
+
+def reorder_vowels(text: str) -> str:
+    """
+    Reorder vowels and tone marks to the standard logical order/spelling.
+
+    Characters in input text will be reordered/transformed,
+    according to these rules:
+
+    * Sara E + Sara E -> Sara Ae
+    * Nikhahit + Sara Aa -> Sara Am
+    * tone mark + non-base vowel -> non-base vowel + tone mark
+    * follow vowel + tone mark -> tone mark + follow vowel
+
+    :param str text: input text
+    :return: text with vowels and tone marks in the standard logical order
+    :rtype: str
+    """
+    for pair in _REORDER_PAIRS:
+        text = re.sub(pair[0], pair[1], text)
+
+    return text
+
+
+def remove_repeat_vowels(text: str) -> str:
+    """
+    Remove repeating vowels, tone marks, and signs.
+
+    This function will call reorder_vowels() first, to make sure that
+    double Sara E will be converted to Sara Ae and not be removed.
+
+    :param str text: input text
+    :return: text without repeating Thai vowels, tone marks, and signs
+    :rtype: str
+    """
+    text = reorder_vowels(text)
+    for pair in _NOREPEAT_PAIRS:
+        text = re.sub(pair[0], pair[1], text)
+
+    # remove repeating tone marks, use last tone mark
+    text = _RE_TONEMARKS.sub(_last_char, text)
+
+    return text
 
 
 def normalize(text: str) -> str:
     """
-    Normalize Thai text with normalizing rules as follows:
+    Normalize and clean Thai text with normalizing rules as follows:
 
-    * Remove redundant vowels and tonemarks
-    * Subsitute "เ" + "เ" with "แ"
+    * Remove zero-width spaces
+    * Remove duplicate spaces
+    * Reorder tone marks and vowels to standard order/spelling
+    * Remove duplicate vowels and signs
+    * Remove duplicate tone marks
+    * Remove dangling non-base characters at the beginning of text
 
-    :param str text: thai text to be normalized
-    :return: normalized Thai text according to the fules
+    normalize() simply calls remove_zw(), remove_dup_spaces(),
+    remove_repeat_vowels(), and remove_dangling(), in that order.
+
+    If a user wants to customize the selection or the order of rules
+    to be applied, they can choose to call those functions by themselves.
+
+    :param str text: input text
+    :return: normalized text according to the rules
+    :rtype: str
 
     :Example:
     ::
 
         normalize('เเปลก')
         # output: แปลก
 
         normalize('นานาาา')
         # output: นานา
     """
     text = remove_zw(text)
     text = remove_dup_spaces(text)
-
-    for pair in _REORDER_PAIRS:
-        text = re.sub(pair[0], pair[1], text)
-    for pair in _NOREPEAT_PAIRS:
-        text = re.sub(pair[0], pair[1], text)
-
-    # remove repeating tonemarks, use last tonemark
-    text = _RE_TONEMARKS.sub(_last_char, text)
-
-    text = remove_phantom(text)
+    text = remove_repeat_vowels(text)
+    text = remove_dangling(text)
 
     return text
diff --git a/tests/test_util.py b/tests/test_util.py
index 7cd6ca8e5..cdd020a5c 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -27,6 +27,7 @@
     num_to_thaiword,
     rank,
     reign_year_to_ad,
+    remove_dangling,
     remove_dup_spaces,
-    remove_phantom,
     remove_tonemark,
     remove_zw,
     text_to_arabic_digit,
@@ -281,24 +281,24 @@ def test_normalize(self):
         # sara e + sara e
         self.assertEqual(normalize("เเปลก"), "แปลก")
 
-        # consonant + follow vowel + tonemark
+        # consonant + follow vowel + tone mark
         self.assertEqual(normalize("\u0e01\u0e30\u0e48"), "\u0e01\u0e48\u0e30")
 
         # consonant + nikhahit + sara aa
         self.assertEqual(normalize("นํา"), "นำ")
         self.assertEqual(normalize("\u0e01\u0e4d\u0e32"), "\u0e01\u0e33")
 
-        # consonant + nikhahit + tonemark + sara aa
+        # consonant + nikhahit + tone mark + sara aa
         self.assertEqual(
             normalize("\u0e01\u0e4d\u0e48\u0e32"), "\u0e01\u0e48\u0e33"
         )
 
-        # consonant + tonemark + nikhahit + sara aa
+        # consonant + tone mark + nikhahit + sara aa
         self.assertEqual(
             normalize("\u0e01\u0e48\u0e4d\u0e32"), "\u0e01\u0e48\u0e33"
         )
 
-        # consonant + follow vowel + tonemark
+        # consonant + follow vowel + tone mark
         self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32")
 
         # repeating following vowels
@@ -306,26 +306,26 @@ def test_normalize(self):
         self.assertEqual(normalize("กา า า า"), "กา")
         self.assertEqual(normalize("กา าาะา"), "กาะา")
 
-        # repeating tonemarks
+        # repeating tone marks
         self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48")
 
-        # repeating different tonemarks
+        # repeating different tone marks
         self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49")
         self.assertEqual(
             normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49"
         )
 
-        # remove tonemark at the beginning of text
+        # remove tone mark at the beginning of text
         self.assertEqual(remove_dangling("\u0e48\u0e01"), "\u0e01")
         self.assertEqual(remove_dangling("\u0e48\u0e48\u0e01"), "\u0e01")
         self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01")
         self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")
 
         # remove duplicate spaces
         self.assertEqual(remove_dup_spaces("  ab  c d  "), "ab c d")
         self.assertEqual(remove_dup_spaces("\nab  c   \n d \n"), "ab c\nd")
 
-        # removing tonemarks
+        # removing tone marks
         self.assertEqual(remove_tonemark("จิ้น"), "จิน")
         self.assertEqual(remove_tonemark("เก๋า"), "เกา")