From f2a3ab384c45935b6c4976755b3c4ae4d1c2c74f Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Wed, 6 May 2020 19:29:31 +0100
Subject: [PATCH 01/10] - Remove repetitive tonemarks
 - Remove "phantom chars"

---
 pythainlp/util/normalize.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 86e267c05..35e16737f 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -12,9 +12,9 @@ from pythainlp import thai_tonemarks as tonemarks
 
-# VOWELS + Phinthu,Thanthakhat, Nikhahit, Yamakkan
+# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
 _NO_REPEAT_CHARS = (
-    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
+    f"{follow_v}{lead_v}{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
 )
 _NORMALIZE_REPETITION = list(
     zip([ch + "+" for ch in _NO_REPEAT_CHARS], _NO_REPEAT_CHARS)
 )
@@ -25,17 +25,19 @@
     (
         f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
         "\\2\\1",
-    ),  # TONE/Thanthakhat+ + A/BVOWELV+ -> A/BVOWEL+ + TONE/Thanthakhat+
+    ),  # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat
     (
         f"\u0e4d([{tonemarks}]*)\u0e32",
         "\\1\u0e33",
-    ),  # Nikhahit + TONEMARK* + Sara Aa -> TONEMARK* + Sara Am
+    ),  # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am
     (
         f"([{follow_v}]+)([{tonemarks}]+)",
         "\\2\\1",
-    ),  # FOLLOWVOWEL+ + TONEMARK+ -> TONEMARK+ + FOLLOWVOWEL+
+    ),  # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
 ]
 
+_PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
+
 
 def normalize(text: str) -> str:
     """
@@ -66,6 +68,9 @@ def normalize(text: str) -> str:
         text = re.sub(data[0], data[1], text)
     for data in _NORMALIZE_REPETITION:
         text = re.sub(data[0], data[1], text)
+    # remove a char that may have been accidentally typed in at the beginning
+    if text[0] in _PHANTOM_CHARS:
+        text = text[1:]
     return text

From 088a1f35f82c3a36c6de3ee1f90bcd25d20b18c1 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Wed, 6 May 2020 20:32:49 +0100
Subject: [PATCH 02/10] Removing repeating different tonemarks

---
 pythainlp/util/normalize.py | 39 ++++++++++++++++++++++++-------------
 tests/test_util.py | 17 ++++++++++++++++
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 35e16737f..52554d84f 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -12,15 +12,9 @@ from pythainlp import thai_tonemarks as tonemarks
 
-# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
-_NO_REPEAT_CHARS = (
-    f"{follow_v}{lead_v}{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
-)
-_NORMALIZE_REPETITION = list(
-    zip([ch + "+" for ch in _NO_REPEAT_CHARS], _NO_REPEAT_CHARS)
-)
+_PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
 
-_NORMALIZE_REORDER = [
+_REORDER_PAIRS = [
     ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
     (
         f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
@@ -36,7 +30,19 @@
     ),  # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
 ]
 
-_PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
+# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
+_NOREPEAT_CHARS = (
+    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
+)
+_NOREPEAT_PAIRS = list(
+    zip([f"({ch}[ ]*)+" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS)
+)
+
+_RE_NOREPEAT_TONEMARKS = re.compile(f"[{tonemarks}]+")
+
+# to be used with _RE_NOREPEAT_TONEMARKS
+def last_char(matchobj):
+    return matchobj.group(0)[-1]
 
 
 def normalize(text: str) -> str:
@@ -64,13 +70,18 @@ def normalize(text: str) -> str:
         normalize('นานาาา')
         # output: นานา
     """
-    for data in _NORMALIZE_REORDER:
-        text = re.sub(data[0], data[1], text)
-    for data in _NORMALIZE_REPETITION:
-        text = re.sub(data[0], data[1], text)
+    for pair in _REORDER_PAIRS:
+        text = re.sub(pair[0], pair[1], text)
+    for pair in _NOREPEAT_PAIRS:
+        text = re.sub(pair[0], pair[1], text)
+
+    # remove repeating tonemarks, use last tonemark
+    text = _RE_NOREPEAT_TONEMARKS.sub(last_char, text)
+
     # remove a char that may have been accidentally typed in at the beginning
-    if text[0] in _PHANTOM_CHARS:
+    if text[0] in _PHANTOM_CHARS:
         text = text[1:]
+
     return text
diff --git a/tests/test_util.py b/tests/test_util.py
index 8efa1e0e3..0bd0550c1 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -272,6 +272,23 @@ def test_normalize(self):
         # consonant + follow vowel + tonemark
         self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32")
 
+        # repeating tonemarks
+        self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48")
+
+        # repeating different tonemarks
+        self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49")
+        self.assertEqual(
+            normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49"
+        )
+
+        # tonemark at the beginning of text
+        self.assertEqual(normalize("\u0e48\u0e01"), "\u0e01")
+
+        # repeating following vowels
+        self.assertEqual(normalize("กาา"), "กา")
+        self.assertEqual(normalize("กา า า า"), "กา")
+        self.assertEqual(normalize("กา าาะา"), "กาะา")
+
     # ### pythainlp.util.thai
 
     def test_countthai(self):

From 5abc5f376984f24666481f610c9681b4697871b2 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Wed, 6 May 2020 20:34:55 +0100
Subject: [PATCH 03/10] Fix PEP8

---
 pythainlp/util/normalize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 52554d84f..57a4799a4 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -11,7 +11,6 @@ from pythainlp import thai_lead_vowels as lead_v
 from pythainlp import thai_tonemarks as tonemarks
 
-
 _PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
 
 _REORDER_PAIRS = [
@@ -40,6 +39,7 @@
 _RE_NOREPEAT_TONEMARKS = re.compile(f"[{tonemarks}]+")
 
+
 # to be used with _RE_NOREPEAT_TONEMARKS
 def last_char(matchobj):
     return matchobj.group(0)[-1]

From fa464b8c420da681976bcc0dd66bc2fecfcb3d1e Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Thu, 7 May 2020 11:31:42 +0100
Subject: [PATCH 04/10] Remove zero width chars

---
 pythainlp/util/normalize.py | 7 ++++++-
 tests/test_util.py | 6 ++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 57a4799a4..fcfd1e9ea 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -11,8 +11,12 @@ from pythainlp import thai_lead_vowels as lead_v
 from pythainlp import thai_tonemarks as tonemarks
 
+
 _PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
 
+_ZERO_WIDTH_CHARS = "\u200c\u200b"
+_RE_REMOVE_ZERO_WIDTHS = re.compile(f"[{_ZERO_WIDTH_CHARS}]+")
+
 _REORDER_PAIRS = [
     ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
     (
         f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
@@ -66,7 +73,8 @@ def normalize(text: str) -> str:
         normalize('นานาาา')
         # output: นานา
     """
+    text = _RE_REMOVE_ZERO_WIDTHS.sub("", text)
+
     for pair in _REORDER_PAIRS:
         text = re.sub(pair[0], pair[1], text)
     for pair in _NOREPEAT_PAIRS:
diff --git a/tests/test_util.py b/tests/test_util.py
index 0bd0550c1..13ae31885 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -289,6 +289,12 @@ def test_normalize(self):
         self.assertEqual(normalize("กา า า า"), "กา")
         self.assertEqual(normalize("กา าาะา"), "กาะา")
 
+        # zero width chars
+        self.assertEqual(normalize("กา\u200b"), "กา")
+        self.assertEqual(normalize("ก\u200cา"), "กา")
+        self.assertEqual(normalize("\u200bกา"), "กา")
+        self.assertEqual(normalize("กา\u200b\u200c\u200b"), "กา")
+
     # ### pythainlp.util.thai
 
     def test_countthai(self):

From a705898a5627c142e6c50ba9b9e309f1cf50f892 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Thu, 7 May 2020 11:41:39 +0100
Subject: [PATCH 05/10] Make remove phantom and zero-width chars a function

---
 pythainlp/util/__init__.py | 8 +++++---
 pythainlp/util/normalize.py | 22 ++++++++++++++++++----
 tests/test_util.py | 12 +++++++-----
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index ad2a1538a..59f8c8ba5 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -20,6 +20,8 @@
     "num_to_thaiword",
     "rank",
     "reign_year_to_ad",
+    "remove_phantom",
+    "remove_zw",
     "text_to_arabic_digit",
     "text_to_thai_digit",
     "thai_digit_to_arabic_digit",
@@ -36,8 +38,8 @@
 from .date import (
     now_reign_year,
     reign_year_to_ad,
-    thai_strftime,
     thai_day2datetime,
+    thai_strftime,
 )
 from .digitconv import (
     arabic_digit_to_thai_digit,
@@ -48,9 +50,9 @@
 )
 from .keyboard import eng_to_thai, thai_to_eng
 from .keywords import find_keyword, rank
-from .normalize import delete_tone, normalize
+from .normalize import delete_tone, normalize, remove_phantom, remove_zw
 from .numtoword import bahttext, num_to_thaiword
 from .thai import countthai, isthai, isthaichar
-from .time import thai_time, thai_time2time
 from .thaiwordcheck import is_native_thai
+from .time import thai_time, thai_time2time
 from .wordtonum import thaiword_to_num
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index fcfd1e9ea..6d76ed408 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -48,6 +48,22 @@ def last_char(matchobj):
     return matchobj.group(0)[-1]
 
 
+def remove_phantom(text: str) -> str:
+    """
+    Remove a char that may have been accidentally typed at the text beginning.
+    """
+    if text[0] in _PHANTOM_CHARS:
+        text = text[1:]
+    return text
+
+
+def remove_zw(text: str) -> str:
+    """
+    Remove zero-width characters.
+ """ + return _RE_REMOVE_ZERO_WIDTHS.sub("", text) + + def normalize(text: str) -> str: """ This function normalize thai text with normalizing rules as follows: @@ -73,7 +89,7 @@ def normalize(text: str) -> str: normalize('นานาาา') # output: นานา """ - text = _RE_REMOVE_ZERO_WIDTHS.sub("", text) + text = remove_zw(text) for pair in _REORDER_PAIRS: text = re.sub(pair[0], pair[1], text) @@ -83,9 +99,7 @@ def normalize(text: str) -> str: # remove repeating tonemarks, use last tonemark text = _RE_NOREPEAT_TONEMARKS.sub(last_char, text) - # remove a char that may have been accidentally typed in at the beginning - if text[0] in _PHANTOM_CHARS: - text = text[1:] + text = remove_phantom(text) return text diff --git a/tests/test_util.py b/tests/test_util.py index 13ae31885..176c6684a 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -24,6 +24,8 @@ num_to_thaiword, rank, reign_year_to_ad, + remove_phantom, + remove_zw, text_to_arabic_digit, text_to_thai_digit, thai_digit_to_arabic_digit, @@ -282,7 +284,7 @@ def test_normalize(self): ) # tonemark at the beginning of text - self.assertEqual(normalize("\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_phantom("\u0e48\u0e01"), "\u0e01") # repeating following vowels self.assertEqual(normalize("กาา"), "กา") @@ -290,10 +292,10 @@ def test_normalize(self): self.assertEqual(normalize("กา าาะา"), "กาะา") # zero width chars - self.assertEqual(normalize("กา\u200b"), "กา") - self.assertEqual(normalize("ก\u200cา"), "กา") - self.assertEqual(normalize("\u200bกา"), "กา") - self.assertEqual(normalize("กา\u200b\u200c\u200b"), "กา") + self.assertEqual(remove_zw("กา\u200b"), "กา") + self.assertEqual(remove_zw("ก\u200cา"), "กา") + self.assertEqual(remove_zw("\u200bกา"), "กา") + self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา") # ### pythainlp.util.thai From 83f208c51f0bb1d0716cddd9c2345bff3bcc9537 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 7 May 2020 11:59:36 +0100 Subject: [PATCH 06/10] Rewrite remove_tonemarks() (used to be delete_tone()) --- pythainlp/util/__init__.py | 9 +++++- pythainlp/util/normalize.py | 64 ++++++++++++++++++++----------------- tests/test_util.py | 25 ++++++++------- 3 files changed, 57 insertions(+), 41 deletions(-) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 59f8c8ba5..a202a4edb 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -21,6 +21,7 @@ "rank", "reign_year_to_ad", "remove_phantom", + "remove_tonemarks", "remove_zw", "text_to_arabic_digit", "text_to_thai_digit", @@ -50,7 +51,13 @@ ) from .keyboard import eng_to_thai, thai_to_eng from .keywords import find_keyword, rank -from .normalize import delete_tone, normalize, remove_phantom, remove_zw +from .normalize import ( + delete_tone, + normalize, + remove_phantom, + remove_tonemarks, + remove_zw, +) from .numtoword import bahttext, num_to_thaiword from .thai import countthai, isthai, isthaichar from .thaiwordcheck import is_native_thai diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 6d76ed408..0025aa58d 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -13,6 +13,7 @@ _PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e" +_RE_REMOVE_PHANTOMS = re.compile(f"^[{_PHANTOM_CHARS}]+") _ZERO_WIDTH_CHARS = "\u200c\u200b" _RE_REMOVE_ZERO_WIDTHS = re.compile(f"[{_ZERO_WIDTH_CHARS}]+") @@ -41,10 +42,10 @@ zip([f"({ch}[ ]*)+" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS) ) -_RE_NOREPEAT_TONEMARKS = re.compile(f"[{tonemarks}]+") 
+_RE_TONEMARKS = re.compile(f"[{tonemarks}]+")
 
-# to be used with _RE_NOREPEAT_TONEMARKS
-def last_char(matchobj):
+
+def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]
@@ -52,9 +53,33 @@ def remove_phantom(text: str) -> str:
     """
     Remove a char that may have been accidentally typed at the text beginning.
     """
-    if text[0] in _PHANTOM_CHARS:
-        text = text[1:]
-    return text
+    return _RE_REMOVE_PHANTOMS.sub("", text)
+
+
+def remove_tonemarks(text: str) -> str:
+    """
+    Remove Thai tonemarks from the text.
+
+    There are 4 tonemarks indicating 4 tones as follows:
+
+    * Down tone (Thai: ไม้เอก _่ )
+    * Falling tone (Thai: ไม้โท _้ )
+    * High tone (Thai: ไม้ตรี ​_๊ )
+    * Rising tone (Thai: ไม้จัตวา _๋ )
+
+    :param str text: text in Thai language
+    :return: text without Thai tonemarks
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import delete_tone
+
+        delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
+        # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
+    """
+    return _RE_TONEMARKS.sub("", text)
 
 
 def remove_zw(text: str) -> str:
@@ -66,7 +91,7 @@ def remove_zw(text: str) -> str:
 
 def normalize(text: str) -> str:
     """
-    This function normalize thai text with normalizing rules as follows:
+    Normalize Thai text with normalizing rules as follows:
 
     * Remove redundant vowels and tonemarks
     * Subsitute "เ" + "เ" with "แ"
@@ -97,7 +122,7 @@ def normalize(text: str) -> str:
         text = re.sub(pair[0], pair[1], text)
 
     # remove repeating tonemarks, use last tonemark
-    text = _RE_NOREPEAT_TONEMARKS.sub(last_char, text)
+    text = _RE_TONEMARKS.sub(_last_char, text)
 
     text = remove_phantom(text)
@@ -106,25 +131,6 @@ def normalize(text: str) -> str:
 
 def delete_tone(text: str) -> str:
     """
-    This function removes Thai tonemarks from the text.
-    There are 4 tonemarks indicating 4 tones as follows:
-
-    * Down tone (Thai: ไม้เอก _่ )
-    * Falling tone (Thai: ไม้โท _้ )
-    * High tone (Thai: ไม้ตรี ​_๊ )
-    * Rising tone (Thai: ไม้จัตวา _๋ )
-
-    :param str text: text in Thai language
-    :return: text without Thai tonemarks
-    :rtype: str
-
-    :Example:
-    ::
-
-        from pythainlp.util import delete_tone
-
-        delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
-        # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
+    DEPRECATED: Please use remove_tonemarks().
""" - chars = [ch for ch in text if ch not in tonemarks] - return "".join(chars) + return remove_tonemarks(text) diff --git a/tests/test_util.py b/tests/test_util.py index 176c6684a..3c3a22d2c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -12,7 +12,6 @@ bahttext, collate, countthai, - delete_tone, digit_to_text, eng_to_thai, find_keyword, @@ -25,6 +24,7 @@ rank, reign_year_to_ad, remove_phantom, + remove_tonemarks, remove_zw, text_to_arabic_digit, text_to_thai_digit, @@ -244,10 +244,6 @@ def test_thai_time(self): # ### pythainlp.util.normalize - def test_delete_tone(self): - self.assertEqual(delete_tone("จิ้น"), "จิน") - self.assertEqual(delete_tone("เก๋า"), "เกา") - def test_normalize(self): self.assertIsNotNone(normalize("พรรค์จันทร์ab์")) @@ -274,6 +270,11 @@ def test_normalize(self): # consonant + follow vowel + tonemark self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32") + # repeating following vowels + self.assertEqual(normalize("กาา"), "กา") + self.assertEqual(normalize("กา า า า"), "กา") + self.assertEqual(normalize("กา าาะา"), "กาะา") + # repeating tonemarks self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48") @@ -283,15 +284,17 @@ def test_normalize(self): normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49" ) - # tonemark at the beginning of text + # remove tonemark at the beginning of text self.assertEqual(remove_phantom("\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_phantom("\u0e48\u0e48\u0e01"), "\u0e01") + self.assertEqual(remove_phantom("\u0e48\u0e49\u0e01"), "\u0e01") + self.assertEqual(remove_phantom("\u0e48\u0e01\u0e48"), "\u0e01\u0e48") - # repeating following vowels - self.assertEqual(normalize("กาา"), "กา") - self.assertEqual(normalize("กา า า า"), "กา") - self.assertEqual(normalize("กา าาะา"), "กาะา") + # removing tonemarks + self.assertEqual(remove_tonemarks("จิ้น"), "จิน") + self.assertEqual(remove_tonemarks("เก๋า"), "เกา") - # zero width chars + # removing zero width chars self.assertEqual(remove_zw("กา\u200b"), "กา") self.assertEqual(remove_zw("ก\u200cา"), "กา") self.assertEqual(remove_zw("\u200bกา"), "กา") From ba3a22e0594a7435d1a2f56d6001e172f3f88d58 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 7 May 2020 19:23:44 +0700 Subject: [PATCH 07/10] Update PyThaiNLP.util Docs --- docs/api/util.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/api/util.rst b/docs/api/util.rst index 4d41da6d1..b8614ce13 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -24,6 +24,9 @@ Modules .. autofunction:: num_to_thaiword .. autofunction:: rank .. autofunction:: reign_year_to_ad +.. autofunction:: remove_phantom +.. autofunction:: remove_tonemarks +.. autofunction:: remove_zw .. autofunction:: thai_time .. autofunction:: text_to_arabic_digit .. autofunction:: text_to_thai_digit From a37202319b581ee023728dc9b4f5a329c681fb9a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 7 May 2020 18:27:18 +0100 Subject: [PATCH 08/10] Replace regex with faster while loop string replace --- docs/api/util.rst | 2 +- pythainlp/util/__init__.py | 4 ++-- pythainlp/util/normalize.py | 19 ++++++++++++------- tests/test_util.py | 6 +++--- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/api/util.rst b/docs/api/util.rst index b8614ce13..1b1cbcdb9 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -25,7 +25,7 @@ Modules .. autofunction:: rank .. autofunction:: reign_year_to_ad .. autofunction:: remove_phantom -.. autofunction:: remove_tonemarks +.. 
 .. autofunction:: remove_zw
 .. autofunction:: thai_time
 .. autofunction:: text_to_arabic_digit
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index d3d46feae..a586ae316 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -23,7 +23,7 @@
     "rank",
     "reign_year_to_ad",
     "remove_phantom",
-    "remove_tonemarks",
+    "remove_tonemark",
     "remove_zw",
     "text_to_arabic_digit",
     "text_to_thai_digit",
@@ -56,7 +56,7 @@
     delete_tone,
     normalize,
     remove_phantom,
-    remove_tonemarks,
+    remove_tonemark,
     remove_zw,
 )
 from pythainlp.util.numtoword import bahttext, num_to_thaiword
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 0025aa58d..a1886ca4b 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -16,7 +16,6 @@
 _RE_REMOVE_PHANTOMS = re.compile(f"^[{_PHANTOM_CHARS}]+")
 
 _ZERO_WIDTH_CHARS = "\u200c\u200b"
-_RE_REMOVE_ZERO_WIDTHS = re.compile(f"[{_ZERO_WIDTH_CHARS}]+")
 
 _REORDER_PAIRS = [
     ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
@@ -56,9 +55,9 @@ def remove_phantom(text: str) -> str:
     return _RE_REMOVE_PHANTOMS.sub("", text)
 
 
-def remove_tonemarks(text: str) -> str:
+def remove_tonemark(text: str) -> str:
     """
-    Remove Thai tonemarks from the text.
+    Remove all Thai tonemarks from the text.
 
     There are 4 tonemarks indicating 4 tones as follows:
 
@@ -79,14 +78,20 @@ def remove_tonemarks(text: str) -> str:
         delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
         # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
     """
-    return _RE_TONEMARKS.sub("", text)
+    for ch in tonemarks:
+        while ch in text:
+            text = text.replace(ch, "")
+    return text
 
 
 def remove_zw(text: str) -> str:
     """
     Remove zero-width characters.
     """
-    return _RE_REMOVE_ZERO_WIDTHS.sub("", text)
+    for ch in _ZERO_WIDTH_CHARS:
+        while ch in text:
+            text = text.replace(ch, "")
+    return text
 
 
 def normalize(text: str) -> str:
@@ -131,6 +136,6 @@ def normalize(text: str) -> str:
 
 def delete_tone(text: str) -> str:
     """
-    DEPRECATED: Please use remove_tonemarks().
+    DEPRECATED: Please use remove_tonemark().
     """
-    return remove_tonemarks(text)
+    return remove_tonemark(text)
diff --git a/tests/test_util.py b/tests/test_util.py
index 8e38cce72..380b4d292 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -28,7 +28,7 @@
     rank,
     reign_year_to_ad,
     remove_phantom,
-    remove_tonemarks,
+    remove_tonemark,
     remove_zw,
     text_to_arabic_digit,
     text_to_thai_digit,
@@ -321,8 +321,8 @@ def test_normalize(self):
         self.assertEqual(remove_phantom("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")
 
         # removing tonemarks
-        self.assertEqual(remove_tonemarks("จิ้น"), "จิน")
-        self.assertEqual(remove_tonemarks("เก๋า"), "เกา")
+        self.assertEqual(remove_tonemark("จิ้น"), "จิน")
+        self.assertEqual(remove_tonemark("เก๋า"), "เกา")
 
         # removing zero width chars
         self.assertEqual(remove_zw("กา\u200b"), "กา")

From 1f8eb5c8999c038b0b26979ed08f57e63a50e8b0 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Thu, 7 May 2020 19:19:08 +0100
Subject: [PATCH 09/10] Add remove_dup_spaces

---
 docs/api/util.rst | 1 +
 pythainlp/util/__init__.py | 2 ++
 pythainlp/util/normalize.py | 17 +++++++++++++++++
 tests/test_util.py | 5 +++++
 4 files changed, 25 insertions(+)

diff --git a/docs/api/util.rst b/docs/api/util.rst
index 1b1cbcdb9..cb02e9e07 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -24,6 +24,7 @@ Modules
 .. autofunction:: num_to_thaiword
 .. autofunction:: rank
 .. autofunction:: reign_year_to_ad
+.. autofunction:: remove_dup_spaces
 .. autofunction:: remove_phantom
 .. autofunction:: remove_tonemark
 .. autofunction:: remove_zw
 .. autofunction:: thai_time
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index a586ae316..1958e7e7f 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -22,6 +22,7 @@
     "num_to_thaiword",
     "rank",
     "reign_year_to_ad",
+    "remove_dup_spaces",
     "remove_phantom",
     "remove_tonemark",
     "remove_zw",
     "text_to_arabic_digit",
@@ -55,6 +56,7 @@
 from pythainlp.util.normalize import (
     delete_tone,
     normalize,
+    remove_dup_spaces,
     remove_phantom,
     remove_tonemark,
     remove_zw,
 )
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index a1886ca4b..02b363aff 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -43,11 +43,27 @@
 _RE_TONEMARKS = re.compile(f"[{tonemarks}]+")
 
+_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
+
 
 def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]
 
 
+def remove_dup_spaces(text: str) -> str:
+    """
+    Remove duplicate spaces. Replace multiple spaces with one space.
+
+    Multiple newline characters and empty lines will be replaced
+    with one newline character.
+    """
+    while "  " in text:
+        text = text.replace("  ", " ")
+    text = _RE_REMOVE_NEWLINES.sub("\n", text)
+    text = text.strip()
+    return text
+
+
 def remove_phantom(text: str) -> str:
     """
     Remove a char that may have been accidentally typed at the text beginning.
@@ -120,6 +136,7 @@ def normalize(text: str) -> str:
         # output: นานา
     """
     text = remove_zw(text)
+    text = remove_dup_spaces(text)
 
     for pair in _REORDER_PAIRS:
         text = re.sub(pair[0], pair[1], text)
diff --git a/tests/test_util.py b/tests/test_util.py
index 380b4d292..7cd6ca8e5 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -27,6 +27,7 @@
     num_to_thaiword,
     rank,
     reign_year_to_ad,
+    remove_dup_spaces,
     remove_phantom,
     remove_tonemark,
     remove_zw,
@@ -320,6 +321,10 @@ def test_normalize(self):
         self.assertEqual(remove_phantom("\u0e48\u0e49\u0e01"), "\u0e01")
         self.assertEqual(remove_phantom("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")
 
+        # remove duplicate spaces
+        self.assertEqual(remove_dup_spaces("  ab  c d  "), "ab c d")
+        self.assertEqual(remove_dup_spaces("\nab  c   \n d \n"), "ab c\nd")
+
         # removing tonemarks
         self.assertEqual(remove_tonemark("จิ้น"), "จิน")
         self.assertEqual(remove_tonemark("เก๋า"), "เกา")

From 0be2ea6da01b4236342442d9878d9848a97db773 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Fri, 8 May 2020 09:32:28 +0100
Subject: [PATCH 10/10] Add docstrings

Rename remove_phantom() to remove_dangling()
---
 docs/api/util.rst | 2 +-
 pythainlp/util/__init__.py | 4 +-
 pythainlp/util/normalize.py | 137 ++++++++++++++++++++++++++++--------
 tests/test_util.py | 26 +++----
 4 files changed, 124 insertions(+), 45 deletions(-)

diff --git a/docs/api/util.rst b/docs/api/util.rst
index cb02e9e07..9a90e1c55 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -24,8 +24,8 @@ Modules
 .. autofunction:: num_to_thaiword
 .. autofunction:: rank
 .. autofunction:: reign_year_to_ad
+.. autofunction:: remove_dangling
 .. autofunction:: remove_dup_spaces
-.. autofunction:: remove_phantom
 .. autofunction:: remove_tonemark
 .. autofunction:: remove_zw
 .. autofunction:: thai_time
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index 1958e7e7f..d91ea38fa 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -22,8 +22,8 @@
     "num_to_thaiword",
     "rank",
     "reign_year_to_ad",
+    "remove_dangling",
     "remove_dup_spaces",
-    "remove_phantom",
     "remove_tonemark",
     "remove_zw",
     "text_to_arabic_digit",
@@ -56,8 +56,8 @@
 from pythainlp.util.normalize import (
     delete_tone,
     normalize,
+    remove_dangling,
     remove_dup_spaces,
-    remove_phantom,
     remove_tonemark,
     remove_zw,
 )
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index 02b363aff..92e27fee4 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -12,10 +12,10 @@ from pythainlp import thai_tonemarks as tonemarks
 
-_PHANTOM_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
-_RE_REMOVE_PHANTOMS = re.compile(f"^[{_PHANTOM_CHARS}]+")
+_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
+_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")
 
-_ZERO_WIDTH_CHARS = "\u200c\u200b"
+_ZERO_WIDTH_CHARS = "\u200b\u200c"  # ZWSP, ZWNJ
 
 _REORDER_PAIRS = [
     ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
     (
         f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
@@ -50,12 +50,36 @@ def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]
 
 
+def remove_dangling(text: str) -> str:
+    """
+    Remove Thai non-base characters at the beginning of text.
+
+    This is a common "typo", especially for an input field in a form,
+    as these non-base characters can be visually hidden from the user,
+    who may have accidentally typed them in.
+
+    A character to be removed should be both:
+
+    * tone mark, above vowel, below vowel, or non-base sign AND
+    * located at the beginning of the text
+
+    :param str text: input text
+    :return: text without dangling Thai characters at the beginning
+    :rtype: str
+    """
+    return _RE_REMOVE_DANGLINGS.sub("", text)
+
+
 def remove_dup_spaces(text: str) -> str:
     """
     Remove duplicate spaces. Replace multiple spaces with one space.
 
     Multiple newline characters and empty lines will be replaced
     with one newline character.
+
+    :param str text: input text
+    :return: text without duplicated spaces and newlines
+    :rtype: str
     """
     while "  " in text:
         text = text.replace("  ", " ")
@@ -64,26 +88,23 @@ def remove_dup_spaces(text: str) -> str:
     return text
 
 
-def remove_phantom(text: str) -> str:
-    """
-    Remove a char that may have been accidentally typed at the text beginning.
-    """
-    return _RE_REMOVE_PHANTOMS.sub("", text)
-
-
 def remove_tonemark(text: str) -> str:
     """
-    Remove all Thai tonemarks from the text.
+    Remove all Thai tone marks from the text.
 
-    There are 4 tonemarks indicating 4 tones as follows:
+    Thai script has four tone marks indicating four tones as follows:
 
     * Down tone (Thai: ไม้เอก _่ )
     * Falling tone (Thai: ไม้โท _้ )
     * High tone (Thai: ไม้ตรี ​_๊ )
     * Rising tone (Thai: ไม้จัตวา _๋ )
 
-    :param str text: text in Thai language
-    :return: text without Thai tonemarks
+    Putting a wrong tone mark is a common mistake in Thai writing.
+    By removing tone marks from the string, it could be used
+    for approximate string matching.
+
+    :param str text: input text
+    :return: text without Thai tone marks
     :rtype: str
 
     :Example:
@@ -103,22 +124,88 @@ def remove_tonemark(text: str) -> str:
 
 def remove_zw(text: str) -> str:
     """
     Remove zero-width characters.
+
+    These non-visible characters may cause unexpected results from the
+    user's point of view. Removing them can make string matching more robust.
+
+    Characters to be removed:
+
+    * Zero-width space (ZWSP)
+    * Zero-width non-joiner (ZWNJ)
+
+    :param str text: input text
+    :return: text without zero-width characters
+    :rtype: str
     """
     for ch in _ZERO_WIDTH_CHARS:
         while ch in text:
             text = text.replace(ch, "")
+
     return text
+
+
+def reorder_vowels(text: str) -> str:
+    """
+    Reorder vowels and tone marks to the standard logical order/spelling.
+
+    Characters in input text will be reordered/transformed,
+    according to these rules:
+
+    * Sara E + Sara E -> Sara Ae
+    * Nikhahit + Sara Aa -> Sara Am
+    * tone mark + non-base vowel -> non-base vowel + tone mark
+    * follow vowel + tone mark -> tone mark + follow vowel
+
+    :param str text: input text
+    :return: text with vowels and tone marks in the standard logical order
+    :rtype: str
+    """
+    for pair in _REORDER_PAIRS:
+        text = re.sub(pair[0], pair[1], text)
+
+    return text
+
+
+def remove_repeat_vowels(text: str) -> str:
+    """
+    Remove repeating vowels, tone marks, and signs.
+
+    This function will call reorder_vowels() first, to make sure that
+    double Sara E will be converted to Sara Ae and not be removed.
+
+    :param str text: input text
+    :return: text without repeating Thai vowels, tone marks, and signs
+    :rtype: str
+    """
+    text = reorder_vowels(text)
+    for pair in _NOREPEAT_PAIRS:
+        text = re.sub(pair[0], pair[1], text)
+
+    # remove repeating tone marks, use last tone mark
+    text = _RE_TONEMARKS.sub(_last_char, text)
+
+    return text
 
 
 def normalize(text: str) -> str:
     """
-    Normalize Thai text with normalizing rules as follows:
+    Normalize and clean Thai text with normalizing rules as follows:
 
-    * Remove redundant vowels and tonemarks
-    * Subsitute "เ" + "เ" with "แ"
+    * Remove zero-width spaces
+    * Remove duplicate spaces
+    * Reorder tone marks and vowels to standard order/spelling
+    * Remove duplicate vowels and signs
+    * Remove duplicate tone marks
+    * Remove dangling non-base characters at the beginning of text
 
-    :param str text: thai text to be normalized
-    :return: normalized Thai text according to the fules
+    normalize() simply calls remove_zw(), remove_dup_spaces(),
+    remove_repeat_vowels(), and remove_dangling(), in that order.
+
+    If a user wants to customize the selection or the order of rules
+    to be applied, they can choose to call those functions by themselves.
+
+    :param str text: input text
+    :return: normalized text according to the rules
+    :rtype: str
 
     :Example:
     ::
 
         normalize('เเปลก')
         # output: แปลก
 
         normalize('นานาาา')
         # output: นานา
     """
     text = remove_zw(text)
     text = remove_dup_spaces(text)
-
-    for pair in _REORDER_PAIRS:
-        text = re.sub(pair[0], pair[1], text)
-    for pair in _NOREPEAT_PAIRS:
-        text = re.sub(pair[0], pair[1], text)
-
-    # remove repeating tonemarks, use last tonemark
-    text = _RE_TONEMARKS.sub(_last_char, text)
-
-    text = remove_phantom(text)
+    text = remove_repeat_vowels(text)
+    text = remove_dangling(text)
 
     return text
diff --git a/tests/test_util.py b/tests/test_util.py
index 7cd6ca8e5..cdd020a5c 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -27,6 +27,7 @@
     num_to_thaiword,
     rank,
     reign_year_to_ad,
+    remove_dangling,
     remove_dup_spaces,
-    remove_phantom,
     remove_tonemark,
     remove_zw,
     text_to_arabic_digit,
@@ -281,24 +281,24 @@ def test_normalize(self):
         # sara e + sara e
         self.assertEqual(normalize("เเปลก"), "แปลก")
 
-        # consonant + follow vowel + tonemark
+        # consonant + follow vowel + tone mark
         self.assertEqual(normalize("\u0e01\u0e30\u0e48"), "\u0e01\u0e48\u0e30")
 
         # consonant + nikhahit + sara aa
         self.assertEqual(normalize("นํา"), "นำ")
         self.assertEqual(normalize("\u0e01\u0e4d\u0e32"), "\u0e01\u0e33")
 
-        # consonant + nikhahit + tonemark + sara aa
+        # consonant + nikhahit + tone mark + sara aa
         self.assertEqual(
             normalize("\u0e01\u0e4d\u0e48\u0e32"), "\u0e01\u0e48\u0e33"
         )
 
-        # consonant + tonemark + nikhahit + sara aa
+        # consonant + tone mark + nikhahit + sara aa
         self.assertEqual(
             normalize("\u0e01\u0e48\u0e4d\u0e32"), "\u0e01\u0e48\u0e33"
         )
 
-        # consonant + follow vowel + tonemark
+        # consonant + follow vowel + tone mark
         self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32")
 
         # repeating following vowels
@@ -306,26 +306,26 @@ def test_normalize(self):
         self.assertEqual(normalize("กา า า า"), "กา")
         self.assertEqual(normalize("กา าาะา"), "กาะา")
 
-        # repeating tonemarks
+        # repeating tone marks
         self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48")
 
-        # repeating different tonemarks
+        # repeating different tone marks
         self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49")
         self.assertEqual(
             normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49"
         )
 
-        # remove tonemark at the beginning of text
+        # remove tone mark at the beginning of text
         self.assertEqual(remove_dangling("\u0e48\u0e01"), "\u0e01")
         self.assertEqual(remove_dangling("\u0e48\u0e48\u0e01"), "\u0e01")
         self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01")
         self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")
 
         # remove duplicate spaces
         self.assertEqual(remove_dup_spaces("  ab  c d  "), "ab c d")
         self.assertEqual(remove_dup_spaces("\nab  c   \n d \n"), "ab c\nd")
 
-        # removing tonemarks
+        # removing tone marks
         self.assertEqual(remove_tonemark("จิ้น"), "จิน")
         self.assertEqual(remove_tonemark("เก๋า"), "เกา")