Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ Modules
.. autofunction:: num_to_thaiword
.. autofunction:: rank
.. autofunction:: reign_year_to_ad
.. autofunction:: remove_dangling
.. autofunction:: remove_dup_spaces
.. autofunction:: remove_tonemark
.. autofunction:: remove_zw
.. autofunction:: thai_time
.. autofunction:: text_to_arabic_digit
.. autofunction:: text_to_thai_digit
Expand Down
13 changes: 12 additions & 1 deletion pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
"num_to_thaiword",
"rank",
"reign_year_to_ad",
"remove_dangling",
"remove_dup_spaces",
"remove_tonemark",
"remove_zw",
"text_to_arabic_digit",
"text_to_thai_digit",
"thai_digit_to_arabic_digit",
Expand Down Expand Up @@ -49,7 +53,14 @@
)
from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
from pythainlp.util.keywords import find_keyword, rank
from pythainlp.util.normalize import delete_tone, normalize
from pythainlp.util.normalize import (
delete_tone,
normalize,
remove_dangling,
remove_dup_spaces,
remove_tonemark,
remove_zw,
)
from pythainlp.util.numtoword import bahttext, num_to_thaiword
from pythainlp.util.thai import countthai, isthai, isthaichar
from pythainlp.util.thaiwordcheck import is_native_thai
Expand Down
218 changes: 180 additions & 38 deletions pythainlp/util/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,75 +12,99 @@
from pythainlp import thai_tonemarks as tonemarks


# VOWELS + Phinthu,Thanthakhat, Nikhahit, Yamakkan
_NO_REPEAT_CHARS = (
f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
)
_NORMALIZE_REPETITION = list(
zip([ch + "+" for ch in _NO_REPEAT_CHARS], _NO_REPEAT_CHARS)
)
_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")

_NORMALIZE_REORDER = [
_ZERO_WIDTH_CHARS = "\u200b\u200c" # ZWSP, ZWNJ

_REORDER_PAIRS = [
("\u0e40\u0e40", "\u0e41"), # Sara E + Sara E -> Sara Ae
(
f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
"\\2\\1",
), # TONE/Thanthakhat+ + A/BVOWELV+ -> A/BVOWEL+ + TONE/Thanthakhat+
), # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat
(
f"\u0e4d([{tonemarks}]*)\u0e32",
"\\1\u0e33",
), # Nikhahit + TONEMARK* + Sara Aa -> TONEMARK* + Sara Am
), # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am
(
f"([{follow_v}]+)([{tonemarks}]+)",
"\\2\\1",
), # FOLLOWVOWEL+ + TONEMARK+ -> TONEMARK+ + FOLLOWVOWEL+
), # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
]

# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
_NOREPEAT_CHARS = (
    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
)
# (pattern, replacement) pairs used to collapse a repeated character into a
# single occurrence. The repetitions may be separated by spaces.
#
# Each pattern requires at least TWO occurrences of the same character:
# "(ch[ ]*)+ch". A pattern of the form "(ch[ ]*)+" would also match a lone
# character followed by spaces and silently delete the spaces after it,
# which removes legitimate word spacing.
_NOREPEAT_PAIRS = [(f"({ch}[ ]*)+{ch}", ch) for ch in _NOREPEAT_CHARS]

_RE_TONEMARKS = re.compile(f"[{tonemarks}]+")

def normalize(text: str) -> str:
_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")


def _last_char(matchobj): # to be used with _RE_NOREPEAT_TONEMARKS
return matchobj.group(0)[-1]


def remove_dangling(text: str) -> str:
    """
    Remove Thai non-base characters at the beginning of text.

    This is a common "typo", especially for an input field in a form,
    as these non-base characters can be visually hidden from a user
    who may have typed them in accidentally.

    A character is removed only when it is both:

    * a tone mark, above vowel, below vowel, or non-base sign, AND
    * located at the beginning of the text

    :param str text: input text
    :return: text without dangling Thai characters at the beginning
    :rtype: str
    """
    cleaned = _RE_REMOVE_DANGLINGS.sub("", text)
    return cleaned

normalize('สระะน้ำ')
# output: สระน้ำ

normalize('เเปลก')
# output: แปลก
def remove_dup_spaces(text: str) -> str:
    """
    Remove duplicate spaces. Replace multiple spaces with one space.

    Multiple newline characters and empty lines will be replaced
    with one newline character. Leading and trailing whitespace
    is stripped from the result.

    :param str text: input text
    :return: text without duplicated spaces and newlines
    :rtype: str
    """
    # Collapse every run of two or more spaces into a single space.
    # The explicit "{2,}" quantifier avoids a replace-loop whose search
    # and replacement strings could become identical (and never terminate).
    text = re.sub(" {2,}", " ", text)
    # Collapse newline runs, together with any spaces around them,
    # into a single newline.
    text = _RE_REMOVE_NEWLINES.sub("\n", text)
    return text.strip()


def delete_tone(text: str) -> str:
def remove_tonemark(text: str) -> str:
"""
This function removes Thai tonemarks from the text.
There are 4 tonemarks indicating 4 tones as follows:
Remove all Thai tone marks from the text.

Thai script has four tone marks indicating four tones as follows:

* Down tone (Thai: ไม้เอก _่ )
* Falling tone (Thai: ไม้โท _้ )
* High tone (Thai: ไม้ตรี ​_๊ )
* Rising tone (Thai: ไม้จัตวา _๋ )

:param str text: text in Thai language
:return: text without Thai tonemarks
Putting the wrong tone mark is a common mistake in Thai writing.
By removing tone marks from the string, it can be used
for approximate string matching.

:param str text: input text
:return: text without Thai tone marks
:rtype: str

:Example:
Expand All @@ -91,5 +115,123 @@ def delete_tone(text: str) -> str:
delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
# output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
"""
chars = [ch for ch in text if ch not in tonemarks]
return "".join(chars)
for ch in tonemarks:
while ch in text:
text = text.replace(ch, "")
return text


def remove_zw(text: str) -> str:
    """
    Remove zero-width characters.

    These non-visible characters may cause unexpected results from the
    user's point of view. Removing them can make string matching more robust.

    Characters to be removed:

    * Zero-width space (ZWSP)
    * Zero-width non-joiner (ZWNJ)

    :param str text: input text
    :return: text without zero-width characters
    :rtype: str
    """
    # str.replace() already replaces every occurrence, so a single call
    # per character is sufficient; no "while ch in text" loop is needed.
    for ch in _ZERO_WIDTH_CHARS:
        text = text.replace(ch, "")

    return text


def reorder_vowels(text: str) -> str:
    """
    Reorder vowels and tone marks to the standard logical order/spelling.

    Every (pattern, replacement) pair in ``_REORDER_PAIRS`` is applied
    to the text in sequence. The rules are:

    * Sara E + Sara E -> Sara Ae
    * Nikhahit + Sara Aa -> Sara Am
    * tone mark + non-base vowel -> non-base vowel + tone mark
    * follow vowel + tone mark -> tone mark + follow vowel

    :param str text: input text
    :return: text with vowels and tone marks in the standard logical order
    :rtype: str
    """
    for pattern, replacement in _REORDER_PAIRS:
        text = re.sub(pattern, replacement, text)

    return text


def remove_repeat_vowels(text: str) -> str:
    """
    Remove repeating vowels, tone marks, and signs.

    reorder_vowels() is called first, so that a double Sara E
    is converted to Sara Ae rather than being collapsed.

    :param str text: input text
    :return: text without repeating Thai vowels, tone marks, and signs
    :rtype: str
    """
    result = reorder_vowels(text)

    # collapse each repeated character to a single occurrence
    for pattern, char in _NOREPEAT_PAIRS:
        result = re.sub(pattern, char, result)

    # a run of tone marks is replaced by the last tone mark in the run
    return _RE_TONEMARKS.sub(_last_char, result)


def normalize(text: str) -> str:
    """
    Normalize and clean Thai text with normalizing rules as follows:

    * Remove zero-width spaces
    * Remove duplicate spaces
    * Reorder tone marks and vowels to standard order/spelling
    * Remove duplicate vowels and signs
    * Remove duplicate tone marks
    * Remove dangling non-base characters at the beginning of text

    normalize() simply calls remove_zw(), remove_dup_spaces(),
    remove_repeat_vowels(), and remove_dangling(), in that order.

    If a user wants to customize the selection or the order of rules
    to be applied, they can choose to call those functions by themselves.

    :param str text: input text
    :return: normalized text according to the rules
    :rtype: str

    :Example:
    ::

        from pythainlp.util import normalize

        normalize('สระะน้ำ')
        # output: สระน้ำ

        normalize('เเปลก')
        # output: แปลก

        normalize('นานาาา')
        # output: นานา
    """
    # the order matters: each step works on the output of the previous one
    steps = (
        remove_zw,
        remove_dup_spaces,
        remove_repeat_vowels,
        remove_dangling,
    )
    for step in steps:
        text = step(text)

    return text


def delete_tone(text: str) -> str:
    """
    DEPRECATED: Please use remove_tonemark().

    Kept as a thin alias; the call is forwarded to remove_tonemark()
    and its result is returned unchanged.

    :param str text: input text
    :return: result of remove_tonemark(text)
    :rtype: str
    """
    result = remove_tonemark(text)
    return result
51 changes: 42 additions & 9 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
bahttext,
collate,
countthai,
delete_tone,
dict_trie,
digit_to_text,
eng_to_thai,
Expand All @@ -28,6 +27,10 @@
num_to_thaiword,
rank,
reign_year_to_ad,
remove_dangling,
remove_dup_spaces,
remove_tonemark,
remove_zw,
text_to_arabic_digit,
text_to_thai_digit,
thai_day2datetime,
Expand Down Expand Up @@ -272,36 +275,66 @@ def test_trie(self):

# ### pythainlp.util.normalize

def test_delete_tone(self):
self.assertEqual(delete_tone("จิ้น"), "จิน")
self.assertEqual(delete_tone("เก๋า"), "เกา")

def test_normalize(self):
self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))

# sara e + sara e
self.assertEqual(normalize("เเปลก"), "แปลก")

# consonant + follow vowel + tonemark
# consonant + follow vowel + tone mark
self.assertEqual(normalize("\u0e01\u0e30\u0e48"), "\u0e01\u0e48\u0e30")

# consonant + nikhahit + sara aa
self.assertEqual(normalize("นํา"), "นำ")
self.assertEqual(normalize("\u0e01\u0e4d\u0e32"), "\u0e01\u0e33")

# consonant + nikhahit + tonemark + sara aa
# consonant + nikhahit + tone mark + sara aa
self.assertEqual(
normalize("\u0e01\u0e4d\u0e48\u0e32"), "\u0e01\u0e48\u0e33"
)

# consonant + tonemark + nikhahit + sara aa
# consonant + tone mark + nikhahit + sara aa
self.assertEqual(
normalize("\u0e01\u0e48\u0e4d\u0e32"), "\u0e01\u0e48\u0e33"
)

# consonant + follow vowel + tonemark
# consonant + follow vowel + tone mark
self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32")

# repeating following vowels
self.assertEqual(normalize("กาา"), "กา")
self.assertEqual(normalize("กา า า า"), "กา")
self.assertEqual(normalize("กา าาะา"), "กาะา")

# repeating tone marks
self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48")

# repeating different tone marks
self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49")
self.assertEqual(
normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49"
)

# remove tone mark at the beginning of text
self.assertEqual(remove_dangling("\u0e48\u0e01"), "\u0e01")
self.assertEqual(remove_dangling("\u0e48\u0e48\u0e01"), "\u0e01")
self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01")
self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")

# remove duplicate spaces
self.assertEqual(remove_dup_spaces(" ab c d "), "ab c d")
self.assertEqual(remove_dup_spaces("\nab c \n d \n"), "ab c\nd")

# removing tone marks
self.assertEqual(remove_tonemark("จิ้น"), "จิน")
self.assertEqual(remove_tonemark("เก๋า"), "เกา")

# removing zero width chars
self.assertEqual(remove_zw("กา\u200b"), "กา")
self.assertEqual(remove_zw("ก\u200cา"), "กา")
self.assertEqual(remove_zw("\u200bกา"), "กา")
self.assertEqual(remove_zw("กา\u200b\u200c\u200b"), "กา")

# ### pythainlp.util.thai

def test_countthai(self):
Expand Down