Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
.. autofunction:: dict_word_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: create_custom_dict_trie
.. autofunction:: dict_trie
5 changes: 4 additions & 1 deletion pythainlp/corpus/wordnet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-
"""
WordNet
NLTK WordNet wrapper

The API here is exactly the same as the NLTK API,
except that the lang (language) argument defaults to "tha" (Thai).
"""
import nltk

Expand Down
4 changes: 0 additions & 4 deletions pythainlp/number/numtoword.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,3 @@ def num_to_thaiword(number):
pos += 1

return ret


if __name__ == "__main__":
print(bahttext(4000.0))
17 changes: 7 additions & 10 deletions pythainlp/rank/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,15 @@


# เรียงจำนวนคำของประโยค
def rank(data, stopword=False):
def rank(words, stopword=False):
"""
Sort words by frequency
รับค่าเป็น ''list'' คืนค่าเป็น ''dict'' [(คำ, จำนวน), (คำ, จำนวน), ...]
รับค่าเป็น ''list'' คืนค่าเป็น ''Counter'' Counter({"คำ": จำนวน, "คำ": จำนวน})
"""
if stopword:
data = [word for word in data if word not in _STOPWORDS]

rankdata = Counter(data)

return rankdata
if not words:
return None

if stopword:
words = [word for word in words if word not in _STOPWORDS]

if __name__ == "__main__":
print(rank(["แมว", "ชอบ", "ปลา", "แมว", "ชอบ", "นอน", "คน", "เป็น", "ทาส", "แมว"]))
return Counter(words)
4 changes: 0 additions & 4 deletions pythainlp/romanization/royin.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,3 @@ def romanize(word):
word2 = "".join(word2)
word2 = _replace_consonants(word2, res)
return word2


if __name__ == "__main__":
print(romanize("กร") == romanize("กอน"))
5 changes: 0 additions & 5 deletions pythainlp/sentiment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,3 @@ def sentiment(text, engine="old"):
featurized_test_sentence = {i: (i in text) for i in vocabulary}

return classifier.classify(featurized_test_sentence)


if __name__ == "__main__":
text = "เสียใจแย่มากเลย"
print(sentiment(text))
15 changes: 11 additions & 4 deletions pythainlp/soundex/lk82.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
Thai soundex - lk82 system
Thai soundex - LK82 system

Python implementation: Korakot Chaovavanich
https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
Expand All @@ -23,22 +23,29 @@

def lk82(text):
"""
LK82 - It's a thai soundex rule.
LK82 - It's a Thai soundex rule.

:param str text: Thai word
:return: LK82 soundex
"""
res = []
if not text:
return ""

text = _RE_1.sub("", text) # 4.ลบวรรณยุกต์
text = _RE_2.sub("", text) # 4.ลบตัวการันต์
text = _RE_3.sub("", text) # 5.ทิ้งไม้ไต่คู่ ฯลฯ

if not text:
return ""

# 6.เข้ารหัสตัวแรก
res = []
if "ก" <= text[0] <= "ฮ":
res.append(text[0].translate(_TRANS1))
text = text[1:]
else:
res.append(text[1].translate(_TRANS1))
if len(text) > 1:
res.append(text[1].translate(_TRANS1))
res.append(text[0].translate(_TRANS2))
text = text[2:]

Expand Down
3 changes: 3 additions & 0 deletions pythainlp/soundex/metasound.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def metasound(text, length=4):
metasound("รักษ์") # 'ร100'
metasound("บูรณการ", 5) # 'บ5515'
"""
if not text:
return ""

# keep only consonants and thanthakhat
chars = []
for ch in text:
Expand Down
6 changes: 6 additions & 0 deletions pythainlp/soundex/udom83.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ def udom83(text):
:return: Udom83 soundex
"""

if not text:
return ""

text = _RE_1.sub("ัน\\1", text)
text = _RE_2.sub("ั\\1", text)
text = _RE_3.sub("ัน\\1", text)
Expand All @@ -49,6 +52,9 @@ def udom83(text):
text = _RE_10.sub("", text)
text = _RE_11.sub("", text)

if not text:
return ""

sd = text[0].translate(_TRANS1)
sd += text[1:].translate(_TRANS2)

Expand Down
16 changes: 8 additions & 8 deletions pythainlp/tokenize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,18 @@ def segment(text):
return segment(text)


def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
def dict_word_tokenize(text, custom_dict, engine="newmm"):
"""
:meth:`dict_word_tokenize` tokenizes text based on the dictionary you provide. The dictionary must be a trie data structure.
:param str text: text to be tokenized
:param dict custom_dict_trie: a dictionary trie
:param dict custom_dict: a dictionary trie
:param str engine: tokenization engine to use (newmm, longest)
:return: list of words
**Example**::
>>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
>>> listword = ["แมว", "ดี"]
>>> data_dict = create_custom_dict_trie(listword)
>>> dict_word_tokenize("แมวดีดีแมว", data_dict)
>>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
>>> words = ["แมว", "ดี"]
>>> trie = dict_trie(words)
>>> dict_word_tokenize("แมวดีดีแมว", trie)
['แมว', 'ดี', 'ดี', 'แมว']
"""
if engine == "newmm" or engine == "onecut":
Expand All @@ -82,7 +82,7 @@ def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
else: # default, use "newmm" engine
from .newmm import mmcut as segment

return segment(text, custom_dict_trie)
return segment(text, custom_dict)


def sent_tokenize(text, engine="whitespace+newline"):
Expand Down Expand Up @@ -126,7 +126,7 @@ def syllable_tokenize(text):
words = word_tokenize(text)
trie = dict_trie(dict_source=thai_syllables())
for word in words:
tokens.extend(dict_word_tokenize(text=word, custom_dict_trie=trie))
tokens.extend(dict_word_tokenize(text=word, custom_dict=trie))

return tokens

Expand Down
5 changes: 0 additions & 5 deletions pythainlp/tokenize/longest.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,3 @@ def segment(text, trie=None):
if not trie:
trie = DEFAULT_DICT_TRIE
return Tokenizer(trie).tokenize(text)


if __name__ == "__main__":
text = "รถประจำทาง ลำปาง-แม่เมาะ AB-2390-30"
print(segment(text))
7 changes: 0 additions & 7 deletions pythainlp/tokenize/multi_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,3 @@ def find_all_segment(text, trie=None):
"""
ww = list(multicut(text, trie=trie))
return list(combine(ww))


if __name__ == "__main__":
text = "ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด"
print(mmcut(text))
for one in find_all_segment(text):
print(one)
4 changes: 0 additions & 4 deletions pythainlp/tokenize/pyicu.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,3 @@ def _gen_words(text):
def segment(text):
text = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text)
return list(_gen_words(text))


if __name__ == "__main__":
print(segment("พูดไทย2คำEnglishคำ"))
5 changes: 0 additions & 5 deletions pythainlp/util/keyboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,3 @@ def thai_to_eng(text):
return "".join(
[TH_EN_KEYB_PAIRS[ch] if (ch in TH_EN_KEYB_PAIRS) else ch for ch in text]
)


if __name__ == "__main__":
print(eng_to_thai("l;ylfu8iy["))
print(thai_to_eng("นามรสนอำันี"))
20 changes: 19 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pythainlp.rank import rank
from pythainlp.romanization import romanize
from pythainlp.sentiment import sentiment
from pythainlp.soundex import lk82, metasound, udom83
from pythainlp.soundex import lk82, metasound, soundex, udom83
from pythainlp.spell import correct, spell
from pythainlp.summarize import summarize
from pythainlp.tag import pos_tag, pos_tag_sents
Expand Down Expand Up @@ -204,7 +204,9 @@ def test_number(self):
# ### pythainlp.rank

def test_rank(self):
self.assertEqual(rank([]), None)
self.assertEqual(rank(["แมว", "คน", "แมว"]), Counter({"แมว": 2, "คน": 1}))
self.assertIsNotNone(rank(["แมว", "คน", "แมว"], stopword=True))

# ### pythainlp.romanization

Expand All @@ -214,6 +216,7 @@ def test_romanization(self):

def test_romanization_royin(self):
engine = "royin"
self.assertIsNotNone(romanize("กก", engine=engine))
self.assertEqual(romanize("แมว", engine=engine), "maeo")
self.assertEqual(romanize("เดือน", engine=engine), "duean")
self.assertEqual(romanize("ดู", engine=engine), "du")
Expand All @@ -230,13 +233,26 @@ def test_sentiment(self):
# ### pythainlp.soundex

def test_soundex(self):
self.assertIsNotNone(soundex("a", engine="lk82"))
self.assertIsNotNone(soundex("a", engine="udom83"))
self.assertIsNotNone(soundex("a", engine="metasound"))
self.assertIsNotNone(soundex("a", engine="XXX"))

self.assertEqual(lk82("รถ"), "ร3000")
self.assertIsNotNone(lk82("เกาะ"))
self.assertIsNotNone(lk82("อุยกูร์"))
self.assertIsNotNone(lk82("หยากไย่"))
self.assertEqual(lk82(""), "")

self.assertEqual(udom83("รถ"), "ร800000")
self.assertEqual(udom83(None), "")

self.assertEqual(metasound("บูรณะ"), "บ550")
self.assertEqual(metasound("คน"), "ค500")
self.assertEqual(metasound("คนA"), "ค500")
self.assertEqual(metasound("ดา"), "ด000")
self.assertEqual(metasound("รักษ์"), metasound("รัก"))
self.assertEqual(metasound(""), "")

# ### pythainlp.spell

Expand Down Expand Up @@ -349,6 +365,7 @@ def test_deletetone(self):

def test_is_thai(self):
self.assertEqual(is_thai("ประเทศไทย"), {"thai": 100.0})
self.assertIsNotNone(is_thai("เผือก", check_all=True))

def test_is_thaichar(self):
self.assertEqual(is_thaichar("ก"), True)
Expand All @@ -362,6 +379,7 @@ def test_is_thaiword(self):

def test_normalize(self):
self.assertEqual(normalize("เเปลก"), "แปลก")
self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))

def test_keyboard(self):
self.assertEqual(eng_to_thai("l;ylfu8iy["), "สวัสดีครับ")
Expand Down