diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index c7ed5941e..f91e5a2da 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -9,4 +9,4 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
 .. autofunction:: dict_word_tokenize
 .. autofunction:: subword_tokenize
 .. autofunction:: sent_tokenize
-.. autofunction:: create_custom_dict_trie
+.. autofunction:: dict_trie
diff --git a/pythainlp/corpus/wordnet.py b/pythainlp/corpus/wordnet.py
index d3e23922f..07ec0f526 100644
--- a/pythainlp/corpus/wordnet.py
+++ b/pythainlp/corpus/wordnet.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 """
-WordNet
+NLTK WordNet wrapper
+
+The API here is exactly the same as the NLTK API,
+except that the lang (language) argument defaults to "tha" (Thai).
 """
 
 import nltk
diff --git a/pythainlp/number/numtoword.py b/pythainlp/number/numtoword.py
index 174b7a768..394984d70 100644
--- a/pythainlp/number/numtoword.py
+++ b/pythainlp/number/numtoword.py
@@ -95,7 +95,3 @@ def num_to_thaiword(number):
         pos += 1
 
     return ret
-
-
-if __name__ == "__main__":
-    print(bahttext(4000.0))
diff --git a/pythainlp/rank/__init__.py b/pythainlp/rank/__init__.py
index dbf5781b7..84a7a7271 100644
--- a/pythainlp/rank/__init__.py
+++ b/pythainlp/rank/__init__.py
@@ -7,18 +7,15 @@
 
 
 # เรียงจำนวนคำของประโยค
-def rank(data, stopword=False):
+def rank(words, stopword=False):
     """
     Sort words by frequency
-    รับค่าเป็น ''list'' คืนค่าเป็น ''dict'' [(คำ, จำนวน), (คำ, จำนวน), ...]
+    Takes a ''list'', returns a ''Counter'': Counter({"คำ": จำนวน, "คำ": จำนวน})
     """
-    if stopword:
-        data = [word for word in data if word not in _STOPWORDS]
-
-    rankdata = Counter(data)
-
-    return rankdata
+    if not words:
+        return None
 
+    if stopword:
+        words = [word for word in words if word not in _STOPWORDS]
 
-if __name__ == "__main__":
-    print(rank(["แมว", "ชอบ", "ปลา", "แมว", "ชอบ", "นอน", "คน", "เป็น", "ทาส", "แมว"]))
+    return Counter(words)
diff --git a/pythainlp/romanization/royin.py b/pythainlp/romanization/royin.py
index 87776b8e2..69a3671d9 100644
--- a/pythainlp/romanization/royin.py
+++ b/pythainlp/romanization/royin.py
@@ -177,7 +177,3 @@ def romanize(word):
     word2 = "".join(word2)
     word2 = _replace_consonants(word2, res)
     return word2
-
-
-if __name__ == "__main__":
-    print(romanize("กร") == romanize("กอน"))
diff --git a/pythainlp/sentiment/__init__.py b/pythainlp/sentiment/__init__.py
index c2b018382..d0af157be 100644
--- a/pythainlp/sentiment/__init__.py
+++ b/pythainlp/sentiment/__init__.py
@@ -49,8 +49,3 @@ def sentiment(text, engine="old"):
         featurized_test_sentence = {i: (i in text) for i in vocabulary}
 
     return classifier.classify(featurized_test_sentence)
-
-
-if __name__ == "__main__":
-    text = "เสียใจแย่มากเลย"
-    print(sentiment(text))
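The rank() hunk above changes the function's contract: empty input now returns None instead of an empty Counter, and the return type is pinned to collections.Counter. A minimal usage sketch, grounded only in the new code in this patch (the word lists are illustrative):

```python
from pythainlp.rank import rank

# rank() now returns a collections.Counter, or None for an empty list.
freq = rank(["แมว", "ชอบ", "ปลา", "แมว", "ชอบ", "นอน"])
print(freq.most_common(1))  # [('แมว', 2)]
print(freq["แมว"])          # 2 -- dict-style access comes with Counter
print(rank([]))             # None -- new behavior; callers must guard for it
```

Returning None rather than an empty Counter is a breaking change for callers that immediately iterate or index the result; the new rank([]) assertion in tests/__init__.py below pins that behavior down.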
diff --git a/pythainlp/soundex/lk82.py b/pythainlp/soundex/lk82.py
index e9a401b04..f7b21a764 100644
--- a/pythainlp/soundex/lk82.py
+++ b/pythainlp/soundex/lk82.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Thai soundex - lk82 system
+Thai soundex - LK82 system
 
 Python implementation: Korakot Chaovavanich
 https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
@@ -23,22 +23,29 @@
 def lk82(text):
     """
-    LK82 - It's a thai soundex rule.
+    LK82 - It's a Thai soundex rule.
 
     :param str text: Thai word
     :return: LK82 soundex
     """
-    res = []
+    if not text:
+        return ""
+
     text = _RE_1.sub("", text)  # 4.ลบวรรณยุกต์
     text = _RE_2.sub("", text)  # 4.ลบตัวการันต์
     text = _RE_3.sub("", text)  # 5.ทิ้งไม้ไต่คู่ ฯลฯ
 
+    if not text:
+        return ""
+
     # 6.เข้ารหัสตัวแรก
+    res = []
     if "ก" <= text[0] <= "ฮ":
         res.append(text[0].translate(_TRANS1))
         text = text[1:]
     else:
-        res.append(text[1].translate(_TRANS1))
+        if len(text) > 1:
+            res.append(text[1].translate(_TRANS1))
         res.append(text[0].translate(_TRANS2))
         text = text[2:]
diff --git a/pythainlp/soundex/metasound.py b/pythainlp/soundex/metasound.py
index 0a6a9e4ba..91f813b13 100644
--- a/pythainlp/soundex/metasound.py
+++ b/pythainlp/soundex/metasound.py
@@ -34,6 +34,9 @@ def metasound(text, length=4):
         metasound("รักษ์")  # 'ร100'
         metasound("บูรณการ", 5))  # 'บ5515'
     """
+    if not text:
+        return ""
+
     # keep only consonants and thanthakhat
     chars = []
     for ch in text:
diff --git a/pythainlp/soundex/udom83.py b/pythainlp/soundex/udom83.py
index 8d4f464ff..bf7ec5bba 100644
--- a/pythainlp/soundex/udom83.py
+++ b/pythainlp/soundex/udom83.py
@@ -37,6 +37,9 @@ def udom83(text):
     :return: Udom83 soundex
     """
 
+    if not text:
+        return ""
+
     text = _RE_1.sub("ัน\\1", text)
     text = _RE_2.sub("ั\\1", text)
     text = _RE_3.sub("ัน\\1", text)
@@ -49,6 +52,9 @@
     text = _RE_10.sub("", text)
     text = _RE_11.sub("", text)
 
+    if not text:
+        return ""
+
     sd = text[0].translate(_TRANS1)
     sd += text[1:].translate(_TRANS2)
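All three soundex modules gain the same guard: empty (or otherwise falsy) input returns "" instead of raising IndexError at text[0], and lk82 additionally survives strings whose second character does not exist. A quick sanity-check sketch; the expected values are taken from the updated tests in tests/__init__.py below:

```python
from pythainlp.soundex import lk82, metasound, udom83

# Empty input now short-circuits to "" in every algorithm; the guard also
# covers None, per the new udom83(None) test.
for func in (lk82, metasound, udom83):
    assert func("") == ""

print(lk82("รถ"))       # 'ร3000'
print(udom83("รถ"))     # 'ร800000'
print(metasound("คน"))  # 'ค500'
```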
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 804d11e23..e81c3214d 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -59,18 +59,18 @@ def segment(text):
     return segment(text)
 
-def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
+def dict_word_tokenize(text, custom_dict, engine="newmm"):
     """
     :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.
 
     :param str text: text to be tokenized
-    :param dict custom_dict_trie: a dictionary trie
+    :param dict custom_dict: a dictionary trie
     :param str engine: choose between different options of engine to token (newmm, longest)
     :return: list of words
     **Example**::
-        >>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
-        >>> listword = ["แมว", "ดี"]
-        >>> data_dict = create_custom_dict_trie(listword)
-        >>> dict_word_tokenize("แมวดีดีแมว", data_dict)
+        >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
+        >>> words = ["แมว", "ดี"]
+        >>> trie = dict_trie(words)
+        >>> dict_word_tokenize("แมวดีดีแมว", trie)
         ['แมว', 'ดี', 'ดี', 'แมว']
     """
     if engine == "newmm" or engine == "onecut":
@@ -82,7 +82,7 @@ def dict_word_tokenize(text, custom_dict, engine="newmm"):
     else:  # default, use "newmm" engine
         from .newmm import mmcut as segment
 
-    return segment(text, custom_dict_trie)
+    return segment(text, custom_dict)
 
 
 def sent_tokenize(text, engine="whitespace+newline"):
@@ -126,7 +126,7 @@ def syllable_tokenize(text):
     words = word_tokenize(text)
     trie = dict_trie(dict_source=thai_syllables())
     for word in words:
-        tokens.extend(dict_word_tokenize(text=word, custom_dict_trie=trie))
+        tokens.extend(dict_word_tokenize(text=word, custom_dict=trie))
 
     return tokens
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index 1b50e41cb..483685da2 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -138,8 +138,3 @@ def segment(text, trie=None):
     if not trie:
         trie = DEFAULT_DICT_TRIE
     return Tokenizer(trie).tokenize(text)
-
-
-if __name__ == "__main__":
-    text = "รถประจำทาง ลำปาง-แม่เมาะ AB-2390-30"
-    print(segment(text))
diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 0b7e115f4..80f621c27 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -128,10 +128,3 @@ def find_all_segment(text, trie=None):
     """
     ww = list(multicut(text, trie=trie))
     return list(combine(ww))
-
-
-if __name__ == "__main__":
-    text = "ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด"
-    print(mmcut(text))
-    for one in find_all_segment(text):
-        print(one)
diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py
index 4a02b9f4e..9a2ffa581 100644
--- a/pythainlp/tokenize/pyicu.py
+++ b/pythainlp/tokenize/pyicu.py
@@ -28,7 +28,3 @@ def _gen_words(text):
 def segment(text):
     text = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text)
     return list(_gen_words(text))
-
-
-if __name__ == "__main__":
-    print(segment("พูดไทย2คำEnglishคำ"))
diff --git a/pythainlp/util/keyboard.py b/pythainlp/util/keyboard.py
index 7cdbe2aa0..9615d53a1 100644
--- a/pythainlp/util/keyboard.py
+++ b/pythainlp/util/keyboard.py
@@ -121,8 +121,3 @@ def thai_to_eng(text):
     return "".join(
         [TH_EN_KEYB_PAIRS[ch] if (ch in TH_EN_KEYB_PAIRS) else ch for ch in text]
     )
-
-
-if __name__ == "__main__":
-    print(eng_to_thai("l;ylfu8iy["))
-    print(thai_to_eng("นามรสนอำันี"))
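The tokenize changes above are a mechanical but API-breaking rename: create_custom_dict_trie becomes dict_trie, and dict_word_tokenize's second parameter becomes custom_dict. A migration sketch mirroring the updated docstring example:

```python
from pythainlp.tokenize import dict_trie, dict_word_tokenize

# Build a trie from a custom word list, then tokenize against it.
trie = dict_trie(["แมว", "ดี"])
print(dict_word_tokenize("แมวดีดีแมว", custom_dict=trie))
# ['แมว', 'ดี', 'ดี', 'แมว']
```

Passing custom_dict by keyword (as syllable_tokenize now does internally) makes the rename fail loudly at call sites that still use the old name.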
diff --git a/tests/__init__.py b/tests/__init__.py
index 4cae93fd6..d80cc21d0 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -34,7 +34,7 @@
 from pythainlp.rank import rank
 from pythainlp.romanization import romanize
 from pythainlp.sentiment import sentiment
-from pythainlp.soundex import lk82, metasound, udom83
+from pythainlp.soundex import lk82, metasound, soundex, udom83
 from pythainlp.spell import correct, spell
 from pythainlp.summarize import summarize
 from pythainlp.tag import pos_tag, pos_tag_sents
@@ -204,7 +204,9 @@ def test_number(self):
 
     # ### pythainlp.rank
     def test_rank(self):
+        self.assertEqual(rank([]), None)
         self.assertEqual(rank(["แมว", "คน", "แมว"]), Counter({"แมว": 2, "คน": 1}))
+        self.assertIsNotNone(rank(["แมว", "คน", "แมว"], stopword=True))
 
     # ### pythainlp.romanization
@@ -214,6 +216,7 @@ def test_romanization(self):
 
     def test_romanization_royin(self):
         engine = "royin"
+        self.assertIsNotNone(romanize("กก", engine=engine))
         self.assertEqual(romanize("แมว", engine=engine), "maeo")
         self.assertEqual(romanize("เดือน", engine=engine), "duean")
         self.assertEqual(romanize("ดู", engine=engine), "du")
@@ -230,13 +233,26 @@ def test_sentiment(self):
 
     # ### pythainlp.soundex
     def test_soundex(self):
+        self.assertIsNotNone(soundex("a", engine="lk82"))
+        self.assertIsNotNone(soundex("a", engine="udom83"))
+        self.assertIsNotNone(soundex("a", engine="metasound"))
+        self.assertIsNotNone(soundex("a", engine="XXX"))
+
         self.assertEqual(lk82("รถ"), "ร3000")
+        self.assertIsNotNone(lk82("เกาะ"))
+        self.assertIsNotNone(lk82("อุยกูร์"))
+        self.assertIsNotNone(lk82("หยากไย่"))
+        self.assertEqual(lk82(""), "")
+
         self.assertEqual(udom83("รถ"), "ร800000")
+        self.assertEqual(udom83(None), "")
+
         self.assertEqual(metasound("บูรณะ"), "บ550")
         self.assertEqual(metasound("คน"), "ค500")
         self.assertEqual(metasound("คนA"), "ค500")
         self.assertEqual(metasound("ดา"), "ด000")
         self.assertEqual(metasound("รักษ์"), metasound("รัก"))
+        self.assertEqual(metasound(""), "")
 
     # ### pythainlp.spell
@@ -349,6 +365,7 @@ def test_deletetone(self):
 
     def test_is_thai(self):
         self.assertEqual(is_thai("ประเทศไทย"), {"thai": 100.0})
+        self.assertIsNotNone(is_thai("เผือก", check_all=True))
 
     def test_is_thaichar(self):
         self.assertEqual(is_thaichar("ก"), True)
@@ -362,6 +379,7 @@ def test_is_thaiword(self):
 
     def test_normalize(self):
         self.assertEqual(normalize("เเปลก"), "แปลก")
+        self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))
 
     def test_keyboard(self):
         self.assertEqual(eng_to_thai("l;ylfu8iy["), "สวัสดีครับ")
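The test diff also starts exercising the soundex() dispatcher, now imported alongside the concrete algorithms. A hedged sketch of how it is called, based only on what the assertions above exercise — note the tests merely guarantee a non-None result, including for the unknown engine name "XXX", which suggests a fallback to a default engine rather than an exception:

```python
from pythainlp.soundex import soundex

# engine selects the algorithm by name; per the tests, an unrecognized
# name still yields a result (behavior the tests deliberately leave loose).
for engine in ("lk82", "udom83", "metasound"):
    print(engine, soundex("รถ", engine=engine))
```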