Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
.. autofunction:: dict_word_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: create_custom_dict_trie
.. autofunction:: dict_trie
5 changes: 4 additions & 1 deletion pythainlp/corpus/wordnet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-
"""
WordNet
NLTK WordNet wrapper

The API here is exactly the same as the NLTK API,
except that the lang (language) argument defaults to "tha" (Thai).
"""
import nltk

Expand Down
4 changes: 0 additions & 4 deletions pythainlp/number/numtoword.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,3 @@ def num_to_thaiword(number):
pos += 1

return ret


if __name__ == "__main__":
print(bahttext(4000.0))
17 changes: 7 additions & 10 deletions pythainlp/rank/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,15 @@


# เรียงจำนวนคำของประโยค
def rank(data, stopword=False):
def rank(words, stopword=False):
"""
Sort words by frequency
รับค่าเป็น ''list'' คืนค่าเป็น ''dict'' [(คำ, จำนวน), (คำ, จำนวน), ...]
รับค่าเป็น ''list'' คืนค่าเป็น ''Counter'' Counter({"คำ": จำนวน, "คำ": จำนวน})
"""
if stopword:
data = [word for word in data if word not in _STOPWORDS]

rankdata = Counter(data)

return rankdata
if not words:
return None

if stopword:
words = [word for word in words if word not in _STOPWORDS]

if __name__ == "__main__":
print(rank(["แมว", "ชอบ", "ปลา", "แมว", "ชอบ", "นอน", "คน", "เป็น", "ทาส", "แมว"]))
return Counter(words)
4 changes: 0 additions & 4 deletions pythainlp/romanization/royin.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,3 @@ def romanize(word):
word2 = "".join(word2)
word2 = _replace_consonants(word2, res)
return word2


if __name__ == "__main__":
print(romanize("กร") == romanize("กอน"))
5 changes: 0 additions & 5 deletions pythainlp/sentiment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,3 @@ def sentiment(text, engine="old"):
featurized_test_sentence = {i: (i in text) for i in vocabulary}

return classifier.classify(featurized_test_sentence)


if __name__ == "__main__":
text = "เสียใจแย่มากเลย"
print(sentiment(text))
15 changes: 11 additions & 4 deletions pythainlp/soundex/lk82.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
Thai soundex - lk82 system
Thai soundex - LK82 system

Python implementation: Korakot Chaovavanich
https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
Expand All @@ -23,22 +23,29 @@

def lk82(text):
"""
LK82 - It's a thai soundex rule.
LK82 - It's a Thai soundex rule.

:param str text: Thai word
:return: LK82 soundex
"""
res = []
if not text:
return ""

text = _RE_1.sub("", text) # 4.ลบวรรณยุกต์
text = _RE_2.sub("", text) # 4.ลบตัวการันต์
text = _RE_3.sub("", text) # 5.ทิ้งไม้ไต่คู่ ฯลฯ

if not text:
return ""

# 6.เข้ารหัสตัวแรก
res = []
if "ก" <= text[0] <= "ฮ":
res.append(text[0].translate(_TRANS1))
text = text[1:]
else:
res.append(text[1].translate(_TRANS1))
if len(text) > 1:
res.append(text[1].translate(_TRANS1))
res.append(text[0].translate(_TRANS2))
text = text[2:]

Expand Down
3 changes: 3 additions & 0 deletions pythainlp/soundex/metasound.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def metasound(text, length=4):
metasound("รักษ์") # 'ร100'
metasound("บูรณการ", 5) # 'บ5515'
"""
if not text:
return ""

# keep only consonants and thanthakhat
chars = []
for ch in text:
Expand Down
6 changes: 6 additions & 0 deletions pythainlp/soundex/udom83.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ def udom83(text):
:return: Udom83 soundex
"""

if not text:
return ""

text = _RE_1.sub("ัน\\1", text)
text = _RE_2.sub("ั\\1", text)
text = _RE_3.sub("ัน\\1", text)
Expand All @@ -49,6 +52,9 @@ def udom83(text):
text = _RE_10.sub("", text)
text = _RE_11.sub("", text)

if not text:
return ""

sd = text[0].translate(_TRANS1)
sd += text[1:].translate(_TRANS2)

Expand Down
16 changes: 8 additions & 8 deletions pythainlp/tokenize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,18 @@ def segment(text):
return segment(text)


def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
def dict_word_tokenize(text, custom_dict, engine="newmm"):
"""
:meth:`dict_word_tokenize` tokenizes text based on the dictionary you provide. The dictionary must be a trie data structure.
:param str text: text to be tokenized
:param dict custom_dict_trie: a dictionary trie
:param dict custom_dict: a dictionary trie
:param str engine: tokenization engine to use (newmm, longest)
:return: list of words
**Example**::
>>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
>>> listword = ["แมว", "ดี"]
>>> data_dict = create_custom_dict_trie(listword)
>>> dict_word_tokenize("แมวดีดีแมว", data_dict)
>>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
>>> words = ["แมว", "ดี"]
>>> trie = dict_trie(words)
>>> dict_word_tokenize("แมวดีดีแมว", trie)
['แมว', 'ดี', 'ดี', 'แมว']
"""
if engine == "newmm" or engine == "onecut":
Expand All @@ -82,7 +82,7 @@ def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
else: # default, use "newmm" engine
from .newmm import mmcut as segment

return segment(text, custom_dict_trie)
return segment(text, custom_dict)


def sent_tokenize(text, engine="whitespace+newline"):
Expand Down Expand Up @@ -126,7 +126,7 @@ def syllable_tokenize(text):
words = word_tokenize(text)
trie = dict_trie(dict_source=thai_syllables())
for word in words:
tokens.extend(dict_word_tokenize(text=word, custom_dict_trie=trie))
tokens.extend(dict_word_tokenize(text=word, custom_dict=trie))

return tokens

Expand Down
5 changes: 0 additions & 5 deletions pythainlp/tokenize/longest.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,3 @@ def segment(text, trie=None):
if not trie:
trie = DEFAULT_DICT_TRIE
return Tokenizer(trie).tokenize(text)


if __name__ == "__main__":
text = "รถประจำทาง ลำปาง-แม่เมาะ AB-2390-30"
print(segment(text))
7 changes: 0 additions & 7 deletions pythainlp/tokenize/multi_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,3 @@ def find_all_segment(text, trie=None):
"""
ww = list(multicut(text, trie=trie))
return list(combine(ww))


if __name__ == "__main__":
text = "ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด"
print(mmcut(text))
for one in find_all_segment(text):
print(one)
4 changes: 0 additions & 4 deletions pythainlp/tokenize/pyicu.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,3 @@ def _gen_words(text):
def segment(text):
text = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text)
return list(_gen_words(text))


if __name__ == "__main__":
print(segment("พูดไทย2คำEnglishคำ"))
5 changes: 0 additions & 5 deletions pythainlp/util/keyboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,3 @@ def thai_to_eng(text):
return "".join(
[TH_EN_KEYB_PAIRS[ch] if (ch in TH_EN_KEYB_PAIRS) else ch for ch in text]
)


if __name__ == "__main__":
print(eng_to_thai("l;ylfu8iy["))
print(thai_to_eng("นามรสนอำันี"))
20 changes: 19 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pythainlp.rank import rank
from pythainlp.romanization import romanize
from pythainlp.sentiment import sentiment
from pythainlp.soundex import lk82, metasound, udom83
from pythainlp.soundex import lk82, metasound, soundex, udom83
from pythainlp.spell import correct, spell
from pythainlp.summarize import summarize
from pythainlp.tag import pos_tag, pos_tag_sents
Expand Down Expand Up @@ -204,7 +204,9 @@ def test_number(self):
# ### pythainlp.rank

def test_rank(self):
self.assertEqual(rank([]), None)
self.assertEqual(rank(["แมว", "คน", "แมว"]), Counter({"แมว": 2, "คน": 1}))
self.assertIsNotNone(rank(["แมว", "คน", "แมว"], stopword=True))

# ### pythainlp.romanization

Expand All @@ -214,6 +216,7 @@ def test_romanization(self):

def test_romanization_royin(self):
engine = "royin"
self.assertIsNotNone(romanize("กก", engine=engine))
self.assertEqual(romanize("แมว", engine=engine), "maeo")
self.assertEqual(romanize("เดือน", engine=engine), "duean")
self.assertEqual(romanize("ดู", engine=engine), "du")
Expand All @@ -230,13 +233,26 @@ def test_sentiment(self):
# ### pythainlp.soundex

def test_soundex(self):
self.assertIsNotNone(soundex("a", engine="lk82"))
self.assertIsNotNone(soundex("a", engine="udom83"))
self.assertIsNotNone(soundex("a", engine="metasound"))
self.assertIsNotNone(soundex("a", engine="XXX"))

self.assertEqual(lk82("รถ"), "ร3000")
self.assertIsNotNone(lk82("เกาะ"))
self.assertIsNotNone(lk82("อุยกูร์"))
self.assertIsNotNone(lk82("หยากไย่"))
self.assertEqual(lk82(""), "")

self.assertEqual(udom83("รถ"), "ร800000")
self.assertEqual(udom83(None), "")

self.assertEqual(metasound("บูรณะ"), "บ550")
self.assertEqual(metasound("คน"), "ค500")
self.assertEqual(metasound("คนA"), "ค500")
self.assertEqual(metasound("ดา"), "ด000")
self.assertEqual(metasound("รักษ์"), metasound("รัก"))
self.assertEqual(metasound(""), "")

# ### pythainlp.spell

Expand Down Expand Up @@ -349,6 +365,7 @@ def test_deletetone(self):

def test_is_thai(self):
self.assertEqual(is_thai("ประเทศไทย"), {"thai": 100.0})
self.assertIsNotNone(is_thai("เผือก", check_all=True))

def test_is_thaichar(self):
self.assertEqual(is_thaichar("ก"), True)
Expand All @@ -362,6 +379,7 @@ def test_is_thaiword(self):

def test_normalize(self):
self.assertEqual(normalize("เเปลก"), "แปลก")
self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))

def test_keyboard(self):
self.assertEqual(eng_to_thai("l;ylfu8iy["), "สวัสดีครับ")
Expand Down