1 change: 0 additions & 1 deletion docs/conf.py
@@ -16,7 +16,6 @@
# import sys
# sys.path.insert(0, os.path.abspath('.'))
from datetime import datetime
import sys, os

# -- Project information -----------------------------------------------------

239 changes: 119 additions & 120 deletions docs/pythainlp-dev-thai.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/tokenize.py
@@ -20,5 +20,5 @@
print(word_tokenize(text2))
# ['กฎหมายแรงงาน']

print(word_tokenize(text2, engine="longest-matching"))
print(word_tokenize(text2, engine="longest"))
# ['กฎหมาย', 'แรงงาน']
64 changes: 0 additions & 64 deletions pythainlp/MetaSound.py

This file was deleted.

2 changes: 1 addition & 1 deletion pythainlp/__init__.py
@@ -3,7 +3,7 @@
from pythainlp.collation import collation
from pythainlp.date import now
from pythainlp.keywords import find_keyword
from pythainlp.MetaSound import MetaSound
from pythainlp.metasound import metasound
from pythainlp.rank import rank
from pythainlp.romanization import romanize
from pythainlp.sentiment import sentiment
98 changes: 98 additions & 0 deletions pythainlp/metasound.py
@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
"""
MetaSound - Thai soundex system

References:
Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
Ontology for Analysing Names Given in Accordance with Thai Astrology.
https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
"""

_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์"
_THANTHAKHAT = "์" # \u0e4c
_C1 = "กขฃคฆฅ" # sound K -> coded letter 1
_C2 = "จฉชฌซฐทฒดฎตสศษ" # D -> 2
_C3 = "ฟฝพผภบป" # B -> 3
_C4 = "ง" # NG -> 4
_C5 = "ลฬรนณฦญ" # N -> 5
_C6 = "ม" # M -> 6
_C7 = "ย" # Y -> 7
_C8 = "ว" # W -> 8


def metasound(text, length=4):
"""
Thai MetaSound

:param str text: Thai text
:param int length: preferred length of the MetaSound (default is 4)
:return: MetaSound for the text
**Example**::
from pythainlp.metasound import metasound
metasound("ลัก") # 'ล100'
metasound("รัก") # 'ร100'
metasound("รักษ์") # 'ร100'
metasound("บูรณการ", 5)) # 'บ5515'
"""
    # keep only consonants and thanthakhat
    chars = []
    for ch in text:
        if ch in _CONS_THANTHAKHAT:
            chars.append(ch)

    # remove karan (thanthakhat and a consonant before it)
    i = 0
    while i < len(chars):
        if chars[i] == _THANTHAKHAT:
            if i > 0:
                chars[i - 1] = " "
            chars[i] = " "
        i += 1

    # retain first consonant, encode the rest
    chars = chars[:length]
    i = 1
    while i < len(chars):
        if chars[i] in _C1:
            chars[i] = "1"
        elif chars[i] in _C2:
            chars[i] = "2"
        elif chars[i] in _C3:
            chars[i] = "3"
        elif chars[i] in _C4:
            chars[i] = "4"
        elif chars[i] in _C5:
            chars[i] = "5"
        elif chars[i] in _C6:
            chars[i] = "6"
        elif chars[i] in _C7:
            chars[i] = "7"
        elif chars[i] in _C8:
            chars[i] = "8"
        else:
            chars[i] = "0"
        i += 1

    while len(chars) < length:
        chars.append("0")

    return "".join(chars)


if __name__ == "__main__":
    print(metasound("บูรณะ")) # บ550 (an example from the original paper [Figure 4])
    print(metasound("บูรณการ", 5)) # บ5515
    print(metasound("ลักษณะ")) # ล125
    print(metasound("ลัก")) # ล100
    print(metasound("รัก")) # ร100
    print(metasound("รักษ์")) # ร100
    print(metasound("")) # 0000

    print(metasound("คน"))
    print(metasound("คนA"))
    print(metasound("ดา"))
    print(metasound("ปา"))
    print(metasound("งา"))
    print(metasound("ลา"))
    print(metasound("มา"))
    print(metasound("วา"))
2 changes: 0 additions & 2 deletions pythainlp/romanization/pyicu.py
@@ -1,7 +1,5 @@
# -*- coding: utf-8 -*-

import sys

try:
import icu
except ImportError:
4 changes: 2 additions & 2 deletions pythainlp/sentiment/ulmfit_sent.py
@@ -3,7 +3,6 @@
Sentiment analyzer based on thai2vec ("ulmfit" engine)
Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
"""
import sys
from collections import defaultdict

from pythainlp.corpus import download, get_file
@@ -85,7 +84,8 @@ def about():
return """
Sentiment analyzer based on thai2vec
Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge.
89% accuracy based on 15% validation set compared to 72% of fastText and 52% most-frequent-class baseline.
89% accuracy based on 15% validation set compared to
72% of fastText and 52% most-frequent-class baseline.

Development: Charin Polpanumas
GitHub: https://github.com/cstorm125/thai2vec
7 changes: 3 additions & 4 deletions pythainlp/tag/__init__.py
@@ -2,9 +2,8 @@
"""
Part-Of-Speech tagger
"""
import sys

ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
_ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"


def pos_tag(words, engine="unigram", corpus="orchid"):
@@ -31,11 +30,11 @@ def _tag(text, corpus=None):
except ImportError:
from pythainlp.tools import install_package

install_package(ARTAGGER_URL)
install_package(_ARTAGGER_URL)
try:
from artagger import Tagger
except ImportError:
raise ImportError("Error: Try 'pip install " + ARTAGGER_URL + "'")
raise ImportError("Error: Try 'pip install " + _ARTAGGER_URL + "'")

words = Tagger().tag(" ".join(text))

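A minimal, hedged usage sketch for pos_tag as declared above; the sentence is illustrative only, and pos_tag expects a list of already-tokenized words.

from pythainlp.tag import pos_tag

words = ["ผม", "รัก", "ภาษา", "ไทย"]  # hypothetical tokenized input
print(pos_tag(words, engine="unigram", corpus="orchid"))
# expected shape: a list of (word, tag) tuples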
2 changes: 1 addition & 1 deletion pythainlp/tag/perceptron.py
@@ -24,7 +24,7 @@ def pud_data():
return model


def tag(text, corpus):
def tag(text, corpus="pud"):
"""
Accepts a ''list'' of words and returns a ''list'' of tuples, e.g. [('ข้อความ', 'ชนิดคำ')]"""
if corpus == "orchid":
88 changes: 48 additions & 40 deletions pythainlp/tokenize/__init__.py
@@ -16,42 +16,48 @@

def word_tokenize(text, engine="newmm", whitespaces=True):
"""
:param str text: the text to be tokenized
:param str engine: the engine to tokenize text
:param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai.
:param str text: text to be tokenized
:param str engine: tokenizer to be used
:param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai
:Parameters for engine:
* newmm - Maximum Matching algorithm + TCC
* icu - IBM ICU
* longest-matching - Longest matching
* mm - Maximum Matching algorithm
* pylexto - LexTo
* deepcut - Deep Neural Network
* wordcutpy - wordcutpy (https://github.com/veer66/wordcutpy)
:return: A list of words, tokenized from a text
* newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
* longest - dictionary-based, Longest Matching
* icu - wrapper for ICU, dictionary-based
* wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
* pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
* deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
* ulmfit - use newmm engine with a specific dictionary for use with thai2vec
:return: list of words, tokenized from the text

**Example**::
from pythainlp.tokenize import word_tokenize
text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
word_tokenize(text, engine="newmm") # ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
word_tokenize(text, engine="icu") # ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
>>> from pythainlp.tokenize import word_tokenize
>>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
>>> word_tokenize(text, engine="newmm")
['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
>>> word_tokenize(text, engine="icu")
['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
"""
if engine == "icu":
from .pyicu import segment
elif engine == "multi_cut" or engine == "mm":
from .multi_cut import segment
if engine == "newmm" or engine == "onecut":
from .newmm import mmcut as segment
elif engine == "longest" or engine == "longest-matching":
from .longest import segment
elif engine == "ulmfit":
from .newmm import mmcut

def segment(text):
return mmcut(text, trie=FROZEN_DICT_TRIE)
elif engine == "longest-matching":
from .longest import segment
elif engine == "pylexto":
from .pylexto import segment

elif engine == "icu":
from .pyicu import segment
elif engine == "deepcut":
from .deepcut import segment
elif engine == "wordcutpy":
from .wordcutpy import segment
else: # default, use "newmm" ("onecut") engine
elif engine == "pylexto":
from .pylexto import segment
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
else: # default, use "newmm" engine
from .newmm import mmcut as segment

if not whitespaces:
@@ -63,27 +69,28 @@ def segment(text):
def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
"""
:meth:`dict_word_tokenize` tokenizes text based on the dictionary you provide. The dictionary must be given as a trie data structure.

:param str text: the text to be tokenized
:param dict custom_dict_trie: a trie created by create_custom_dict_trie
:param str engine: tokenization engine to use (newmm, wordcutpy, mm, longest-matching)
:return: A list of words, tokenized from a text.
:param str text: text to be tokenized
:param dict custom_dict_trie: a dictionary trie
:param str engine: tokenization engine to use (newmm, longest, wordcutpy)
:return: list of words
**Example**::
>>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
>>> listword=['แมว',"ดี"]
>>> data_dict=create_custom_dict_trie(listword)
>>> dict_word_tokenize("แมวดีดีแมว",data_dict)
>>> listword = ["แมว", "ดี"]
>>> data_dict = create_custom_dict_trie(listword)
>>> dict_word_tokenize("แมวดีดีแมว", data_dict)
['แมว', 'ดี', 'ดี', 'แมว']
"""
if engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
elif engine == "longest-matching":
if engine == "newmm" or engine == "onecut":
from .newmm import mmcut as segment
elif engine == "longest" or engine == "longest-matching":
from .longest import segment
elif engine == "wordcutpy":
from .wordcutpy import segment

return segment(text, custom_dict_trie.keys())
else: # default, use "newmm" ("onecut") engine
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
else: # default, use "newmm" engine
from .newmm import mmcut as segment

return segment(text, custom_dict_trie)
@@ -167,11 +174,12 @@ def syllable_tokenize(text):


def create_custom_dict_trie(custom_dict_source):
"""The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html

:param string/list custom_dict_source: a list of vocabularies or a path to a source file
"""
The function is used to create a custom dict trie, which will be used by the word_tokenize() function.
For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html

:return: A trie created from custom dict input
:param string/list custom_dict_source: a list of vocabularies or a path to a source file
:return: a trie created from custom dictionary input
"""

if type(custom_dict_source) is str:
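A minimal sketch combining create_custom_dict_trie and dict_word_tokenize as documented above; the vocabulary list mirrors the docstring example.

from pythainlp.tokenize import create_custom_dict_trie, dict_word_tokenize

# Build a trie from a custom vocabulary (a file path could be passed instead).
trie = create_custom_dict_trie(["แมว", "ดี"])
print(dict_word_tokenize("แมวดีดีแมว", trie, engine="newmm"))
# ['แมว', 'ดี', 'ดี', 'แมว']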
1 change: 0 additions & 1 deletion pythainlp/tokenize/deepcut.py
@@ -2,7 +2,6 @@
"""
Wrapper for deepcut Thai word segmentation
"""
import sys

try:
import deepcut