From eee54c17d4086d8fe21f6cffb06442d994fd29ae Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Tue, 23 Oct 2018 19:25:59 +0700
Subject: [PATCH 01/12] Sort tokenizers; add more details on word tokenizers

- add alias "longest" for the longest-matching tokenizer
- remove "dict" tokenizer from the documentation, as there is no
  implementation in the code
- remove "mm" tokenizer from the documentation, as it is not recommended
  (maintenance mode), but keep the code so people can still call it
- update doc: pythainlp-dev-thai.md
- remove unused "import sys"

---
 docs/pythainlp-dev-thai.md         | 28 ++++++------
 examples/tokenize.py               |  2 +-
 pythainlp/romanization/pyicu.py    |  2 -
 pythainlp/sentiment/ulmfit_sent.py |  1 -
 pythainlp/tag/__init__.py          |  7 ++-
 pythainlp/tag/perceptron.py        |  2 +-
 pythainlp/tokenize/__init__.py     | 70 +++++++++++++++++-------------
 pythainlp/tokenize/deepcut.py      |  1 -
 pythainlp/tokenize/longest.py      |  3 +-
 pythainlp/tokenize/pyicu.py        |  1 -
 pythainlp/tokenize/pylexto.py      |  1 -
 pythainlp/tokenize/wordcutpy.py    |  1 -
 pythainlp/ulmfit/utils.py          | 12 +++--
 pythainlp/word_vector/thai2vec.py  |  2 -
 tests/__init__.py                  |  2 +-
 15 files changed, 65 insertions(+), 70 deletions(-)

diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
index 82a5581e5..e93b0a6cb 100644
--- a/docs/pythainlp-dev-thai.md
+++ b/docs/pythainlp-dev-thai.md
@@ -50,18 +50,16 @@ word_tokenize(text, engine)
 ```
 text คือ ข้อความในรูปแบบสตริง str เท่านั้น
 
-engine คือ ระบบตัดคำ ปัจจุบัน PyThaiNLP มี 6 engine ดังนี้
+engine คือ ระบบตัดคำ ปัจจุบันมี engine ดังนี้
 
-1. newmm (ค่าเริ่มต้น) - ใช้วิธี Maximum Matching โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
-2. icu - เรียกใช้ตัวตัดคำจาก ICU ตัวดั้งเดิมของ PyThaiNLP (ความแม่นยำต่ำ)
-3. dict - ตัดคำโดยใช้พจานุกรมจาก thaiword.txt ใน corpus (ความแม่นยำปานกลาง) **จะคืนค่า False หากข้อความนั้นไม่สามารถตัดคำได้**
-4. longest-matching - ใช้วิธี Longest Matching
-5. mm - ใช้วิธี Maximum Matching **(โค้ดชุดเก่า อยู่ในสถานะบำรุงรักษาเท่านั้น)**
-6. pylexto - เรียกใช้ตัวตัดคำจาก LexTo ซึ่งเป็น Longest Matching
-7. deepcut - เรียกใช้ [deepcut](https://github.com/rkcosmos/deepcut) ซึ่งตัดคำจากโมเดลการเรียนรู้ของเครื่อง
-8. wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy)
+- newmm (ค่าเริ่มต้น) - ใช้พจนานุกรม ด้วยวิธี Maximum Matching โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
+- longest - ใช้พจนานุกรม ด้วยวิธี Longest Matching
+- icu - เรียกใช้ตัวตัดคำจาก ICU ใช้พจนานุกรม (ความแม่นยำต่ำ)
+- pylexto - เรียกใช้ตัวตัดคำจาก LexTo ใช้พจนานุกรม ด้วยวิธี Longest Matching
+- wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy) ใช้พจนานุกรม
+- deepcut - เรียกใช้ตัวตัดคำจาก [deepcut](https://github.com/rkcosmos/deepcut) ใช้การเรียนรู้ของเครื่อง
 
-คืนค่าเป็น ''list'' เช่น ['แมว','กิน']
+คืนค่าเป็น ''list'' เช่น ['แมว', 'กิน']
 
 **การใช้งาน**
 
@@ -86,11 +84,11 @@ text คือ ข้อความที่ต้องการตัดค
 
 filename คือ ที่ตั้งไฟล์ที่ต้องการมาเป็นฐานข้อมูลตัดคำ
 
-engine คือ เครื่องมือตัดคำ
-- newmm ตัดคำด้วย newmm
-- wordcutpy ใช้ [wordcutpy](https://github.com/veer66/wordcutpy) ในการตัดคำ
-- mm ตัดคำด้วย mm
-- longest-matching ตัดคำโดยใช้ longest matching
+engine คือ ระบบตัดคำ (ดูรายละเอียดที่ word_tokenize)
+- newmm 
+- mm
+- longest
+- wordcutpy
 
 ตัวอย่างการใช้งาน https://gist.github.com/wannaphongcom/1e862583051bf0464b6ef4ed592f739c
 
diff --git a/examples/tokenize.py b/examples/tokenize.py
index c6b6028e5..0b8a0d00b 100644
--- a/examples/tokenize.py
+++ b/examples/tokenize.py
@@ -20,5 +20,5 @@
 print(word_tokenize(text2))
 # ['กฎหมายแรงงาน']
 
-print(word_tokenize(text2, engine="longest-matching"))
+print(word_tokenize(text2, engine="longest"))
 # ['กฎหมาย', 'แรงงาน']
diff --git a/pythainlp/romanization/pyicu.py b/pythainlp/romanization/pyicu.py
index 3ba61fc18..732db3e24 100644
--- a/pythainlp/romanization/pyicu.py
+++ b/pythainlp/romanization/pyicu.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 
-import sys
-
 try:
     import icu
 except ImportError:
diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py
index fb58d227f..ee9809e13 100644
--- a/pythainlp/sentiment/ulmfit_sent.py
+++ b/pythainlp/sentiment/ulmfit_sent.py
@@ -3,7 +3,6 @@
 Sentiment analyzer based on thai2vec ("ulmfit" engine)
 Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
 """
-import sys
 from collections import defaultdict
 
 from pythainlp.corpus import download, get_file
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
index 8efb234e4..ec367816c 100644
--- a/pythainlp/tag/__init__.py
+++ b/pythainlp/tag/__init__.py
@@ -2,9 +2,8 @@
 """
 Part-Of-Speech tagger
 """
-import sys
 
-ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
+_ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
 
 
 def pos_tag(words, engine="unigram", corpus="orchid"):
@@ -31,11 +30,11 @@ def _tag(text, corpus=None):
             except ImportError:
                 from pythainlp.tools import install_package
 
-                install_package(ARTAGGER_URL)
+                install_package(_ARTAGGER_URL)
                 try:
                     from artagger import Tagger
                 except ImportError:
-                    raise ImportError("Error: Try 'pip install " + ARTAGGER_URL + "'")
+                    raise ImportError("Error: Try 'pip install " + _ARTAGGER_URL + "'")
 
             words = Tagger().tag(" ".join(text))
 
diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
index 1b0927d9a..519b35a03 100644
--- a/pythainlp/tag/perceptron.py
+++ b/pythainlp/tag/perceptron.py
@@ -24,7 +24,7 @@ def pud_data():
     return model
 
 
-def tag(text, corpus):
+def tag(text, corpus="pud"):
     """
     รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
     if corpus == "orchid":
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 2cc0c689c..560a8ec80 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -16,42 +16,48 @@
 
 def word_tokenize(text, engine="newmm", whitespaces=True):
     """
-    :param str text:  the text to be tokenized
-    :param str engine: the engine to tokenize text
-    :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai.
+    :param str text: text to be tokenized
+    :param str engine: tokenizer to be used
+    :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai
     :Parameters for engine:
-        * newmm - Maximum Matching algorithm + TCC
-        * icu -  IBM ICU
-        * longest-matching - Longest matching
-        * mm - Maximum Matching algorithm
-        * pylexto - LexTo
-        * deepcut - Deep Neural Network
-        * wordcutpy - wordcutpy (https://github.com/veer66/wordcutpy)
-    :return: A list of words, tokenized from a text
+        * newmm (default) - dictionary-based, Maximum Matching + TCC
+        * mm - dictionary-based, Maximum Matching
+        * longest - dictionary-based, Longest Matching
+        * icu - wrapper for ICU, dictionary-based
+        * pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
+        * wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
+        * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
+    :return: list of words, tokenized from the text
 
     **Example**::
-    from pythainlp.tokenize import word_tokenize
-    text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
-    word_tokenize(text, engine="newmm")  # ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
-    word_tokenize(text, engine="icu")  # ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
+        >>> from pythainlp.tokenize import word_tokenize
+        >>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
+        >>> word_tokenize(text, engine="newmm")
+        ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
+        >>> word_tokenize(text, engine="icu")
+        ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
     """
-    if engine == "icu":
+    if engine == "newmm" or engine == "onecut":
         from .pyicu import segment
-    elif engine == "multi_cut" or engine == "mm":
-        from .multi_cut import segment
+    elif engine == "longest" or engine == "longest-matching":
+        from .longest import segment
     elif engine == "ulmfit":
         from .newmm import mmcut
+
         def segment(text):
             return mmcut(text, trie=FROZEN_DICT_TRIE)
-    elif engine == "longest-matching":
-        from .longest import segment
-    elif engine == "pylexto":
-        from .pylexto import segment
+
+    elif engine == "icu":
+        from .pyicu import segment
     elif engine == "deepcut":
         from .deepcut import segment
+    elif engine == "pylexto":
+        from .pylexto import segment
     elif engine == "wordcutpy":
         from .wordcutpy import segment
-    else:  # default, use "newmm" ("onecut") engine
+    elif engine == "mm" or engine == "multi_cut":
+        from .multi_cut import segment
+    else:  # default, use "newmm" engine
         from .newmm import mmcut as segment
 
     if not whitespaces:
@@ -66,24 +72,26 @@ def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
 
     :param str text: the text to be tokenized
     :param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie
-    :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching)
+    :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest)
     :return: A list of words, tokenized from a text.
     **Example**::
         >>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
-        >>> listword=['แมว',"ดี"]
-        >>> data_dict=create_custom_dict_trie(listword)
-        >>> dict_word_tokenize("แมวดีดีแมว",data_dict)
+        >>> listword = ["แมว", "ดี"]
+        >>> data_dict = create_custom_dict_trie(listword)
+        >>> dict_word_tokenize("แมวดีดีแมว", data_dict)
         ['แมว', 'ดี', 'ดี', 'แมว']
     """
-    if engine == "mm" or engine == "multi_cut":
-        from .multi_cut import segment
-    elif engine == "longest-matching":
+    if engine == "newmm" or engine == "onecut":
+        from .pyicu import segment
+    elif engine == "longest" or engine == "longest-matching":
         from .longest import segment
     elif engine == "wordcutpy":
         from .wordcutpy import segment
 
         return segment(text, custom_dict_trie.keys())
-    else:  # default, use "newmm" ("onecut") engine
+    elif engine == "mm" or engine == "multi_cut":
+        from .multi_cut import segment
+    else:  # default, use "newmm" engine
         from .newmm import mmcut as segment
 
     return segment(text, custom_dict_trie)
diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py
index bad0eb906..20f744f25 100644
--- a/pythainlp/tokenize/deepcut.py
+++ b/pythainlp/tokenize/deepcut.py
@@ -2,7 +2,6 @@
 """
 Wrapper for deepcut Thai word segmentation
 """
-import sys
 
 try:
     import deepcut
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index a6b1ad8d8..2cb90479e 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -2,7 +2,8 @@
 """
 Longest-matching Thai word segmentation
 
-Based on code from https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py
+Based on code from
+https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py
 """
 import re
 
diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py
index d8c107e0d..45b5adb05 100644
--- a/pythainlp/tokenize/pyicu.py
+++ b/pythainlp/tokenize/pyicu.py
@@ -3,7 +3,6 @@
 Wrapper for ICU word segmentation
 """
 import re
-import sys
 
 try:
     import icu
diff --git a/pythainlp/tokenize/pylexto.py b/pythainlp/tokenize/pylexto.py
index ba137db7b..a90bb3109 100644
--- a/pythainlp/tokenize/pylexto.py
+++ b/pythainlp/tokenize/pylexto.py
@@ -2,7 +2,6 @@
 """
 Wrapper for LexTo Thai word segmentation
 """
-import sys
 
 _LEXTO_URL = "https://github.com/PyThaiNLP/pylexto/archive/master.zip"
 
diff --git a/pythainlp/tokenize/wordcutpy.py b/pythainlp/tokenize/wordcutpy.py
index 2dff149e3..04b810816 100644
--- a/pythainlp/tokenize/wordcutpy.py
+++ b/pythainlp/tokenize/wordcutpy.py
@@ -2,7 +2,6 @@
 """
 Wrapper for WordCut Thai word segmentation
 """
-import sys
 
 try:
     from wordcut import Wordcut
diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py
index 138db40c1..75ebd00c5 100644
--- a/pythainlp/ulmfit/utils.py
+++ b/pythainlp/ulmfit/utils.py
@@ -4,7 +4,6 @@
 Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
 """
 import re
-import sys
 
 from pythainlp.corpus import download, get_file
 from pythainlp.tokenize import word_tokenize
@@ -43,12 +42,11 @@ class ThaiTokenizer:
     def __init__(self, engine="newmm"):
         """
         :parameters for tokenization engine:
-            * newmm - Maximum Matching algorithm + TCC
-            * icu - IBM ICU
-            * longest-matching - Longest matching
-            * mm - Maximum Matching algorithm
-            * pylexto - LexTo
-            * deepcut - Deep Neural Network
+            * newmm - dictionary-based, Maximum Matching algorithm + TCC
+            * longest - dictionary-based, Longest Matching
+            * icu - use ICU, dictionary-based
+            * pylexto - use LexTo, dictionary-based
+            * deepcut - use deepcut, language model-based
         """
         self.engine = engine
         self.__RE_BR = re.compile(r"<\s*br\s*/?>", re.IGNORECASE)
diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py
index c31f8f685..dd7a03ca2 100644
--- a/pythainlp/word_vector/thai2vec.py
+++ b/pythainlp/word_vector/thai2vec.py
@@ -3,8 +3,6 @@
 thai2vec - Thai word vector
 Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb
 """
-import sys
-
 from pythainlp.corpus import download as download_data
 from pythainlp.corpus import get_file
 from pythainlp.tokenize import word_tokenize
diff --git a/tests/__init__.py b/tests/__init__.py
index f606f6a00..06fb1a772 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -78,7 +78,7 @@ def test_segment_newmm(self):
 
     def test_segment_longest_matching(self):
         self.assertEqual(
-            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest-matching"),
+            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
 

From 060dae253e5b41ab4bd10269e67d3b8ecfb0ab60 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Tue, 23 Oct 2018 19:31:47 +0700
Subject: [PATCH 02/12] Update doc

---
 docs/pythainlp-dev-thai.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
index e93b0a6cb..ca2a41ee1 100644
--- a/docs/pythainlp-dev-thai.md
+++ b/docs/pythainlp-dev-thai.md
@@ -52,7 +52,7 @@ text คือ ข้อความในรูปแบบสตริง str
 
 engine คือ ระบบตัดคำ ปัจจุบันมี engine ดังนี้
 
-- newmm (ค่าเริ่มต้น) - ใช้พจนานุกรม ด้วยวิธี Maximum Matching โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
+- newmm (ค่าเริ่มต้น) - ใช้พจนานุกรม ด้วยวิธี Maximum Matching + Thai Character Cluster โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
 - longest - ใช้พจนานุกรม ด้วยวิธี Longest Matching
 - icu - เรียกใช้ตัวตัดคำจาก ICU ใช้พจนานุกรม (ความแม่นยำต่ำ)
 - pylexto - เรียกใช้ตัวตัดคำจาก LexTo ใช้พจนานุกรม ด้วยวิธี Longest Matching

From 370a2077cadfac236e5b449d2821c17ff2b2ea16 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Tue, 23 Oct 2018 19:46:41 +0700
Subject: [PATCH 03/12] Use consistent indentation; update doc

---
 docs/conf.py                       |   1 -
 docs/pythainlp-dev-thai.md         | 135 +++++++++++++----------------
 pythainlp/sentiment/ulmfit_sent.py |   3 +-
 pythainlp/tokenize/longest.py      |   1 -
 pythainlp/word_vector/thai2vec.py  |  18 ++--
 5 files changed, 73 insertions(+), 85 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index c3f8194b8..bc1b294f1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -16,7 +16,6 @@
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
 from datetime import datetime
-import sys, os
 
 # -- Project information -----------------------------------------------------
 
diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
index ca2a41ee1..3e8677460 100644
--- a/docs/pythainlp-dev-thai.md
+++ b/docs/pythainlp-dev-thai.md
@@ -145,9 +145,9 @@ check_all สำหรับส่งคืนค่า True หรือ False
 
 **เครดิต**
 
-TCC: Jakkrit TeCho
-Grammar: Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
-Python code: Korakot Chaovavanich 
+- TCC: Jakkrit TeCho
+- Grammar: Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
+- Python code: Korakot Chaovavanich 
 
 **การใช้งาน**
 
@@ -163,9 +163,8 @@ tcc.tcc("ประเทศไทย")  # 'ป/ระ/เท/ศ/ไท/ย'
 **การใช้งาน**
 
 ```python
->>> from pythainlp.tokenize import etcc
->>> etcc.etcc('คืนความสุข')
-'/คืน/ความสุข'
+from pythainlp.tokenize import etcc
+etcc.etcc('คืนความสุข')  # '/คืน/ความสุข'
 ```
 
 ### tag
@@ -179,16 +178,14 @@ pos_tag(text, engine="unigram", corpus="orchid")
 
 list คือ list ที่เก็บข้อความหลังผ่านการตัดคำแล้ว
 
-engine คือ ตัวติดป้ายกำกับคำ (pos tagger) มี 3 ตัวดังนี้
-
-1. unigram (ค่าเริ่มต้น) - UnigramTagger
-2. perceptron - PerceptronTagger
-3. artagger - RDR POS Tagger ละเอียดยิ่งกว่าเดิม
+engine คือ ตัวติดป้ายกำกับคำ (pos tagger) มีดังนี้
+- unigram (ค่าเริ่มต้น) - UnigramTagger
+- perceptron - PerceptronTagger
+- artagger - RDR POS Tagger ละเอียดยิ่งกว่าเดิม
 
 corpus ที่รองรับ
-
-1. orchid
-2. pud ใช้ข้อมูล Parallel Universal Dependencies (PUD) treebanks
+- orchid ใช้ข้อมูลจากคลังคำ ORCHID โดยเนคเทค
+- pud ใช้ข้อมูล Parallel Universal Dependencies (PUD) treebanks
 
 ### summarize
 
@@ -199,16 +196,18 @@ summarize_text(text, n , engine="frequency")
 ```
 
 text เป็นข้อความ
+
 n คือ จำนวนประโยคสรุป
+
 engine ที่รองรับ
-frequency
+- frequency
 
 **การใช้งาน**
 
 ```python
->>> from pythainlp.summarize import summarize_text
->>> summarize_text(text="อาหาร หมายถึง ของแข็งหรือของเหลว ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว จะทำให้เกิดพลังงานและความร้อนยเจริญเติบโต ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ อาหารจะต้องงกาย", n=1, engine="frequency")
-['อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย']
+from pythainlp.summarize import summarize_text
+summarize_text(text="อาหาร หมายถึง ของแข็งหรือของเหลว ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว จะทำให้เกิดพลังงานและความร้อนยเจริญเติบโต ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ อาหารจะต้องงกาย", n=1, engine="frequency")
+# ['อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย']
 ```
 
 ### word_vector
@@ -226,7 +225,6 @@ word_vector เป็นระบบ word vector ใน PyThaiNLP
 #### thai2vec
 
 ความต้องการโมดูล
-
 - gensim
 - numpy
 
@@ -263,13 +261,11 @@ lentext คือ จำนวนคำขั้นต่ำที่ต้อ
 from pythainlp.romanization import romanize
 romanize(str, engine="royin")
 ```
-มี 2 engine ดังนี้
 
+มี engine ดังนี้
 - pyicu ส่งค่าสัทอักษร
 - royin ใช้หลักเกณฑ์การถอดอักษรไทยเป็นอักษรโรมัน ฉบับราชบัณฑิตยสถาน (**หากมีข้อผิดพลาด ให้ใช้คำอ่าน เนื่องจากตัว royin ไม่มีตัวแปลงคำเป็นคำอ่าน**) 
 
-data :
-
 รับค่า ''str'' ข้อความ 
 
 คืนค่าเป็น ''str'' ข้อความ
@@ -290,9 +286,8 @@ spell(word, engine="pn")
 ```
 
 engine ที่รองรับ
-
-- pn พัฒนามาจาก Peter Norvig (ค่าเริ่มต้น)
-- hunspell ใช้ hunspell
+- pn พัฒนาจาก Peter Norvig (ค่าเริ่มต้น)
+- hunspell เรียก hunspell ที่ติดตั้งอยู่ในระบบปฏิบัติการ (มีในระบบ Linux)
 
 **ตัวอย่างการใช้งาน**
 
@@ -307,39 +302,35 @@ print(a)  # ['สี่เหลี่ยม']
 correction(word)
 ```
 
-แสดงคำที่เป็นไปได้มากที่สุด
+จะคืนค่าคำที่เป็นไปได้มากที่สุด
 
 **ตัวอย่างการใช้งาน**
 
 ```python
 from pythainlp.spell.pn import correction
 a = correction("สี่เหลียม")
-print(a) # ['สี่เหลี่ยม']
-```
-
-ผลลัพธ์
-
-```
-สี่เหลี่ยม
+print(a)  # ['สี่เหลี่ยม']
 ```
 
 ### pythainlp.number
 
+จัดการกับตัวเลข
+
 ```python
 from pythainlp.number import *
 ```
-จัดการกับตัวเลข โดยมีดังนี้
 
-- thai_num_to_num(str) - เป็นการแปลงเลขไทยสู่เลข
-- thai_num_to_text(str) - เลขไทยสู่ข้อความ
-- num_to_thai_num(str) - เลขสู่เลขไทย
+มีฟังก์ชันดังนี้
+- thai_num_to_num(str) - แปลงเลขไทยสู่เลขอารบิก
+- thai_num_to_text(str) - เลขไทยสู่คำอ่านไทย
+- num_to_thai_num(str) - เลขอารบิกสู่เลขไทย
 - num_to_text(str) - เลขสู่ข้อความ
 - text_to_num(str) - ข้อความสู่เลข
-- numtowords(float) - อ่านจำนวนตัวเลขภาษาไทย (บาท) รับค่าเป็น ''float'' คืนค่าเป็น 'str'
+- numtowords(float) - อ่านจำนวนภาษาไทย (บาท) รับค่าเป็น ''float'' คืนค่าเป็น 'str'
 
 ### collation
 
-ใช้ในการเรียงลำดับข้อมูลภาษาไทยใน List
+เรียงลำดับข้อมูลภาษาไทยใน List
 
 ```python
 from pythainlp.collation import collation
@@ -405,17 +396,12 @@ from pythainlp.change import *
 **การใช้งาน**
 
 ```python
->>> from pythainlp.soundex import LK82, Udom83
->>> print(LK82("รถ"))
-ร3000
->>> print(LK82("รด"))
-ร3000
->>> print(LK82("จัน"))
-จ4000
->>> print(LK82("จันทร์"))
-จ4000
->>> print(Udom83("รถ"))
-ร800000
+from pythainlp.soundex import LK82, Udom83
+print(LK82("รถ"))  # ร3000
+print(LK82("รด"))  # ร3000
+print(LK82("จัน"))  # จ4000
+print(LK82("จันทร์"))  # จ4000
+print(Udom83("รถ"))  # ร800000
 ```
 
 ### MetaSound ภาษาไทย
@@ -427,9 +413,8 @@ Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statisti
 **การใช้งาน**
 
 ```python
->>> from pythainlp.MetaSound import *
->>> MetaSound("คน")
-'15'
+from pythainlp.MetaSound import *
+MetaSound("คน")  # '15'
 ```
 
 ### sentiment
@@ -441,7 +426,9 @@ from pythainlp.sentiment import sentiment
 sentiment(str)
 ```
 
-รับค่า str ส่งออกเป็น pos , neg
+รับค่า str
+
+คืนค่าเป็น str ซึ่งมีค่า "pos" หรือ "neg"
 
 ### Util
 
@@ -508,8 +495,7 @@ listtext_num2num(list)
 **ตัวอย่าง**
 
 ```python
->>> listtext_num2num(["หก", "ล้าน", "หกแสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"])
-6666666
+listtext_num2num(["หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"])  # 66666
 ```
 
 ### Corpus
@@ -543,17 +529,22 @@ API เหมือนกับ NLTK โดยรองรับ API ดัง
 **ตัวอย่าง**
 
 ```python
->>> from pythainlp.corpus import wordnet
->>> print(wordnet.synsets("หนึ่ง"))
-[Synset('one.s.05'), Synset('one.s.04'), Synset('one.s.01'), Synset('one.n.01')]
->>> print(wordnet.synsets("หนึ่ง")[0].lemma_names("tha"))
-[]
->>> print(wordnet.synset("one.s.05"))
-Synset('one.s.05')
->>> print(wordnet.synset("spy.n.01").lemmas())
-[Lemma('spy.n.01.spy'), Lemma('spy.n.01.undercover_agent')]
->>> print(wordnet.synset("spy.n.01").lemma_names("tha"))
-['สปาย', 'สายลับ']
+from pythainlp.corpus import wordnet
+
+print(wordnet.synsets("หนึ่ง"))
+# [Synset('one.s.05'), Synset('one.s.04'), Synset('one.s.01'), Synset('one.n.01')]
+
+print(wordnet.synsets("หนึ่ง")[0].lemma_names("tha"))
+# []
+
+print(wordnet.synset("one.s.05"))
+# Synset('one.s.05')
+
+print(wordnet.synset("spy.n.01").lemmas())
+# [Lemma('spy.n.01.spy'), Lemma('spy.n.01.undercover_agent')]
+
+print(wordnet.synset("spy.n.01").lemma_names("tha"))
+# ['สปาย', 'สายลับ']
 ```
 
 #### stopword ภาษาไทย
@@ -589,6 +580,7 @@ alphabet.get_data()
 ```python
 from pythainlp.corpus.thaiword import get_data  # ข้อมูลเก่า
 get_data()
+
 from pythainlp.corpus.newthaiword import get_data  # ข้อมูลใหม่
 get_data()
 ```
@@ -620,9 +612,9 @@ text_list คือ ข้อความภาษาไทยที่อยู
 **ตัวอย่าง**
 
 ```python
->>> d = ["หนองคาย", "น่าอยู่", "นอกจากนี้", "ยัง", "มี", "เชียงใหม่"]
->>> parsed_docs(d)
-["[LOC : 'หนองคาย']", 'น่าอยู่', 'นอกจากนี้', 'ยัง', 'มี', "[LOC : 'เชียงใหม่']"]
+d = ["หนองคาย", "น่าอยู่", "นอกจากนี้", "ยัง", "มี", "เชียงใหม่"]
+parsed_docs(d)
+# ["[LOC : 'หนองคาย']", 'น่าอยู่', 'นอกจากนี้', 'ยัง', 'มี', "[LOC : 'เชียงใหม่']"]
 ```
 
 #### ConceptNet
@@ -654,7 +646,6 @@ word คือ คำ
 domain คือ หมวดหมู่ของคำ
 
 มีหมวดหมู่ดังนี้
-
 - all
 - imaginative
 - natural-pure-science
@@ -666,5 +657,3 @@ domain คือ หมวดหมู่ของคำ
 - belief-thought
 - leisure
 - others
-
-เขียนโดย PyThaiNLP
diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py
index ee9809e13..2fd6a1d27 100644
--- a/pythainlp/sentiment/ulmfit_sent.py
+++ b/pythainlp/sentiment/ulmfit_sent.py
@@ -84,7 +84,8 @@ def about():
     return """
     Sentiment analyzer based on thai2vec
     Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge.
-    89% accuracy based on 15% validation set compared to 72% of fastText and 52% most-frequent-class baseline.
+    89% accuracy based on 15% validation set compared to
+    72% of fastText and 52% most-frequent-class baseline.
 
     Development: Charin Polpanumas
     GitHub: https://github.com/cstorm125/thai2vec
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index 2cb90479e..1b50e41cb 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -36,7 +36,6 @@
 
 
 class Tokenizer(object):
-
     def __init__(self, trie):
         self.__trie = trie
 
diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py
index dd7a03ca2..a390eae06 100644
--- a/pythainlp/word_vector/thai2vec.py
+++ b/pythainlp/word_vector/thai2vec.py
@@ -38,9 +38,9 @@ def get_model():
 
 def most_similar_cosmul(positive, negative):
     """
-	การใช้งาน
-	input list
-	"""
+    การใช้งาน
+    input list
+    """
     return get_model().most_similar_cosmul(positive=positive, negative=negative)
 
 
@@ -72,10 +72,10 @@ def sentence_vectorizer(text, dim=300, use_mean=False):
 
 def about():
     return """
-	thai2vec
-	State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language.
+    thai2vec
+    State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language.
     Created as part of pyThaiNLP with ULMFit implementation from fast.ai
-	
-	Development: Charin Polpanumas
-	GitHub: https://github.com/cstorm125/thai2vec
-	"""
+
+    Development: Charin Polpanumas
+    GitHub: https://github.com/cstorm125/thai2vec
+    """

From 101cdc8b71de7a07c757181f6bc28c386a4e7870 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Tue, 23 Oct 2018 19:52:47 +0700
Subject: [PATCH 04/12] update doc

---
 docs/pythainlp-dev-thai.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
index 3e8677460..10a78ffbf 100644
--- a/docs/pythainlp-dev-thai.md
+++ b/docs/pythainlp-dev-thai.md
@@ -29,9 +29,9 @@ pip install pythainlp
 **ติดตั้ง PyICU บน macOS**
 
 ```sh
-$ brew install icu4c --force
-$ brew link --force icu4c
-$ CFLAGS=-I/usr/local/opt/icu4c/include LDFLAGS=-L/usr/local/opt/icu4c/lib pip install pythainlp
+brew install icu4c --force
+brew link --force icu4c
+CFLAGS=-I/usr/local/opt/icu4c/include LDFLAGS=-L/usr/local/opt/icu4c/lib pip install pythainlp
 ```
 
 ข้อมูลเพิ่มเติมที่ https://medium.com/data-science-cafe/install-polyglot-on-mac-3c90445abc1f
@@ -85,7 +85,7 @@ text คือ ข้อความที่ต้องการตัดค
 filename คือ ที่ตั้งไฟล์ที่ต้องการมาเป็นฐานข้อมูลตัดคำ
 
 engine คือ ระบบตัดคำ (ดูรายละเอียดที่ word_tokenize)
-- newmm 
+- newmm
 - mm
 - longest
 - wordcutpy
@@ -147,7 +147,7 @@ check_all สำหรับส่งคืนค่า True หรือ False
 
 - TCC: Jakkrit TeCho
 - Grammar: Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
-- Python code: Korakot Chaovavanich 
+- Python code: Korakot Chaovavanich
 
 **การใช้งาน**
 
@@ -158,7 +158,7 @@ tcc.tcc("ประเทศไทย")  # 'ป/ระ/เท/ศ/ไท/ย'
 
 #### Enhanced Thai Character Cluster (ETCC)
 
-นอกจาก TCC แล้ว PyThaiNLP 1.4 ยังรองรับ Enhanced Thai Character Cluster (ETCC) โดยแบ่งกลุ่มด้วย /
+นอกจาก TCC แล้ว PyThaiNLP ยังรองรับ Enhanced Thai Character Cluster (ETCC) โดยแบ่งกลุ่มด้วย /
 
 **การใช้งาน**
 
@@ -264,9 +264,9 @@ romanize(str, engine="royin")
 
 มี engine ดังนี้
 - pyicu ส่งค่าสัทอักษร
-- royin ใช้หลักเกณฑ์การถอดอักษรไทยเป็นอักษรโรมัน ฉบับราชบัณฑิตยสถาน (**หากมีข้อผิดพลาด ให้ใช้คำอ่าน เนื่องจากตัว royin ไม่มีตัวแปลงคำเป็นคำอ่าน**) 
+- royin ใช้หลักเกณฑ์การถอดอักษรไทยเป็นอักษรโรมัน ฉบับราชบัณฑิตยสถาน (**หากมีข้อผิดพลาด ให้ใช้คำอ่าน เนื่องจากตัว royin ไม่มีตัวแปลงคำเป็นคำอ่าน**)
 
-รับค่า ''str'' ข้อความ 
+รับค่า ''str'' ข้อความ
 
 คืนค่าเป็น ''str'' ข้อความ
 
@@ -277,16 +277,16 @@ from pythainlp.romanization import romanize
 romanize("แมว") # 'maew'
 ```
 
-### spell 
+### spell
 
-ตรวจสอบคำผิดในภาษาไทย 
+ตรวจสอบคำผิดในภาษาไทย
 
 ```python
 spell(word, engine="pn")
 ```
 
 engine ที่รองรับ
-- pn พัฒนาจาก Peter Norvig (ค่าเริ่มต้น)
+- pn (ค่าเริ่มต้น) พัฒนาจาก Peter Norvig
 - hunspell เรียก hunspell ที่ติดตั้งอยู่ในระบบปฏิบัติการ (มีในระบบ Linux)
 
 **ตัวอย่างการใช้งาน**
@@ -440,7 +440,7 @@ from pythainlp.util import *
 
 #### ngrams
 
-สำหรับสร้าง n-grams 
+สำหรับสร้าง n-grams
 
 ```python
 ngrams(token, num)
@@ -481,7 +481,7 @@ normalize(text)
 
 ```python
 # เ เ ป ล ก กับ แปลก
-normalize("เเปลก") == "แปลก"  # True 
+normalize("เเปลก") == "แปลก"  # True
 ```
 
 #### listtext_num2num

From ff654d133e74dd016c6e6f01639ccc40f8a83bf8 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Tue, 23 Oct 2018 23:50:52 +0700
Subject: [PATCH 05/12] Fix tokenizer selector; minor docs additions

---
 docs/pythainlp-dev-thai.md     |  41 ++++++-------
 mkdocs.yml                     |   8 ++-
 pythainlp/MetaSound.py         | 104 ++++++++++++++++-----------------
 pythainlp/tokenize/__init__.py |  30 +++++-----
 4 files changed, 92 insertions(+), 91 deletions(-)

diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
index 10a78ffbf..42d92914e 100644
--- a/docs/pythainlp-dev-thai.md
+++ b/docs/pythainlp-dev-thai.md
@@ -55,8 +55,8 @@ engine คือ ระบบตัดคำ ปัจจุบันมี engi
 - newmm (ค่าเริ่มต้น) - ใช้พจนานุกรม ด้วยวิธี Maximum Matching + Thai Character Cluster โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
 - longest - ใช้พจนานุกรม ด้วยวิธี Longest Matching
 - icu - เรียกใช้ตัวตัดคำจาก ICU ใช้พจนานุกรม (ความแม่นยำต่ำ)
-- pylexto - เรียกใช้ตัวตัดคำจาก LexTo ใช้พจนานุกรม ด้วยวิธี Longest Matching
 - wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy) ใช้พจนานุกรม
+- pylexto - เรียกใช้ตัวตัดคำจาก LexTo ใช้พจนานุกรม ด้วยวิธี Longest Matching
 - deepcut - เรียกใช้ตัวตัดคำจาก [deepcut](https://github.com/rkcosmos/deepcut) ใช้การเรียนรู้ของเครื่อง
 
 คืนค่าเป็น ''list'' เช่น ['แมว', 'กิน']
@@ -67,8 +67,8 @@ engine คือ ระบบตัดคำ ปัจจุบันมี engi
 from pythainlp.tokenize import word_tokenize
 
 text = "โอเคบ่เรารักภาษาถิ่น"
->>> word_tokenize(text, engine="newmm")  # ['โอเค', 'บ่', 'เรา', 'รัก', 'ภาษาถิ่น']
->>> word_tokenize(text, engine="icu")  # ['โอ', 'เค', 'บ่', 'เรา', 'รัก', 'ภาษา', 'ถิ่น']
+word_tokenize(text, engine="newmm")  # ['โอเค', 'บ่', 'เรา', 'รัก', 'ภาษาถิ่น']
+word_tokenize(text, engine="icu")  # ['โอ', 'เค', 'บ่', 'เรา', 'รัก', 'ภาษา', 'ถิ่น']
 ```
 
 #### dict_word_tokenize
@@ -86,7 +86,6 @@ filename คือ ที่ตั้งไฟล์ที่ต้องกา
 
 engine คือ ระบบตัดคำ (ดูรายละเอียดที่ word_tokenize)
 - newmm
-- mm
 - longest
 - wordcutpy
 
@@ -115,9 +114,8 @@ engine คือ เครื่องมือสำหรับใช้ตั
 ใช้ตัดคำ/ประโยคจากช่องว่างในสตริง
 
 ```python
->>> from pythainlp.tokenize import WhitespaceTokenizer
->>> WhitespaceTokenizer("ทดสอบ ตัดคำช่องว่าง")
-['ทดสอบ', 'ตัดคำช่องว่าง']
+from pythainlp.tokenize import WhitespaceTokenizer
+WhitespaceTokenizer("ทดสอบ ตัดคำช่องว่าง")  # ['ทดสอบ', 'ตัดคำช่องว่าง']
 ```
 
 
@@ -189,10 +187,10 @@ corpus ที่รองรับ
 
 ### summarize
 
-เป็นระบบสรุปเอกสารภาษาไทยแบบง่าย ๆ
+สรุปเอกสารภาษาไทยแบบง่าย ๆ
 
 ```python
-summarize_text(text, n , engine="frequency")
+summarize_text(text, n, engine="frequency")
 ```
 
 text เป็นข้อความ
@@ -212,13 +210,14 @@ summarize_text(text="อาหาร หมายถึง ของแข็ง
 
 ### word_vector
 
+สร้างเวกเตอร์คำ
+
 ```python
 from pythainlp.word_vector import thai2vec
 ```
 
-word_vector เป็นระบบ word vector ใน PyThaiNLP
 
-ปัจจุบันนี้รองรับเฉพาะ thai2vec (https://github.com/cstorm125/thai2vec)
+ปัจจุบันรองรับเฉพาะ thai2vec (https://github.com/cstorm125/thai2vec)
 
 พัฒนาโดย Charin Polpanumas
 
@@ -231,25 +230,25 @@ word_vector เป็นระบบ word vector ใน PyThaiNLP
 ##### API
 
 - get_model() - รับข้อมูล model ในรูปแบบของ gensim
-- most_similar_cosmul(positive,negative)
+- most_similar_cosmul(positive, negative)
 - doesnt_match(listdata)
-- similarity(word1,word2) - หาค่าความคล้ายกันระหว่าง 2 คำ โดยทั้งคู่เป็น str
-- sentence_vectorizer(ss,dim=300,use_mean=False)
+- similarity(word1, word2) - หาค่าความคล้ายระหว่าง 2 คำ โดยทั้งคู่เป็น str
+- sentence_vectorizer(ss, dim=300, use_mean=False)
 - about() - รายละเอียด thai2vec
 
 ### keywords
 
-ใช้หาคำสำคัญ (keyword) จากข้อความภาษาไทย
+หาคำสำคัญ (keyword) จากข้อความภาษาไทย
 
 #### find_keyword
 
-การทำงาน หาคำที่ถูกใช้งานมากกว่าค่าขั้นต่ำที่กำหนดได้ โดยจะลบ stopword ออกไป
+การทำงาน หาคำที่ถูกใช้งานมากกว่าค่าขั้นต่ำที่กำหนดได้ โดยจะลบ stopword ออก
 
 ```python
 find_keyword(word_list, lentext=3)
 ```
 
-word_list คือ list ของข้อความที่ผ่านการตัดคำแล้ว
+word_list คือ list ของข้อความที่ตัดคำแล้ว
 
 lentext คือ จำนวนคำขั้นต่ำที่ต้องการหา keyword
 
@@ -365,13 +364,12 @@ rank(list)
 **ตัวอย่างการใช้งาน**
 
 ```python
->>> rank(["แมง", "แมง", "คน"])
-Counter({'แมง': 2, 'คน': 1})
+rank(["แมง", "แมง", "คน"])  # Counter({'แมง': 2, 'คน': 1})
 ```
 
 ### change
 
-#### แก้ไขปัญหาการพิมพ์ลืมเปลี่ยนภาษา
+#### แก้ไขปัญหาการลืมเปลี่ยนภาษาแป้นพิมพ์
 
 ```python
 from pythainlp.change import *
@@ -388,8 +386,7 @@ from pythainlp.change import *
 
 เครดิต Korakot Chaovavanich https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
 
-กฎที่รองรับในรุ่น 1.4
-
+กฎที่รองรับ
 - LK82 - กฎการเข้ารหัสซาวน์เด็กซ์ของ วิชิตหล่อจีระชุณห์กุล และ เจริญ คุวินทร์พันธุ์
 - Udom83 - กฎการเข้ารหัสซาวน์เด็กซ์ของ วรรณี อุดมพาณิชย์
 
diff --git a/mkdocs.yml b/mkdocs.yml
index 17d2bd23d..6d8f2d20c 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,6 +1,10 @@
 site_name: PyThaiNLP
+site_description: A Python library for natural language processing of Thai language.
+site_author: PyThaiNLP team
+copyright: Apache License 2.0
+repo_url: https://github.com/PyThaiNLP/pythainlp
 theme: readthedocs
 pages:
 - Home: index.md
-- English: pythainlp-1-4-eng.md
-- Thai: pythainlp-1-4-thai.md
+- English: pythainlp-1-6-eng.md
+- Thai: pythainlp-1-6-thai.md
diff --git a/pythainlp/MetaSound.py b/pythainlp/MetaSound.py
index 1027462f0..4a2862859 100644
--- a/pythainlp/MetaSound.py
+++ b/pythainlp/MetaSound.py
@@ -1,64 +1,64 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals,print_function
-from builtins import *
-'''
+"""
 MetaSound
 
-References
-
-Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analysing Names Given in Accordance with Thai Astrology. Retrieved from https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
-'''
+References:
+Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
+Ontology for Analysing Names Given in Accordance with Thai Astrology.
+https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
+"""
 import re
-def MetaSound(name):
-    '''
+
+
+def metasound(text):
+    """
     Thai MetaSound
 
-    :param str name: thai text
-    :return: MetaSound for thai text
+    :param str text: Thai text
+    :return: MetaSound for Thai text
     **Example**::
-        >>> from pythainlp.MetaSound import MetaSound
+        >>> from pythainlp.metasound import metasound
         >>> MetaSound('รัก')
         '501'
         >>> MetaSound('ลัก')
         '501'
-    '''
-    name1=list(name)
-    count=len(name1)
-    word=[]
-    i=0
-    while i<count:
-        if (re.search(r'[ก-ฮ]',name1[i]),re.U):
-            word.append(name1[i])
-        i+=1
-    i=0
-    count=len(name1)
-    while i<count:
-        if (re.search('์',name1[i],re.U)):
-            word[i-1]=''
-            word[i]=''
-        i+=1
-    i=0
-    while i<count:
-        if (re.search('[กขฃคฆฅ]',word[i],re.U)):
-            name1[i]='1'
-        elif (re.search('[จฉชฌซฐทฒดฎตสศษ]',word[i],re.U)):
-            name1[i]='2'
-        elif (re.search('[ฟฝพผภบป]',word[i],re.U)):
-            name1[i]='3'
-        elif (re.search('[ง]',word[i],re.U)):
-            name1[i]='4'
-        elif (re.search('[ลฬรนณฦญ]',word[i],re.U)):
-            name1[i]='5'
-        elif (re.search('[ม]',word[i],re.U)):
-            name1[i]='6'
-        elif (re.search('[ย]',word[i],re.U)):
-            name1[i]='7'
-        elif (re.search('[ว]',word[i],re.U)):
-            name1[i]='8'
+    """
+    count = len(text)
+    sound = list(text)
+
+    i = 0
+    while i < count:
+        if re.search("์", sound[i]):
+            text[i - 1] = ""
+            text[i] = ""
+        i += 1
+
+    i = 0
+    while i < count:
+        if re.search("[กขฃคฆฅ]", text[i]):
+            sound[i] = "1"
+        elif re.search("[จฉชฌซฐทฒดฎตสศษ]", text[i]):
+            sound[i] = "2"
+        elif re.search("[ฟฝพผภบป]", text[i]):
+            sound[i] = "3"
+        elif re.search("[ง]", text[i]):
+            sound[i] = "4"
+        elif re.search("[ลฬรนณฦญ]", text[i]):
+            sound[i] = "5"
+        elif re.search("[ม]", text[i]):
+            sound[i] = "6"
+        elif re.search("[ย]", text[i]):
+            sound[i] = "7"
+        elif re.search("[ว]", text[i]):
+            sound[i] = "8"
         else:
-            name1[i]='0'
-        i+=1
-    return ''.join(name1)
-if __name__ == '__main__':
-    print(MetaSound('รัก'))
-    print(MetaSound('ลัก'))
+            sound[i] = "0"
+        i += 1
+
+    return "".join(sound)
+
+
+if __name__ == "__main__":
+    print(metasound("รัก"))
+    print(metasound("ลัก"))
+    print(metasound("น้อง"))
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 560a8ec80..c46f50a29 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -20,13 +20,13 @@ def word_tokenize(text, engine="newmm", whitespaces=True):
     :param str engine: tokenizer to be used
     :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai
     :Parameters for engine:
-        * newmm (default) - dictionary-based, Maximum Matching + TCC
-        * mm - dictionary-based, Maximum Matching
+        * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
         * longest - dictionary-based, Longest Matching
         * icu - wrapper for ICU, dictionary-based
-        * pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
         * wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
+        * pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
         * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
+        * ulmfit - use newmm engine with a specific dictionary for use with thai2vec
     :return: list of words, tokenized from the text
 
     **Example**::
@@ -38,7 +38,7 @@ def word_tokenize(text, engine="newmm", whitespaces=True):
         ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
     """
     if engine == "newmm" or engine == "onecut":
-        from .pyicu import segment
+        from .newmm import mmcut as segment
     elif engine == "longest" or engine == "longest-matching":
         from .longest import segment
     elif engine == "ulmfit":
@@ -51,10 +51,10 @@ def segment(text):
         from .pyicu import segment
     elif engine == "deepcut":
         from .deepcut import segment
-    elif engine == "pylexto":
-        from .pylexto import segment
     elif engine == "wordcutpy":
         from .wordcutpy import segment
+    elif engine == "pylexto":
+        from .pylexto import segment
     elif engine == "mm" or engine == "multi_cut":
         from .multi_cut import segment
     else:  # default, use "newmm" engine
@@ -69,11 +69,10 @@ def segment(text):
 def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
     """
     :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.
-
-    :param str text: the text to be tokenized
-    :param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie
-    :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest)
-    :return: A list of words, tokenized from a text.
+    :param str text: text to be tokenized
+    :param dict custom_dict_trie: a dictionary trie
+    :param str engine: tokenizer engine to use (newmm, longest, wordcutpy)
+    :return: list of words
     **Example**::
         >>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
         >>> listword = ["แมว", "ดี"]
@@ -175,11 +174,12 @@ def syllable_tokenize(text):
 
 
 def create_custom_dict_trie(custom_dict_source):
-    """The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html
-
-    :param string/list custom_dict_source:  a list of vocaburaries or a path to source file
+    """
+    The function is used to create a custom dict trie which will be used for word_tokenize() function.
+    For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html
 
-    :return: A trie created from custom dict input
+    :param string/list custom_dict_source: a list of vocabularies or a path to source file
+    :return: a trie created from custom dictionary input
     """
 
     if type(custom_dict_source) is str:

From 02748f9ff980ffca402850a10838b3cb4177c632 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Tue, 23 Oct 2018 23:55:27 +0700
Subject: [PATCH 06/12] delete mkdocs.yml

---
 mkdocs.yml | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 mkdocs.yml

diff --git a/mkdocs.yml b/mkdocs.yml
deleted file mode 100644
index 6d8f2d20c..000000000
--- a/mkdocs.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-site_name: PyThaiNLP
-site_description: A Python library for natural language processing of Thai language.
-site_author: PyThaiNLP team
-copyright: Apache License 2.0
-repo_url: https://github.com/PyThaiNLP/pythainlp
-theme: readthedocs
-pages:
-- Home: index.md
-- English: pythainlp-1-6-eng.md
-- Thai: pythainlp-1-6-thai.md

From 4f2dd0a71cccb807b6e2a787730f5bc2fbdfcc2c Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Tue, 23 Oct 2018 23:58:48 +0700
Subject: [PATCH 07/12] revert MetaSound for now

---
 pythainlp/MetaSound.py | 104 ++++++++++++++++++++---------------------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/pythainlp/MetaSound.py b/pythainlp/MetaSound.py
index 4a2862859..1027462f0 100644
--- a/pythainlp/MetaSound.py
+++ b/pythainlp/MetaSound.py
@@ -1,64 +1,64 @@
 # -*- coding: utf-8 -*-
-"""
+from __future__ import absolute_import,division,unicode_literals,print_function
+from builtins import *
+'''
 MetaSound
 
-References:
-Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
-Ontology for Analysing Names Given in Accordance with Thai Astrology.
-https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
-"""
-import re
-
+References
 
-def metasound(text):
-    """
+Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analysing Names Given in Accordance with Thai Astrology. Retrieved from https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
+'''
+import re
+def MetaSound(name):
+    '''
     Thai MetaSound
 
-    :param str text: Thai text
-    :return: MetaSound for Thai text
+    :param str name: thai text
+    :return: MetaSound for thai text
     **Example**::
-        >>> from pythainlp.metasound import metasound
+        >>> from pythainlp.MetaSound import MetaSound
         >>> MetaSound('รัก')
         '501'
         >>> MetaSound('ลัก')
         '501'
-    """
-    count = len(text)
-    sound = list(text)
-
-    i = 0
-    while i < count:
-        if re.search("์", sound[i]):
-            text[i - 1] = ""
-            text[i] = ""
-        i += 1
-
-    i = 0
-    while i < count:
-        if re.search("[กขฃคฆฅ]", text[i]):
-            sound[i] = "1"
-        elif re.search("[จฉชฌซฐทฒดฎตสศษ]", text[i]):
-            sound[i] = "2"
-        elif re.search("[ฟฝพผภบป]", text[i]):
-            sound[i] = "3"
-        elif re.search("[ง]", text[i]):
-            sound[i] = "4"
-        elif re.search("[ลฬรนณฦญ]", text[i]):
-            sound[i] = "5"
-        elif re.search("[ม]", text[i]):
-            sound[i] = "6"
-        elif re.search("[ย]", text[i]):
-            sound[i] = "7"
-        elif re.search("[ว]", text[i]):
-            sound[i] = "8"
+    '''
+    name1=list(name)
+    count=len(name1)
+    word=[]
+    i=0
+    while i<count:
+        if (re.search(r'[ก-ฮ]',name1[i]),re.U):
+            word.append(name1[i])
+        i+=1
+    i=0
+    count=len(name1)
+    while i<count:
+        if (re.search('์',name1[i],re.U)):
+            word[i-1]=''
+            word[i]=''
+        i+=1
+    i=0
+    while i<count:
+        if (re.search('[กขฃคฆฅ]',word[i],re.U)):
+            name1[i]='1'
+        elif (re.search('[จฉชฌซฐทฒดฎตสศษ]',word[i],re.U)):
+            name1[i]='2'
+        elif (re.search('[ฟฝพผภบป]',word[i],re.U)):
+            name1[i]='3'
+        elif (re.search('[ง]',word[i],re.U)):
+            name1[i]='4'
+        elif (re.search('[ลฬรนณฦญ]',word[i],re.U)):
+            name1[i]='5'
+        elif (re.search('[ม]',word[i],re.U)):
+            name1[i]='6'
+        elif (re.search('[ย]',word[i],re.U)):
+            name1[i]='7'
+        elif (re.search('[ว]',word[i],re.U)):
+            name1[i]='8'
         else:
-            sound[i] = "0"
-        i += 1
-
-    return "".join(sound)
-
-
-if __name__ == "__main__":
-    print(metasound("รัก"))
-    print(metasound("ลัก"))
-    print(metasound("น้อง"))
+            name1[i]='0'
+        i+=1
+    return ''.join(name1)
+if __name__ == '__main__':
+    print(MetaSound('รัก'))
+    print(MetaSound('ลัก'))

From ad1f8f9c7f64ad326dc53425187bf1145f242a2c Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Wed, 24 Oct 2018 00:01:47 +0700
Subject: [PATCH 08/12] remove unused imports

---
 pythainlp/MetaSound.py | 109 +++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 52 deletions(-)

diff --git a/pythainlp/MetaSound.py b/pythainlp/MetaSound.py
index 1027462f0..e9062c835 100644
--- a/pythainlp/MetaSound.py
+++ b/pythainlp/MetaSound.py
@@ -1,64 +1,69 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals,print_function
-from builtins import *
-'''
+"""
 MetaSound
 
-References
-
-Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analysing Names Given in Accordance with Thai Astrology. Retrieved from https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
-'''
+References:
+Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
+Ontology for Analysing Names Given in Accordance with Thai Astrology.
+https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
+"""
 import re
+
+
 def MetaSound(name):
-    '''
+    """
     Thai MetaSound
 
     :param str name: thai text
     :return: MetaSound for thai text
     **Example**::
-        >>> from pythainlp.MetaSound import MetaSound
-        >>> MetaSound('รัก')
-        '501'
-        >>> MetaSound('ลัก')
-        '501'
-    '''
-    name1=list(name)
-    count=len(name1)
-    word=[]
-    i=0
-    while i<count:
-        if (re.search(r'[ก-ฮ]',name1[i]),re.U):
+        from pythainlp.MetaSound import MetaSound
+        MetaSound('รัก')  # '501'
+        MetaSound('ลัก')  # '501'
+    """
+    name1 = list(name)
+    count = len(name1)
+    word = []
+
+    i = 0
+    while i < count:
+        if (re.search(r"[ก-ฮ]", name1[i]), re.U):
             word.append(name1[i])
-        i+=1
-    i=0
-    count=len(name1)
-    while i<count:
-        if (re.search('์',name1[i],re.U)):
-            word[i-1]=''
-            word[i]=''
-        i+=1
-    i=0
-    while i<count:
-        if (re.search('[กขฃคฆฅ]',word[i],re.U)):
-            name1[i]='1'
-        elif (re.search('[จฉชฌซฐทฒดฎตสศษ]',word[i],re.U)):
-            name1[i]='2'
-        elif (re.search('[ฟฝพผภบป]',word[i],re.U)):
-            name1[i]='3'
-        elif (re.search('[ง]',word[i],re.U)):
-            name1[i]='4'
-        elif (re.search('[ลฬรนณฦญ]',word[i],re.U)):
-            name1[i]='5'
-        elif (re.search('[ม]',word[i],re.U)):
-            name1[i]='6'
-        elif (re.search('[ย]',word[i],re.U)):
-            name1[i]='7'
-        elif (re.search('[ว]',word[i],re.U)):
-            name1[i]='8'
+        i += 1
+
+    i = 0
+    count = len(name1)
+    while i < count:
+        if re.search("์", name1[i], re.U):
+            word[i - 1] = ""
+            word[i] = ""
+        i += 1
+
+    i = 0
+    while i < count:
+        if re.search("[กขฃคฆฅ]", word[i], re.U):
+            name1[i] = "1"
+        elif re.search("[จฉชฌซฐทฒดฎตสศษ]", word[i], re.U):
+            name1[i] = "2"
+        elif re.search("[ฟฝพผภบป]", word[i], re.U):
+            name1[i] = "3"
+        elif re.search("[ง]", word[i], re.U):
+            name1[i] = "4"
+        elif re.search("[ลฬรนณฦญ]", word[i], re.U):
+            name1[i] = "5"
+        elif re.search("[ม]", word[i], re.U):
+            name1[i] = "6"
+        elif re.search("[ย]", word[i], re.U):
+            name1[i] = "7"
+        elif re.search("[ว]", word[i], re.U):
+            name1[i] = "8"
         else:
-            name1[i]='0'
-        i+=1
-    return ''.join(name1)
-if __name__ == '__main__':
-    print(MetaSound('รัก'))
-    print(MetaSound('ลัก'))
+            name1[i] = "0"
+        i += 1
+
+    return "".join(name1)
+
+
+if __name__ == "__main__":
+    print(MetaSound("รัก"))
+    print(MetaSound("ลัก"))

From 94ae5be97beba4fcb61f8d63375346896fbb3ad2 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Wed, 24 Oct 2018 00:23:45 +0700
Subject: [PATCH 09/12] Fix tokenizer selector

---
 pythainlp/tokenize/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index c46f50a29..c5e7818c4 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -81,7 +81,7 @@ def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
         ['แมว', 'ดี', 'ดี', 'แมว']
     """
     if engine == "newmm" or engine == "onecut":
-        from .pyicu import segment
+        from .newmm import mmcut as segment
     elif engine == "longest" or engine == "longest-matching":
         from .longest import segment
     elif engine == "wordcutpy":

From a646f5c67aaf8b229342d2896ef6376a74b265fe Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Wed, 24 Oct 2018 01:45:19 +0700
Subject: [PATCH 10/12] =?UTF-8?q?Fix=20metasound=20-=20follow=20algorithm?=
 =?UTF-8?q?=20as=20explained=20in=20the=20paper=20Snae=20&=20Br=C3=BCckner?=
 =?UTF-8?q?=20(2009)=20https://pdfs.semanticscholar.org/3983/963e87ddc6dfd?=
 =?UTF-8?q?bb291099aa3927a0e3e4ea6.pdf=20-=20padding=20zeros=20to=204=20ch?=
 =?UTF-8?q?aracters=20length=20(default=20number,=20as=20specified=20in=20?=
 =?UTF-8?q?the=20paper)=20-=20retain=20the=20first=20alphabet=20-=20rename?=
 =?UTF-8?q?=20MetaSound=20to=20metasound=20(small=20caps=20function=20nami?=
 =?UTF-8?q?ng=20convention)=20-=20remove=20the=20use=20of=20regex?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/pythainlp-dev-thai.md |  23 ++++++--
 pythainlp/MetaSound.py     | 117 +++++++++++++++++++++++--------------
 pythainlp/__init__.py      |   2 +-
 tests/__init__.py          |  19 +++---
 4 files changed, 102 insertions(+), 59 deletions(-)

diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
index 42d92914e..fce9ce741 100644
--- a/docs/pythainlp-dev-thai.md
+++ b/docs/pythainlp-dev-thai.md
@@ -46,6 +46,7 @@ CFLAGS=-I/usr/local/opt/icu4c/include LDFLAGS=-L/usr/local/opt/icu4c/lib pip ins
 
 ```python
 from pythainlp.tokenize import word_tokenize
+
 word_tokenize(text, engine)
 ```
 text คือ ข้อความในรูปแบบสตริง str เท่านั้น
@@ -115,6 +116,7 @@ engine คือ เครื่องมือสำหรับใช้ตั
 
 ```python
 from pythainlp.tokenize import WhitespaceTokenizer
+
 WhitespaceTokenizer("ทดสอบ ตัดคำช่องว่าง")  # ['ทดสอบ', 'ตัดคำช่องว่าง']
 ```
 
@@ -151,6 +153,7 @@ check_all สำหรับส่งคืนค่า True หรือ False
 
 ```python
 from pythainlp.tokenize import tcc
+
 tcc.tcc("ประเทศไทย")  # 'ป/ระ/เท/ศ/ไท/ย'
 ```
 
@@ -162,6 +165,7 @@ tcc.tcc("ประเทศไทย")  # 'ป/ระ/เท/ศ/ไท/ย'
 
 ```python
 from pythainlp.tokenize import etcc
+
 etcc.etcc('คืนความสุข')  # '/คืน/ความสุข'
 ```
 
@@ -171,6 +175,7 @@ Part-of-speech tagging ภาษาไทย
 
 ```python
 from pythainlp.tag import pos_tag
+
 pos_tag(text, engine="unigram", corpus="orchid")
 ```
 
@@ -204,6 +209,7 @@ engine ที่รองรับ
 
 ```python
 from pythainlp.summarize import summarize_text
+
 summarize_text(text="อาหาร หมายถึง ของแข็งหรือของเหลว ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว จะทำให้เกิดพลังงานและความร้อนยเจริญเติบโต ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ อาหารจะต้องงกาย", n=1, engine="frequency")
 # ['อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย']
 ```
@@ -258,6 +264,7 @@ lentext คือ จำนวนคำขั้นต่ำที่ต้อ
 
 ```python
 from pythainlp.romanization import romanize
+
 romanize(str, engine="royin")
 ```
 
@@ -273,7 +280,8 @@ romanize(str, engine="royin")
 
 ```python
 from pythainlp.romanization import romanize
-romanize("แมว") # 'maew'
+
+romanize("แมว")  # 'maew'
 ```
 
 ### spell
@@ -291,7 +299,8 @@ engine ที่รองรับ
 **ตัวอย่างการใช้งาน**
 
 ```python
-from pythainlp.spell import *
+from pythainlp.spell import spell
+
 a = spell("สี่เหลียม")
 print(a)  # ['สี่เหลี่ยม']
 ```
@@ -307,6 +316,7 @@ correction(word)
 
 ```python
 from pythainlp.spell.pn import correction
+
 a = correction("สี่เหลียม")
 print(a)  # ['สี่เหลี่ยม']
 ```
@@ -346,6 +356,7 @@ print(collation(["ไก่", "ไข่", "กา", "ฮา"]))  # ['กา', '
 
 ```python
 from pythainlp.date import now
+
 now()  # '30 พฤษภาคม 2560 18:45:24'
 ```
 ### rank
@@ -356,6 +367,7 @@ now()  # '30 พฤษภาคม 2560 18:45:24'
 
 ```python
 from pythainlp.rank import rank
+
 rank(list)
 ```
 
@@ -394,6 +406,7 @@ from pythainlp.change import *
 
 ```python
 from pythainlp.soundex import LK82, Udom83
+
 print(LK82("รถ"))  # ร3000
 print(LK82("รด"))  # ร3000
 print(LK82("จัน"))  # จ4000
@@ -410,8 +423,9 @@ Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statisti
 **การใช้งาน**
 
 ```python
-from pythainlp.MetaSound import *
-MetaSound("คน")  # '15'
+from pythainlp.metasound import metasound
+
+metasound("รัก")  # 'ร100'
 ```
 
 ### sentiment
@@ -420,6 +434,7 @@ sentiment analysis ภาษาไทย ใช้ข้อมูลจาก [h
 
 ```python
 from pythainlp.sentiment import sentiment
+
 sentiment(str)
 ```
 
diff --git a/pythainlp/MetaSound.py b/pythainlp/MetaSound.py
index e9062c835..fb07aabe3 100644
--- a/pythainlp/MetaSound.py
+++ b/pythainlp/MetaSound.py
@@ -1,69 +1,98 @@
 # -*- coding: utf-8 -*-
 """
-MetaSound
+MetaSound - Thai soundex system
 
 References:
 Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
 Ontology for Analysing Names Given in Accordance with Thai Astrology.
 https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
 """
-import re
 
+_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์"
+_THANTHAKHAT = "์"  # \u0e4c
+_C1 = "กขฃคฆฅ"  # sound K -> coded letter 1
+_C2 = "จฉชฌซฐทฒดฎตสศษ"  # D -> 2
+_C3 = "ฟฝพผภบป"  # B -> 3
+_C4 = "ง"  # NG -> 4
+_C5 = "ลฬรนณฦญ"  # N -> 5
+_C6 = "ม"  # M -> 6
+_C7 = "ย"  # Y -> 7
+_C8 = "ว"  # W -> 8
 
-def MetaSound(name):
+
+def metasound(text, length=4):
     """
     Thai MetaSound
 
-    :param str name: thai text
-    :return: MetaSound for thai text
+    :param str text: Thai text
+    :param int length: preferred length of the MetaSound (default is 4)
+    :return: MetaSound for the text
     **Example**::
-        from pythainlp.MetaSound import MetaSound
-        MetaSound('รัก')  # '501'
-        MetaSound('ลัก')  # '501'
+        from pythainlp.metasound import metasound
+        metasound("ลัก")  # 'ล100'
+        metasound("รัก")  # 'ร100'
+        metasound("รักษ์")  # 'ร100'
+        metasound("บูรณการ", 5)  # 'บ5515'
     """
-    name1 = list(name)
-    count = len(name1)
-    word = []
-
-    i = 0
-    while i < count:
-        if (re.search(r"[ก-ฮ]", name1[i]), re.U):
-            word.append(name1[i])
-        i += 1
+    # keep only consonants and thanthakhat
+    chars = []
+    for ch in text:
+        if ch in _CONS_THANTHAKHAT:
+            chars.append(ch)
 
+    # remove karan (thanthakhat and a consonant before it)
     i = 0
-    count = len(name1)
-    while i < count:
-        if re.search("์", name1[i], re.U):
-            word[i - 1] = ""
-            word[i] = ""
+    while i < len(chars):
+        if chars[i] == _THANTHAKHAT:
+            if i > 0:
+                chars[i - 1] = " "
+            chars[i] = " "
         i += 1
 
-    i = 0
-    while i < count:
-        if re.search("[กขฃคฆฅ]", word[i], re.U):
-            name1[i] = "1"
-        elif re.search("[จฉชฌซฐทฒดฎตสศษ]", word[i], re.U):
-            name1[i] = "2"
-        elif re.search("[ฟฝพผภบป]", word[i], re.U):
-            name1[i] = "3"
-        elif re.search("[ง]", word[i], re.U):
-            name1[i] = "4"
-        elif re.search("[ลฬรนณฦญ]", word[i], re.U):
-            name1[i] = "5"
-        elif re.search("[ม]", word[i], re.U):
-            name1[i] = "6"
-        elif re.search("[ย]", word[i], re.U):
-            name1[i] = "7"
-        elif re.search("[ว]", word[i], re.U):
-            name1[i] = "8"
+    # retain first consonant, encode the rest
+    chars = chars[:length]
+    i = 1
+    while i < len(chars):
+        if chars[i] in _C1:
+            chars[i] = "1"
+        elif chars[i] in _C2:
+            chars[i] = "2"
+        elif chars[i] in _C3:
+            chars[i] = "3"
+        elif chars[i] in _C4:
+            chars[i] = "4"
+        elif chars[i] in _C5:
+            chars[i] = "5"
+        elif chars[i] in _C6:
+            chars[i] = "6"
+        elif chars[i] in _C7:
+            chars[i] = "7"
+        elif chars[i] in _C8:
+            chars[i] = "8"
         else:
-            name1[i] = "0"
+            chars[i] = "0"
         i += 1
 
-    return "".join(name1)
+    while len(chars) < length:
+        chars.append("0")
+
+    return "".join(chars)
 
 
 if __name__ == "__main__":
-    print(MetaSound("รัก"))
-    print(MetaSound("ลัก"))
+    print(metasound("บูรณะ"))  # บ550 (an example from the original paper [Figure 4])
+    print(metasound("บูรณการ", 5))  # บ5515
+    print(metasound("ลักษณะ"))  # ล125
+    print(metasound("ลัก"))  # ล100
+    print(metasound("รัก"))  # ร100
+    print(metasound("รักษ์"))  # ร100
+    print(metasound(""))  # 0000
+
+    print(metasound("คน"))
+    print(metasound("คนA"))
+    print(metasound("ดา"))
+    print(metasound("ปา"))
+    print(metasound("งา"))
+    print(metasound("ลา"))
+    print(metasound("มา"))
+    print(metasound("วา"))
\ No newline at end of file
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
index edae7c220..2130a245e 100644
--- a/pythainlp/__init__.py
+++ b/pythainlp/__init__.py
@@ -3,7 +3,7 @@
 from pythainlp.collation import collation
 from pythainlp.date import now
 from pythainlp.keywords import find_keyword
-from pythainlp.MetaSound import MetaSound
+from pythainlp.metasound import metasound
 from pythainlp.rank import rank
 from pythainlp.romanization import romanize
 from pythainlp.sentiment import sentiment
diff --git a/tests/__init__.py b/tests/__init__.py
index cdd267ecf..87e7f5ddd 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -17,7 +17,7 @@
 )
 from pythainlp.date import now, reign_year_to_ad, now_reign_year
 from pythainlp.keywords import find_keyword
-from pythainlp.MetaSound import MetaSound
+from pythainlp.metasound import metasound
 from pythainlp.ner import ThaiNameRecognizer
 from pythainlp.number import numtowords
 from pythainlp.rank import rank
@@ -30,6 +30,7 @@
 from pythainlp.util import listtext_num2num, normalize
 from pythainlp.Text import Text
 
+
 class TestUM(unittest.TestCase):
     """
     ทดสอบการทำงาน
@@ -132,15 +133,11 @@ def test_lk82(self):
         self.assertEqual(Udom83("รถ"), "ร800000")
 
     def test_ms(self):
-        self.assertEqual(MetaSound("คน"), "15")
-        self.assertEqual(MetaSound("คนA"), "150")
-        self.assertEqual(MetaSound("ดา"), "20")
-        self.assertEqual(MetaSound("ปา"), "30")
-        self.assertEqual(MetaSound("งา"), "40")
-        self.assertEqual(MetaSound("ลา"), "50")
-        self.assertEqual(MetaSound("มา"), "60")
-        self.assertEqual(MetaSound("วา"), "80")
-        self.assertEqual(MetaSound("ลัก"), MetaSound("รัก"))
+        self.assertEqual(metasound("บูรณะ"), "บ550")
+        self.assertEqual(metasound("คน"), "ค500")
+        self.assertEqual(metasound("คนA"), "ค500")
+        self.assertEqual(metasound("ดา"), "ด000")
+        self.assertEqual(metasound("รักษ์"), metasound("รัก"))
 
     def test_wordnet(self):
         self.assertEqual(
@@ -251,8 +248,10 @@ def test_ner(self):
                 ("เช้า", "I-TIME"),
             ],
         )
+
     def test_Text(self):
         self.assertIsNotNone(Text("ทดสอบภาษาไทย"))
 
+
 if __name__ == "__main__":
     unittest.main()

From 995b0ea7d148d5f64b7b73ae24f6f46b74bc4977 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Wed, 24 Oct 2018 10:15:23 +0700
Subject: [PATCH 11/12] trying to rename MetaSound.py to metasound.py (step 1 -
 temporary)

---
 pythainlp/{MetaSound.py => metasound_.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pythainlp/{MetaSound.py => metasound_.py} (100%)

diff --git a/pythainlp/MetaSound.py b/pythainlp/metasound_.py
similarity index 100%
rename from pythainlp/MetaSound.py
rename to pythainlp/metasound_.py

From fb229b2090c5c4faad39cf0dbc69edec02bb4358 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Wed, 24 Oct 2018 10:17:08 +0700
Subject: [PATCH 12/12] rename MetaSound.py to metasound.py (step 2 - finish)

---
 pythainlp/{metasound_.py => metasound.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename pythainlp/{metasound_.py => metasound.py} (100%)

diff --git a/pythainlp/metasound_.py b/pythainlp/metasound.py
similarity index 100%
rename from pythainlp/metasound_.py
rename to pythainlp/metasound.py