From fc0bc9f7df6b3cc4baf670b7e800da02044a540d Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 21 Oct 2018 18:58:20 +0700 Subject: [PATCH 1/3] Clean code - precompile regexes - more description for each file (at the beginning) - naming convention: rename thai2rom -> ThaiTransliterator [CamelCase for Class], romanization() -> romanize() [function name should be a verb] - update examples and tests to new names - add __ prefix for private members (ThaiTransliterator) - make sure a function will always return something -- if no engine found, use default (romanize(), sentiment(), spell()) - sort imports, remove unused imports - make consistent indentation (replace tabs with spaces) --- examples/romanization.py | 4 +- pythainlp/__init__.py | 2 +- pythainlp/chunk/__init__.py | 6 +- pythainlp/collation/__init__.py | 30 +++- pythainlp/romanization/__init__.py | 50 +++--- pythainlp/romanization/pyicu.py | 30 ++-- pythainlp/romanization/royin.py | 266 +++++++++++++++-------------- pythainlp/romanization/thai2rom.py | 243 +++++++++++++++----------- pythainlp/sentiment/__init__.py | 90 +++++----- pythainlp/sentiment/ulmfit_sent.py | 125 +++++++------- pythainlp/spell/__init__.py | 26 +-- pythainlp/spell/hunspell.py | 79 +++++---- pythainlp/spell/pn.py | 126 ++++++++++++-- tests/__init__.py | 16 +- 14 files changed, 639 insertions(+), 454 deletions(-) diff --git a/examples/romanization.py b/examples/romanization.py index 38ac4840a..abbbd94fc 100644 --- a/examples/romanization.py +++ b/examples/romanization.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from pythainlp.romanization import romanization +from pythainlp.romanization import romanize -print(romanization("แมว")) +print(romanize("แมว")) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index a7841831c..3da15f7a5 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -3,7 +3,7 @@ __version__ = 1.7 from pythainlp.sentiment import sentiment from pythainlp.spell import spell -from pythainlp.romanization import romanization +from pythainlp.romanization import romanize from pythainlp.tokenize import word_tokenize,sent_tokenize,tcc,etcc from pythainlp.rank import rank from pythainlp.change import texttothai,texttoeng diff --git a/pythainlp/chunk/__init__.py b/pythainlp/chunk/__init__.py index aea1f0adb..99e5bc68b 100644 --- a/pythainlp/chunk/__init__.py +++ b/pythainlp/chunk/__init__.py @@ -1,3 +1,5 @@ # -*- coding: utf-8 -*- -#from __future__ import absolute_import,unicode_literals -# TODO \ No newline at end of file + +# from __future__ import absolute_import, unicode_literals + +# TODO: Chunking diff --git a/pythainlp/collation/__init__.py b/pythainlp/collation/__init__.py index 33687c763..2ddc8851c 100644 --- a/pythainlp/collation/__init__.py +++ b/pythainlp/collation/__init__.py @@ -1,16 +1,27 @@ # -*- coding: utf-8 -*- +""" +Thai collation (sort according to dictionary order) +For Unicode collation, please refer to Unicode Common Locale Data Repository (CLDR) +https://unicode.org/cldr/charts/latest/collation/th.html +""" from __future__ import absolute_import, unicode_literals, print_function import re +RE_TONE = re.compile(r"[็-์]") +RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])") + try: import icu - thkey = icu.Collator.createInstance(icu.Locale('th_TH')).getSortKey + + thkey = icu.Collator.createInstance(icu.Locale("th_TH")).getSortKey except ImportError: + def thkey(word): - cv = re.sub('[็-์]', '', word,re.U) # remove tone - cv = re.sub('([เ-ไ])([ก-ฮ])', '\\2\\1', cv,re.U) # switch lead vowel - tone = re.sub('[^็-์]', 
' ', word,re.U) # just tone - return cv+tone + cv = RE_TONE.sub("", word) # remove tone + cv = RE_LV_C.sub("\\2\\1", cv) # switch lead vowel + tone = RE_TONE.sub(" ", word) # just tone + return cv + tone + def collation(data): """ @@ -23,8 +34,9 @@ def collation(data): """ return sorted(data, key=thkey) + if __name__ == "__main__": - a=collation(['ไก่','ไข่','ก','ฮา'])==['ก', 'ไก่', 'ไข่', 'ฮา'] - print(a) - print(collation(['หลาย','หญิง'])==['หญิง','หลาย']) - print(collation(['ไก่', 'เป็ด', 'หมู', 'วัว'])==['ไก่', 'เป็ด', 'วัว', 'หมู']) + a = collation(["ไก่", "ไข่", "ก", "ฮา"]) == ["ก", "ไก่", "ไข่", "ฮา"] + print(a) + print(collation(["หลาย", "หญิง"]) == ["หญิง", "หลาย"]) + print(collation(["ไก่", "เป็ด", "หมู", "วัว"]) == ["ไก่", "เป็ด", "วัว", "หมู"]) diff --git a/pythainlp/romanization/__init__.py b/pythainlp/romanization/__init__.py index 34593bbd9..3279adbc5 100644 --- a/pythainlp/romanization/__init__.py +++ b/pythainlp/romanization/__init__.py @@ -1,27 +1,27 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals + +from __future__ import absolute_import, unicode_literals from pythainlp.tokenize import word_tokenize -# ถอดเสียงภาษาไทยเป็น Latin -def romanization(data,engine='royin'): - """ - :param str data: Thai text to be romanized - :param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization. - :return: English (more or less) text that spells out how the Thai text should read. - """ - word_list=word_tokenize(data) - listword=[] - i=0 - if engine=='royin': - from .royin import romanization - elif engine=='pyicu': - from .pyicu import romanization - elif engine=='thai2rom': - from pythainlp.romanization.thai2rom import thai2rom - thai=thai2rom() - return thai.romanization(data) - else: - raise Exception("error no have engine.") - while i self.max_decoder_seq_length): - self.stop_condition = True - self.target_seq = np.zeros((1, 1, self.num_decoder_tokens)) - self.target_seq[0, 0, self.sampled_token_index] = 1. - self.states_value = [self.h, self.c] - return self.decoded_sentence - def encode_input(self,name): - self.test_input = np.zeros((1, self.max_encoder_seq_length, self.num_encoder_tokens),dtype='float32') + while not self.__stop_condition: + self.__output_tokens, self.__h, self.__c = self.__decoder_model.predict( + [self.__target_seq] + self.__states_value + ) + self.__sampled_token_index = np.argmax(self.__output_tokens[0, -1, :]) + self.__sampled_char = self.__reverse_target_char_index[self.__sampled_token_index] + self.__decoded_sentence += self.__sampled_char + if ( + self.__sampled_char == "\n" + or len(self.__decoded_sentence) > self.__max_decoder_seq_length + ): + self.__stop_condition = True + self.__target_seq = np.zeros((1, 1, self.__num_decoder_tokens)) + self.__target_seq[0, 0, self.__sampled_token_index] = 1. + self.__states_value = [self.__h, self.__c] + return self.__decoded_sentence + + def __encode_input(self, name): + self.__test_input = np.zeros( + (1, self.__max_encoder_seq_length, self.__num_encoder_tokens), dtype="float32" + ) for t, char in enumerate(name): - self.test_input[0, t, self.input_token_index[char]] = 1. - return self.test_input - def romanization(self,text): - ''' + self.__test_input[0, t, self.__input_token_index[char]] = 1. 
+ return self.__test_input + + def romanize(self, text): + """ :param str text: Thai text to be romanized :return: English (more or less) text that spells out how the Thai text should read. - ''' - return self.decode_sequence(self.encode_input(text)) + """ + return self.__decode_sequence(self.__encode_input(text)) diff --git a/pythainlp/sentiment/__init__.py b/pythainlp/sentiment/__init__.py index af7995af2..a4fc92dee 100644 --- a/pythainlp/sentiment/__init__.py +++ b/pythainlp/sentiment/__init__.py @@ -1,50 +1,50 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals,print_function + +from __future__ import absolute_import, print_function, unicode_literals +import os +import dill + import pythainlp from pythainlp.corpus import stopwords -import os from pythainlp.tokenize import word_tokenize -import dill -templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'sentiment') -def sentiment(text, engine='old'): - """ - :param str text: thai text - :param str engine: sentiment analysis engine (old or ulmfit) - :return: pos or neg - - **Example**:: - >>> from pythainlp.sentiment import sentiment - >>> text="วันนี้อากาศดีจัง" - >>> sentiment(text) - 'pos' - >>> sentiment(text,'ulmfit') - 'pos' - >>> text="วันนี้อารมณ์เสียมาก" - >>> sentiment(text) - 'neg' - >>> sentiment(text,'ulmfit') - 'neg' - """ - if engine=='old': - with open(os.path.join(templates_dir, 'vocabulary.data'), 'rb') as in_strm: - vocabulary = dill.load(in_strm) - with open(os.path.join(templates_dir, 'sentiment.data'), 'rb') as in_strm: - classifier = dill.load(in_strm) - text=set(word_tokenize(text))-set(stopwords.words('thai')) - featurized_test_sentence = {i:(i in text) for i in vocabulary} - return classifier.classify(featurized_test_sentence) - elif engine=='ulmfit': - from pythainlp.sentiment import ulmfit_sent - tag=ulmfit_sent.get_sentiment(text) - sa="" - if tag==0: - sa="neg" - else: - sa="pos" - return sa - else: - raise Exception("error no have engine.") -if __name__ == '__main__': - d="เสียใจแย่มากเลย" - print(sentiment(d)) +templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "sentiment") + + +def sentiment(text, engine="old"): + """ + :param str text: thai text + :param str engine: sentiment analysis engine (old or ulmfit) + :return: pos or neg + + **Example**:: + >>> from pythainlp.sentiment import sentiment + >>> text="วันนี้อากาศดีจัง" + >>> sentiment(text) + 'pos' + >>> sentiment(text,'ulmfit') + 'pos' + >>> text="วันนี้อารมณ์เสียมาก" + >>> sentiment(text) + 'neg' + >>> sentiment(text,'ulmfit') + 'neg' + """ + if engine == "ulmfit": + from pythainlp.sentiment import ulmfit_sent + + tag = ulmfit_sent.get_sentiment(text) + return "pos" if tag else "neg" + else: # default, use "old" vocabulary-based engine + with open(os.path.join(templates_dir, "vocabulary.data"), "rb") as in_strm: + vocabulary = dill.load(in_strm) + with open(os.path.join(templates_dir, "sentiment.data"), "rb") as in_strm: + classifier = dill.load(in_strm) + text = set(word_tokenize(text)) - set(stopwords.words("thai")) + featurized_test_sentence = {i: (i in text) for i in vocabulary} + return classifier.classify(featurized_test_sentence) + + +if __name__ == "__main__": + text = "เสียใจแย่มากเลย" + print(sentiment(text)) diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py index 85bd7b790..1e56d923a 100644 --- a/pythainlp/sentiment/ulmfit_sent.py +++ b/pythainlp/sentiment/ulmfit_sent.py @@ -1,90 +1,95 @@ # -*- coding: utf-8 -*- -''' +""" +Sentiment analyzer 
based on thai2vec ("ulmfit" engine) Code by https://github.com/cstorm125/thai2vec/tree/master/notebook -''' -from __future__ import absolute_import,unicode_literals -import os +""" +from __future__ import absolute_import, unicode_literals + import sys from collections import defaultdict -#numpy and dill +from pythainlp.corpus import download, get_file +from pythainlp.tokenize import word_tokenize + try: import numpy as np import dill as pickle except ImportError: from pythainlp.tools import install_package - install_package('numpy') - install_package('dill') + + install_package("numpy") + install_package("dill") try: import numpy as np import dill as pickle except ImportError: - print("Error installing using 'pip install numpy dill'") + print("Error: Try 'pip install numpy dill'") sys.exit(0) -#import torch try: import torch + from torch import LongTensor + from torch.autograd import Variable except ImportError: - print('PyTorch required. See https://pytorch.org/.') -import torch -from torch.autograd import Variable -from torch import LongTensor + print("PyTorch required. See https://pytorch.org/.") -#import fastai for multiBatchRNN -try: - from fastai.text import * -except ImportError: - print( - """ - fastai required for multiBatchRNN. - Run 'pip install https://github.com/fastai/fastai/archive/master.zip' - """) +# try: +# from fastai.text import multiBatchRNN +# except ImportError: +# print( +# """ +# fastai required for multiBatchRNN. +# Run 'pip install https://github.com/fastai/fastai/archive/master.zip' +# """ +# ) -from pythainlp.tokenize import word_tokenize -from pythainlp.corpus import get_file -from pythainlp.corpus import download -MODEL_NAME = 'sent_model' -ITOS_NAME = 'itos_sent' +MODEL_NAME = "sent_model" +ITOS_NAME = "itos_sent" -#download pretrained model + +# download pretrained model def get_path(fname): - path = get_file(fname) - if path==None: - download(fname) - path = get_file(fname) - return(path) - -#load model -m = torch.load(get_path(MODEL_NAME)) -m.eval() -#load itos and stoi -itos = pickle.load(open(get_path(ITOS_NAME),'rb')) -stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)}) - - -#get sentiment; 1 for positive and 0 for negative -#or score if specified return_score=True -softmax = lambda x : np.exp(x)/np.sum(np.exp(x)) -def get_sentiment(ss,return_score=False): - s = word_tokenize(ss) - t = LongTensor([stoi[i] for i in s]).view(-1,1).cpu() - t = Variable(t,volatile=False) - m.reset() - pred,*_ = m(t) + path = get_file(fname) + if not path: + download(fname) + path = get_file(fname) + return path + + +# load model +model = torch.load(get_path(MODEL_NAME)) +model.eval() + +# load itos and stoi +itos = pickle.load(open(get_path(ITOS_NAME), "rb")) +stoi = defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)}) + +# get sentiment; 1 for positive and 0 for negative +# or score if specified return_score=True +softmax = lambda x: np.exp(x) / np.sum(np.exp(x)) + + +def get_sentiment(text, return_score=False): + words = word_tokenize(text) + tensor = LongTensor([stoi[word] for word in words]).view(-1, 1).cpu() + tensor = Variable(tensor, volatile=False) + model.reset() + pred, *_ = model(tensor) result = pred.data.cpu().numpy().reshape(-1) + if return_score: - return(softmax(result)) + return softmax(result) else: - return(np.argmax(result)) + return np.argmax(result) + def about(): - return ''' - Sentiment Analyzer based on thai2vec - Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge. 
+ return """ + Sentiment analyzer based on thai2vec + Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge. 89% accuracy based on 15% validation set compared to 72% of fastText and 52% most-frequent-class baseline. - - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec - ''' \ No newline at end of file + + Development: Charin Polpanumas + GitHub: https://github.com/cstorm125/thai2vec + """ diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py index df503e3d7..09fb8576a 100644 --- a/pythainlp/spell/__init__.py +++ b/pythainlp/spell/__init__.py @@ -1,15 +1,21 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals -def spell(word,engine='pn'): +""" +Spell checking +""" +from __future__ import absolute_import, unicode_literals + + +def spell(word, engine="pn"): """ - :param str word: the word to check spelling + :param str word: word to check spelling :param str engine: * pn - Peter Norvig's algorithm - * hunspell - uses hunspell's algorithm, which should already exist in linux - :return: list word + * hunspell - uses hunspell's algorithm, which should already exist in Linux + :return: list of words """ - if engine=='pn': - from .pn import spell as spell1 - elif engine=='hunspell': - from .hunspell import spell as spell1 - return spell1(word) + if engine == "hunspell": + from .hunspell import spell as _spell + else: # default, use "pn" engine + from .pn import spell as _spell + + return _spell(word) diff --git a/pythainlp/spell/hunspell.py b/pythainlp/spell/hunspell.py index c940126f5..718d7596b 100644 --- a/pythainlp/spell/hunspell.py +++ b/pythainlp/spell/hunspell.py @@ -1,40 +1,49 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,print_function,unicode_literals -from builtins import * +""" +Spell checking using hunspell +""" +from __future__ import absolute_import, print_function, unicode_literals + import subprocess import sys -def spel1(word,lang='th_TH'): - """เป็นคำสั่งตรวจคำผิดโดยใช้ hunspell - รับค่า str ส่งออกเป็น list - """ - try: - if sys.platform == 'win32': - cmd = "echo "+word+" | hunspell -d "+lang - else: - cmd = 'echo "'+word+'" | hunspell -d '+lang - getoutput = subprocess.getoutput(cmd) - del cmd - get = getoutput.split("\n") - del get[0] - if get[0] == '*': - getoutput = [] - else: - if get[1] == "": - del get[1] - get = get[0].split(":") - del get[0] - getoutput = get[0].replace(" ","") - getoutput = getoutput.split(",") - del get - return getoutput - except subprocess.CalledProcessError: - print('please install hunspell') - return None + +def spell(word, lang="th_TH"): + """เป็นคำสั่งตรวจคำผิดโดยใช้ hunspell + รับค่า str ส่งออกเป็น list + """ + try: + if sys.platform == "win32": + cmd = "echo " + word + " | hunspell -d " + lang + else: + cmd = 'echo "' + word + '" | hunspell -d ' + lang + getoutput = subprocess.getoutput(cmd) + del cmd + get = getoutput.split("\n") + del get[0] + if get[0] == "*": + getoutput = [] + else: + if get[1] == "": + del get[1] + get = get[0].split(":") + del get[0] + getoutput = get[0].replace(" ", "") + getoutput = getoutput.split(",") + del get + return getoutput + except subprocess.CalledProcessError: + print("Error: Please install hunspell.") + return None + except BaseException: + print("Errr: Other error.") + return None + + if __name__ == "__main__": - Input = spell("appoe","") - print(Input) - InputTH = spell("คลินิค","th_TH") - print(InputTH) - trueth = spell("สี่เหลียม","th_TH") - print(trueth) + 
input1 = spell("appoe", "") + print(input1) + input2 = spell("คลินิค", "th_TH") + print(input2) + input3 = spell("สี่เหลียม", "th_TH") + print(input3) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 272753dfc..4fd942aac 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -1,32 +1,122 @@ # -*- coding: utf-8 -*- """ -Fork from Peter Norvig's Python codes at http://norvig.com/spell-correct.html +Fork from Peter Norvig's Python code at http://norvig.com/spell-correct.html """ -from __future__ import absolute_import,print_function,unicode_literals -from builtins import * -from pythainlp.corpus.thaiword import get_data +from __future__ import absolute_import, print_function, unicode_literals + from collections import Counter +from pythainlp.corpus.thaiword import get_data + WORDS = Counter(get_data()) -def P(word, N=sum(WORDS.values())): - 'Probability of `word`.' - return WORDS[word] / N + + +def prob(word, n=sum(WORDS.values())): + "Probability of `word`." + return WORDS[word] / n + + def correction(word): - 'แสดงคำที่เป็นไปได้มากที่สุด' - return max(spell(word), key=P) + "แสดงคำที่เป็นไปได้มากที่สุด" + return max(spell(word), key=prob) + + def known(words): return list(w for w in words if w in WORDS) + + def edits1(word): - letters = ['ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', '\u0e3b', '\u0e3c', '\u0e3d', '\u0e3e', '฿', 'เ', 'แ', 'โ', 'ใ', 'ไ', 'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์'] - splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] - deletes = [L + R[1:] for L, R in splits if R] - transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] - replaces = [L + c + R[1:] for L, R in splits if R for c in letters] - inserts = [L + c + R for L, R in splits for c in letters] + letters = [ + "ก", + "ข", + "ฃ", + "ค", + "ฅ", + "ฆ", + "ง", + "จ", + "ฉ", + "ช", + "ซ", + "ฌ", + "ญ", + "ฎ", + "ฏ", + "ฐ", + "ฑ", + "ฒ", + "ณ", + "ด", + "ต", + "ถ", + "ท", + "ธ", + "น", + "บ", + "ป", + "ผ", + "ฝ", + "พ", + "ฟ", + "ภ", + "ม", + "ย", + "ร", + "ฤ", + "ล", + "ฦ", + "ว", + "ศ", + "ษ", + "ส", + "ห", + "ฬ", + "อ", + "ฮ", + "ฯ", + "ะ", + "ั", + "า", + "ำ", + "ิ", + "ี", + "ึ", + "ื", + "ุ", + "ู", + "ฺ", + "\u0e3b", + "\u0e3c", + "\u0e3d", + "\u0e3e", + "฿", + "เ", + "แ", + "โ", + "ใ", + "ไ", + "ๅ", + "ๆ", + "็", + "่", + "้", + "๊", + "๋", + "์", + ] + splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] + deletes = [L + R[1:] for L, R in splits if R] + transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] + replaces = [L + c + R[1:] for L, R in splits if R for c in letters] + inserts = [L + c + R for L, R in splits for c in letters] return set(deletes + transposes + replaces + inserts) + + def edits2(word): return (e2 for e1 in edits1(word) for e2 in edits1(e1)) + + def spell(word): - if word=='': - return '' + if not word: + return "" else: - return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word]) + return known([word]) or known(edits1(word)) or known(edits2(word)) or [word] diff --git a/tests/__init__.py b/tests/__init__.py index 354849d63..956a7de19 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -22,7 +22,7 @@ from pythainlp.ner import thainer from pythainlp.number import numtowords from pythainlp.rank import rank -from pythainlp.romanization import 
romanization
+from pythainlp.romanization import romanize
 from pythainlp.soundex import LK82, Udom83
 from pythainlp.spell import spell
 from pythainlp.summarize import summarize_text
@@ -139,16 +139,16 @@ def test_change(self):
 self.assertEqual(texttoeng('สวัสดีครับ'), 'l;ylfu8iy[')
 def test_romanization(self):
- self.assertEqual(romanization('แมว'), 'maeo')
- self.assertEqual(romanization('แมว', 'pyicu'), 'mæw')
+ self.assertEqual(romanize('แมว'), 'maeo')
+ self.assertEqual(romanize('แมว', 'pyicu'), 'mæw')
 def test_romanization_royin(self):
 engine = 'royin'
- self.assertEqual(romanization('แมว', engine=engine), 'maeo')
- self.assertEqual(romanization('เดือน', engine=engine), 'duean')
- self.assertEqual(romanization('ดู', engine=engine), 'du')
- self.assertEqual(romanization('ดำ', engine=engine), 'dam')
- self.assertEqual(romanization('บัว', engine=engine), 'bua')
+ self.assertEqual(romanize('แมว', engine=engine), 'maeo')
+ self.assertEqual(romanize('เดือน', engine=engine), 'duean')
+ self.assertEqual(romanize('ดู', engine=engine), 'du')
+ self.assertEqual(romanize('ดำ', engine=engine), 'dam')
+ self.assertEqual(romanize('บัว', engine=engine), 'bua')
 def test_number(self):
 self.assertEqual(

From 6f2d2560ab6c935e3ba068e7794c2b27c4a25cc8 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Sun, 21 Oct 2018 19:36:17 +0700
Subject: [PATCH 2/3] - Remove Python 2 condition check - make indentation
 consistent - make sure pos tagger will always return something (will use
 "unigram" and "pud" as default)

---
 pythainlp/romanization/__init__.py | 2 +-
 pythainlp/sentiment/__init__.py | 2 +-
 pythainlp/spell/__init__.py | 2 +-
 pythainlp/spell/hunspell.py | 4 +-
 pythainlp/tag/__init__.py | 47 +++---
 pythainlp/tag/old.py | 56 ++++---
 pythainlp/tag/perceptron.py | 54 ++++---
 pythainlp/ulmfit/utils.py | 6 +-
 pythainlp/util/__init__.py | 229 +++++++++++++++--------------
 pythainlp/word_vector/thai2vec.py | 112 ++++++++------
 10 files changed, 288 insertions(+), 226 deletions(-)

diff --git a/pythainlp/romanization/__init__.py b/pythainlp/romanization/__init__.py
index 3279adbc5..fe0f12290 100644
--- a/pythainlp/romanization/__init__.py
+++ b/pythainlp/romanization/__init__.py
@@ -8,7 +8,7 @@ def romanize(text, engine="royin"):
 """
 :param str data: Thai text to be romanized
- :param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization.
+ :param str engine: choose between 'royin' (default), 'pyicu', and 'thai2rom'. 'royin' will romanize according to the standard of the Thai Royal Institute. 'pyicu' will romanize according to the International Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization.
 :return: English (more or less) text that spells out how the Thai text should read. 
""" if engine == "pyicu": diff --git a/pythainlp/sentiment/__init__.py b/pythainlp/sentiment/__init__.py index a4fc92dee..7df87ef8e 100644 --- a/pythainlp/sentiment/__init__.py +++ b/pythainlp/sentiment/__init__.py @@ -14,7 +14,7 @@ def sentiment(text, engine="old"): """ :param str text: thai text - :param str engine: sentiment analysis engine (old or ulmfit) + :param str engine: sentiment analysis engine ("old" [default] or "ulmfit") :return: pos or neg **Example**:: diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py index 09fb8576a..876fc27ca 100644 --- a/pythainlp/spell/__init__.py +++ b/pythainlp/spell/__init__.py @@ -9,7 +9,7 @@ def spell(word, engine="pn"): """ :param str word: word to check spelling :param str engine: - * pn - Peter Norvig's algorithm + * pn - Peter Norvig's algorithm (default) * hunspell - uses hunspell's algorithm, which should already exist in Linux :return: list of words """ diff --git a/pythainlp/spell/hunspell.py b/pythainlp/spell/hunspell.py index 718d7596b..e5444e4b7 100644 --- a/pythainlp/spell/hunspell.py +++ b/pythainlp/spell/hunspell.py @@ -35,8 +35,8 @@ def spell(word, lang="th_TH"): except subprocess.CalledProcessError: print("Error: Please install hunspell.") return None - except BaseException: - print("Errr: Other error.") + except BaseException as exception: + print("Errr: Other error: {}".format(exception)) return None diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index c2c45f7c7..09447c253 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -1,13 +1,21 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,division,print_function,unicode_literals +""" +Part-Of-Speech Tagging +""" +from __future__ import absolute_import, division, print_function, unicode_literals + import sys -def pos_tag(list_text,engine='unigram',corpus='orchid'): + +ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip" + + +def pos_tag(texts, engine="unigram", corpus="orchid"): """ Part of Speech tagging function. - :param list list_text: takes in a list of tokenized words (put differently, a list of string) + :param list texts: takes in a list of tokenized words (put differently, a list of strings) :param str engine: - * unigram - unigram tagger + * unigram - unigram tagger (default) * perceptron - perceptron tagger * artagger - RDR POS tagger :param str corpus: @@ -15,29 +23,34 @@ def pos_tag(list_text,engine='unigram',corpus='orchid'): * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ - if engine=='old' or engine=='unigram': - from .old import tag - elif engine=='perceptron': + if engine == "perceptron": from .perceptron import tag - elif engine=='artagger': - def tag(text1): + elif engine == "artagger": + + def tag(text): try: from artagger import Tagger except ImportError: from pythainlp.tools import install_package - install_package('https://github.com/wannaphongcom/artagger/archive/master.zip') + + install_package(ARTAGGER_URL) try: from artagger import Tagger except ImportError: - print("Error ! 
using 'pip install https://github.com/wannaphongcom/artagger/archive/master.zip'") + print("Error: Try 'pip install " + ARTAGGER_URL + "'") sys.exit(0) - words = Tagger().tag(' '.join(text1)) - totag=[] + words = Tagger().tag(" ".join(text)) + totag = [] for word in words: totag.append((word.word, word.tag)) return totag - return tag(list_text) - return tag(list_text,corpus=corpus) -def pos_tag_sents(sentences,engine='unigram',corpus='orchid'): - return [pos_tag(i,engine=engine,corpus=corpus) for i in sentences] + return tag(texts) + else: # default, use "unigram" ("old") engine + from .old import tag + + return tag(texts, corpus=corpus) + + +def pos_tag_sents(sentences, engine="unigram", corpus="orchid"): + return [pos_tag(i, engine=engine, corpus=corpus) for i in sentences] diff --git a/pythainlp/tag/old.py b/pythainlp/tag/old.py index acaf72841..d5233fb9a 100644 --- a/pythainlp/tag/old.py +++ b/pythainlp/tag/old.py @@ -1,28 +1,40 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,division,unicode_literals +""" +Unigram Part-Of-Speech Tagger +""" +from __future__ import absolute_import, division, unicode_literals + import codecs -import os import json -import pythainlp -import nltk.tag +import os + import dill -templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus') +import nltk.tag +import pythainlp + +templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "corpus") + + def orchid_data(): - template_file = os.path.join(templates_dir, 'thaipos.json') - with codecs.open(template_file,'r',encoding='utf-8-sig') as handle: - model = json.load(handle) - return model + template_file = os.path.join(templates_dir, "thaipos.json") + with codecs.open(template_file, "r", encoding="utf-8-sig") as handle: + model = json.load(handle) + return model + + def pud_data(): - template_file = os.path.join(templates_dir, 'ud_thai-pud_unigram_tagger.dill') - with open(template_file,'rb') as handle: - model = dill.load(handle) - return model -def tag(text,corpus): - """ - รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]""" - if corpus=='orchid': - tagger = nltk.tag.UnigramTagger(model=orchid_data())# backoff=default_tagger) - return tagger.tag(text) - elif corpus=='pud': - tagger = pud_data() - return tagger.tag(text) + template_file = os.path.join(templates_dir, "ud_thai-pud_unigram_tagger.dill") + with open(template_file, "rb") as handle: + model = dill.load(handle) + return model + + +def tag(text, corpus): + """ + รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]""" + if corpus == "orchid": + tagger = nltk.tag.UnigramTagger(model=orchid_data()) # backoff=default_tagger) + return tagger.tag(text) + else: # default, use "pud" as a corpus + tagger = pud_data() + return tagger.tag(text) diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 16ce35969..a5806b6e8 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -1,27 +1,37 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,division,unicode_literals -import sys +""" +Perceptron Part-Of-Speech Tagger +""" +from __future__ import absolute_import, division, unicode_literals + import os -import pythainlp -import nltk.tag + import dill -templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus') +import pythainlp + +templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "corpus") + + def orchid_data(): - template_file = os.path.join(templates_dir, 'pt_tagger_1.dill') - with 
open(template_file,'rb') as handle: - model = dill.load(handle) - return model + template_file = os.path.join(templates_dir, "pt_tagger_1.dill") + with open(template_file, "rb") as handle: + model = dill.load(handle) + return model + + def pud_data(): - template_file = os.path.join(templates_dir, 'ud_thai-pud_pt_tagger.dill') - with open(template_file,'rb') as handle: - model = dill.load(handle) - return model -def tag(text,corpus): - """ - รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]""" - if corpus=='orchid': - tagger = orchid_data() - return tagger.tag(text) - elif corpus=='pud': - tagger = pud_data() - return tagger.tag(text) + template_file = os.path.join(templates_dir, "ud_thai-pud_pt_tagger.dill") + with open(template_file, "rb") as handle: + model = dill.load(handle) + return model + + +def tag(text, corpus): + """ + รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]""" + if corpus == "orchid": + tagger = orchid_data() + return tagger.tag(text) + else: # default, use "pud" as a corpus + tagger = pud_data() + return tagger.tag(text) diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py index dd5adaad4..242f4c229 100644 --- a/pythainlp/ulmfit/utils.py +++ b/pythainlp/ulmfit/utils.py @@ -28,7 +28,7 @@ from fastai.text import * import dill as pickle except ImportError: - print("Error installing using 'pip install fastai numpy dill'") + print("Error: Try 'pip install fastai numpy dill'") sys.exit(0) # import torch @@ -281,6 +281,6 @@ def about(): State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language. Created as part of PyThaiNLP with ULMFit implementation from fast.ai - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec + Development: Charin Polpanumas + GitHub: https://github.com/cstorm125/thai2vec """ diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 332ef853d..b8c996fd7 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -1,29 +1,40 @@ # -*- coding: utf-8 -*- +""" +Utility functions +""" import re -import six + from nltk.util import ngrams as ngramsdata -def ngrams(token,num): - ''' - ngrams สร้าง ngrams - ngrams(token,num) - - token คือ list - - num คือ จำนวน ngrams - ''' - return ngramsdata(token,int(num)) + + +def ngrams(token, num): + """ + ngrams สร้าง ngrams + ngrams(token,num) + - token คือ list + - num คือ จำนวน ngrams + """ + return ngramsdata(token, int(num)) + + def bigrams(sequence): - """ - bigrams ใน Python - bigrams(sequence) - """ - return ngrams(sequence,2) + """ + bigrams ใน Python + bigrams(sequence) + """ + return ngrams(sequence, 2) + + def trigram(token): - ''' - Trigram สร้าง trigram - trigram(token) - - token คือ list - ''' - return ngrams(token,3) -rule1=[ + """ + Trigram สร้าง trigram + trigram(token) + - token คือ list + """ + return ngrams(token, 3) + + +RULE1 = [ u"ะ", u"ั", u"็", @@ -38,7 +49,7 @@ def trigram(token): u"ใ", u"ไ", u"โ", - u"ื" + u"ื", u"่", u"้", u"๋", @@ -46,27 +57,21 @@ def trigram(token): u"ึ", u"์", u"๋", - u"ำ" -] # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา -if six.PY2: - rule2=[ - (u"เเ",u"แ"), - (u"ํ(t)า",u"\1ำ"), - (u"ํา(t)",u"\1ำ"), - (u"([่-๋])([ัิ-ื])",u"\2\1"), - (u"([่-๋])([ูุ])", u"\2\1"), - (u"ำ([่-๋])", u"\1ำ"), - (u"(์)([ัิ-ื])",u"\2\1") - ] # เก็บพวก พิมพ์ลำดับผิดหรือผิดแป้นแต่กลับแสดงผลถูกต้อง ให้ไปเป็นแป้นที่ถูกต้อง เช่น เ + เ ไปเป็น แ -else: - rule2=[ - (u"เเ",u"แ"), # เ เ -> แ - (u"ํ(t)า",u"\\1ำ"), - (u"ํา(t)",u"\\1ำ"), - 
(u"([่-๋])([ัิ-ื])",u"\\2\\1"), - (u"([่-๋])([ูุ])", u"\\2\\1"), - (u"ำ([่-๋])", u"\\1ำ"), - (u"(์)([ัิ-ื])",u"\\2\\1")] + u"ำ", +] # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา + + +RULE2 = [ + (u"เเ", u"แ"), # เ เ -> แ + (u"ํ(t)า", u"\\1ำ"), + (u"ํา(t)", u"\\1ำ"), + (u"([่-๋])([ัิ-ื])", u"\\2\\1"), + (u"([่-๋])([ูุ])", u"\\2\\1"), + (u"ำ([่-๋])", u"\\1ำ"), + (u"(์)([ัิ-ื])", u"\\2\\1"), +] # เก็บพวก พิมพ์ลำดับผิดหรือผิดแป้นแต่กลับแสดงผลถูกต้อง ให้ไปเป็นแป้นที่ถูกต้อง เช่น เ + เ ไปเป็น แ + + def normalize(text): """ จัดการกับข้อความภาษาไทยให้เป็นปกติ @@ -76,79 +81,87 @@ def normalize(text): >>> print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก กับ แปลก True """ - if six.PY2: - for data in rule2: - text=re.sub(data[0].replace(u"t",u"[่้๊๋]"),data[1],text,re.U) - else: - for data in rule2: - text=re.sub(data[0].replace("t","[่้๊๋]"),data[1],text,re.U) - for data in list(zip(rule1,rule1)): - text=re.sub(data[0].replace(u"t",u"[่้๊๋]")+"+",data[1],text,re.U) + for data in RULE2: + text = re.sub(data[0].replace("t", "[่้๊๋]"), data[1], text) + for data in list(zip(RULE1, RULE1)): + text = re.sub(data[0].replace(u"t", u"[่้๊๋]") + "+", data[1], text) return text + + def deletetone(data): - '''โค้ดส่วนตัดวรรณยุกต์ออก''' - for tone in ['่','้','๊','๋']: - if (re.search(tone,data)): - data = re.sub(tone,'',data) - if re.search(u'\w'+'์',data, re.U): - search=re.findall(u'\w'+'์',data, re.U) - for i in search: - data=re.sub(i,'',data,flags=re.U) - return data + """โค้ดส่วนตัดวรรณยุกต์ออก""" + for tone in ["่", "้", "๊", "๋"]: + if re.search(tone, data): + data = re.sub(tone, "", data) + if re.search(r"\w" + "์", data): + search = re.findall(r"\w" + "์", data) + for i in search: + data = re.sub(i, "", data) + return data + + # Notebook : https://colab.research.google.com/drive/148WNIeclf0kOU6QxKd6pcfwpSs8l-VKD#scrollTo=EuVDd0nNuI8Q # Cr. 
Korakot Chaovavanich -thaiword_nums = set('ศูนย์ หนึ่ง เอ็ด สอง ยี่ สาม สี่ ห้า หก เจ็ด แปด เก้า'.split()) -thaiword_units = set('สิบ ร้อย พัน หมื่น แสน ล้าน'.split()) +thaiword_nums = set("ศูนย์ หนึ่ง เอ็ด สอง ยี่ สาม สี่ ห้า หก เจ็ด แปด เก้า".split()) +thaiword_units = set("สิบ ร้อย พัน หมื่น แสน ล้าน".split()) thaiword_nums_units = thaiword_nums | thaiword_units thai_int_map = { - 'ศูนย์': 0, - 'หนึ่ง': 1, - 'เอ็ด': 1, - 'สอง': 2, - 'ยี่': 2, - 'สาม': 3, - 'สี่': 4, - 'ห้า': 5, - 'หก': 6, - 'เจ็ด': 7, - 'แปด': 8, - 'เก้า': 9, - 'สิบ': 10, - 'ร้อย': 100, - 'พัน': 1000, - 'หมื่น': 10000, - 'แสน': 100000, - 'ล้าน': 1000000, + "ศูนย์": 0, + "หนึ่ง": 1, + "เอ็ด": 1, + "สอง": 2, + "ยี่": 2, + "สาม": 3, + "สี่": 4, + "ห้า": 5, + "หก": 6, + "เจ็ด": 7, + "แปด": 8, + "เก้า": 9, + "สิบ": 10, + "ร้อย": 100, + "พัน": 1000, + "หมื่น": 10000, + "แสน": 100000, + "ล้าน": 1000000, } -nu_pat = re.compile('(.+)?(สิบ|ร้อย|พัน|หมื่น|แสน|ล้าน)(.+)?') # หกสิบ, ร้อยเอ็ด +nu_pat = re.compile("(.+)?(สิบ|ร้อย|พัน|หมื่น|แสน|ล้าน)(.+)?") # หกสิบ, ร้อยเอ็ด # assuming that the units are separated already + + def listtext_num2num_(tokens): - if len(tokens)==0: - return 0 - if len(tokens)==1: - return thai_int_map[tokens[0]] - if len(tokens)==2: - a, b = tokens + len_tokens = len(tokens) + + if len_tokens == 0: + return 0 + + if len_tokens == 1: + return thai_int_map[tokens[0]] + + if len_tokens == 2: + a, b = tokens + if b in thaiword_units: + return thai_int_map[a] * thai_int_map[b] + else: + return thai_int_map[a] + thai_int_map[b] + # longer case we use recursive + a, b = tokens[:2] + if a in thaiword_units and b != "ล้าน": # ร้อย แปด + return thai_int_map[a] + listtext_num2num_(tokens[1:]) + # most common case, a isa num, b isa unit if b in thaiword_units: - return thai_int_map[a]*thai_int_map[b] - else: - return thai_int_map[a]+thai_int_map[b] - # longer case we use recursive - a, b = tokens[:2] - if a in thaiword_units and b != 'ล้าน': # ร้อย แปด - return thai_int_map[a] + listtext_num2num_(tokens[1:]) - # most common case, a isa num, b isa unit - if b in thaiword_units: - return thai_int_map[a]*thai_int_map[b] + listtext_num2num_(tokens[2:]) + return thai_int_map[a] * thai_int_map[b] + listtext_num2num_(tokens[2:]) + + def listtext_num2num(tokens): - res = [] - for tok in tokens: - if tok in thaiword_nums_units: - res.append(tok) - else: - m = nu_pat.fullmatch(tok) - if m: - res.extend([t for t in m.groups() if t]) # ตัด None ทิ้ง - else: - pass # should not be here - return listtext_num2num_(res) \ No newline at end of file + res = [] + for tok in tokens: + if tok in thaiword_nums_units: + res.append(tok) + else: + m = nu_pat.fullmatch(tok) + if m: + res.extend([t for t in m.groups() if t]) # ตัด None ทิ้ง + else: + pass # should not be here + return listtext_num2num_(res) diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index 103e80b12..153183094 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -1,72 +1,86 @@ # -*- coding: utf-8 -*- -''' +""" +thai2vec - Thai word vector Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb -''' -from __future__ import absolute_import,unicode_literals -import six +""" +from __future__ import absolute_import, unicode_literals + import sys -if six.PY2: - print("Thai sentiment in pythainlp. 
Not support python 2.7") - sys.exit(0) + +from pythainlp.corpus import download as download_data +from pythainlp.corpus import get_file +from pythainlp.tokenize import word_tokenize + try: - from gensim.models import KeyedVectors - import numpy as np + from gensim.models import KeyedVectors + import numpy as np except ImportError: - from pythainlp.tools import install_package - install_package('gensim') - install_package('numpy') - try: - from gensim.models import KeyedVectors - import numpy as np - except ImportError: - print("Error ! using 'pip install gensim numpy'") - sys.exit(0) -from pythainlp.tokenize import word_tokenize -from pythainlp.corpus import get_file -from pythainlp.corpus import download as download_data -import os + from pythainlp.tools import install_package + + install_package("gensim") + install_package("numpy") + try: + from gensim.models import KeyedVectors + import numpy as np + except ImportError: + print("Error: Try 'pip install gensim numpy'") + sys.exit(0) + def download(): - path = get_file('thai2vec02') - if path==None: - download_data('thai2vec02') - path = get_file('thai2vec02') - return path + path = get_file("thai2vec02") + if not path: + download_data("thai2vec02") + path = get_file("thai2vec02") + return path + + def get_model(): - ''' - :return: Downloads the `gensim` model.''' - return KeyedVectors.load_word2vec_format(download(),binary=False) -def most_similar_cosmul(positive,negative): - ''' + """ + :return: Downloads the `gensim` model.""" + return KeyedVectors.load_word2vec_format(download(), binary=False) + + +def most_similar_cosmul(positive, negative): + """ การใช้งาน input list - ''' + """ return get_model().most_similar_cosmul(positive=positive, negative=negative) + + def doesnt_match(listdata): return get_model().doesnt_match(listdata) -def similarity(word1,word2): - ''' + + +def similarity(word1, word2): + """ :param str word1: first word :param str word2: second word :return: the cosine similarity between the two word vectors - ''' - return get_model().similarity(word1,word2) -def sentence_vectorizer(ss,dim=300,use_mean=False): - s = word_tokenize(ss) - vec = np.zeros((1,dim)) - for word in s: + """ + return get_model().similarity(word1, word2) + + +def sentence_vectorizer(text, dim=300, use_mean=False): + words = word_tokenize(text) + vec = np.zeros((1, dim)) + for word in words: if word in get_model().wv.index2word: - vec+= get_model().wv.word_vec(word) - else: pass - if use_mean: vec /= len(s) - return(vec) + vec += get_model().wv.word_vec(word) + else: + pass + if use_mean: + vec /= len(words) + return vec + def about(): - return ''' + return """ thai2vec State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language. 
Created as part of pyThaiNLP with ULMFit implementation from fast.ai - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec - ''' + Development: Charin Polpanumas + GitHub: https://github.com/cstorm125/thai2vec + """ From f45afa11f7e66aca64926a12d563eb4b84499c35 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 21 Oct 2018 19:53:40 +0700 Subject: [PATCH 3/3] generalized code --- pythainlp/tag/__init__.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 09447c253..2f9d89ef2 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -9,11 +9,11 @@ ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip" -def pos_tag(texts, engine="unigram", corpus="orchid"): +def pos_tag(words, engine="unigram", corpus="orchid"): """ Part of Speech tagging function. - :param list texts: takes in a list of tokenized words (put differently, a list of strings) + :param list words: takes in a list of tokenized words (put differently, a list of strings) :param str engine: * unigram - unigram tagger (default) * perceptron - perceptron tagger @@ -24,10 +24,10 @@ def pos_tag(texts, engine="unigram", corpus="orchid"): :return: returns a list of labels regarding which part of speech it is """ if engine == "perceptron": - from .perceptron import tag + from .perceptron import tag as _tag elif engine == "artagger": - def tag(text): + def _tag(text, corpus=None): try: from artagger import Tagger except ImportError: @@ -39,18 +39,16 @@ def tag(text): except ImportError: print("Error: Try 'pip install " + ARTAGGER_URL + "'") sys.exit(0) + words = Tagger().tag(" ".join(text)) - totag = [] - for word in words: - totag.append((word.word, word.tag)) - return totag - return tag(texts) + return [(word.word, word.tag) for word in words] + else: # default, use "unigram" ("old") engine - from .old import tag + from .old import tag as _tag - return tag(texts, corpus=corpus) + return _tag(words, corpus=corpus) def pos_tag_sents(sentences, engine="unigram", corpus="orchid"): - return [pos_tag(i, engine=engine, corpus=corpus) for i in sentences] + return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
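Taken together, the three patches leave the public API as sketched below. This usage example is not part of the patch series itself: it assumes a PyThaiNLP installation with the default corpora and models available, and the expected values in the comments are taken from the docstrings and tests shown in the diffs above.

# -*- coding: utf-8 -*-
# Usage sketch (hypothetical session) for the API after this series.
# Assumes default corpora/models are installed; expected values per
# tests/__init__.py and the sentiment() docstring above.
from pythainlp.romanization import romanize
from pythainlp.sentiment import sentiment
from pythainlp.spell import spell
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

print(romanize("แมว"))    # "maeo" -- engine defaults to "royin"
print(romanize("เดือน"))   # "duean"
print(spell("สี่เหลียม"))   # list of suggested corrections, default engine "pn"
print(sentiment("วันนี้อากาศดีจัง"))  # "pos", default engine "old"

# pos_tag() takes a list of tokenized words. After these patches, an
# unrecognized engine or corpus falls back to a default ("unigram"/"pud")
# instead of returning None.
words = word_tokenize("วันนี้อากาศดีจัง")
print(pos_tag(words))     # list of (word, tag) tuples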