From fc0bc9f7df6b3cc4baf670b7e800da02044a540d Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 21 Oct 2018 18:58:20 +0700 Subject: [PATCH 1/3] Clean code - precompile regexes - more description for each file (at the beginning) - naming convention: rename thai2rom -> ThaiTransliterator [CamelCase for Class], romanization() -> romanize() [function name should be a verb] - update examples and tests to new names - add __ prefix for private members (ThaiTransliterator) - make sure a function will always return something -- if no engine found, use default (romanize(), sentiment(), spell()) - sort imports, remove unused imports - make consistent indentation (replace tabs with spaces) --- examples/romanization.py | 4 +- pythainlp/__init__.py | 2 +- pythainlp/chunk/__init__.py | 6 +- pythainlp/collation/__init__.py | 30 +++- pythainlp/romanization/__init__.py | 50 +++--- pythainlp/romanization/pyicu.py | 30 ++-- pythainlp/romanization/royin.py | 266 +++++++++++++++-------------- pythainlp/romanization/thai2rom.py | 243 +++++++++++++++----------- pythainlp/sentiment/__init__.py | 90 +++++----- pythainlp/sentiment/ulmfit_sent.py | 125 +++++++------- pythainlp/spell/__init__.py | 26 +-- pythainlp/spell/hunspell.py | 79 +++++---- pythainlp/spell/pn.py | 126 ++++++++++++-- tests/__init__.py | 16 +- 14 files changed, 639 insertions(+), 454 deletions(-) diff --git a/examples/romanization.py b/examples/romanization.py index 38ac4840a..abbbd94fc 100644 --- a/examples/romanization.py +++ b/examples/romanization.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from pythainlp.romanization import romanization +from pythainlp.romanization import romanize -print(romanization("แมว")) +print(romanize("แมว")) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index a7841831c..3da15f7a5 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -3,7 +3,7 @@ __version__ = 1.7 from pythainlp.sentiment import sentiment from pythainlp.spell import spell -from pythainlp.romanization import romanization +from pythainlp.romanization import romanize from pythainlp.tokenize import word_tokenize,sent_tokenize,tcc,etcc from pythainlp.rank import rank from pythainlp.change import texttothai,texttoeng diff --git a/pythainlp/chunk/__init__.py b/pythainlp/chunk/__init__.py index aea1f0adb..99e5bc68b 100644 --- a/pythainlp/chunk/__init__.py +++ b/pythainlp/chunk/__init__.py @@ -1,3 +1,5 @@ # -*- coding: utf-8 -*- -#from __future__ import absolute_import,unicode_literals -# TODO \ No newline at end of file + +# from __future__ import absolute_import, unicode_literals + +# TODO: Chunking diff --git a/pythainlp/collation/__init__.py b/pythainlp/collation/__init__.py index 33687c763..2ddc8851c 100644 --- a/pythainlp/collation/__init__.py +++ b/pythainlp/collation/__init__.py @@ -1,16 +1,27 @@ # -*- coding: utf-8 -*- +""" +Thai collation (sort according to dictionary order) +For Unicode collation, please refer to Unicode Common Locale Data Repository (CLDR) +https://unicode.org/cldr/charts/latest/collation/th.html +""" from __future__ import absolute_import, unicode_literals, print_function import re +RE_TONE = re.compile(r"[็-์]") +RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])") + try: import icu - thkey = icu.Collator.createInstance(icu.Locale('th_TH')).getSortKey + + thkey = icu.Collator.createInstance(icu.Locale("th_TH")).getSortKey except ImportError: + def thkey(word): - cv = re.sub('[็-์]', '', word,re.U) # remove tone - cv = re.sub('([เ-ไ])([ก-ฮ])', '\\2\\1', cv,re.U) # switch lead vowel - tone = re.sub('[^็-์]', 
' ', word,re.U) # just tone - return cv+tone + cv = RE_TONE.sub("", word) # remove tone + cv = RE_LV_C.sub("\\2\\1", cv) # switch lead vowel + tone = RE_TONE.sub(" ", word) # just tone + return cv + tone + def collation(data): """ @@ -23,8 +34,9 @@ def collation(data): """ return sorted(data, key=thkey) + if __name__ == "__main__": - a=collation(['ไก่','ไข่','ก','ฮา'])==['ก', 'ไก่', 'ไข่', 'ฮา'] - print(a) - print(collation(['หลาย','หญิง'])==['หญิง','หลาย']) - print(collation(['ไก่', 'เป็ด', 'หมู', 'วัว'])==['ไก่', 'เป็ด', 'วัว', 'หมู']) + a = collation(["ไก่", "ไข่", "ก", "ฮา"]) == ["ก", "ไก่", "ไข่", "ฮา"] + print(a) + print(collation(["หลาย", "หญิง"]) == ["หญิง", "หลาย"]) + print(collation(["ไก่", "เป็ด", "หมู", "วัว"]) == ["ไก่", "เป็ด", "วัว", "หมู"]) diff --git a/pythainlp/romanization/__init__.py b/pythainlp/romanization/__init__.py index 34593bbd9..3279adbc5 100644 --- a/pythainlp/romanization/__init__.py +++ b/pythainlp/romanization/__init__.py @@ -1,27 +1,27 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals + +from __future__ import absolute_import, unicode_literals from pythainlp.tokenize import word_tokenize -# ถอดเสียงภาษาไทยเป็น Latin -def romanization(data,engine='royin'): - """ - :param str data: Thai text to be romanized - :param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization. - :return: English (more or less) text that spells out how the Thai text should read. - """ - word_list=word_tokenize(data) - listword=[] - i=0 - if engine=='royin': - from .royin import romanization - elif engine=='pyicu': - from .pyicu import romanization - elif engine=='thai2rom': - from pythainlp.romanization.thai2rom import thai2rom - thai=thai2rom() - return thai.romanization(data) - else: - raise Exception("error no have engine.") - while i self.max_decoder_seq_length): - self.stop_condition = True - self.target_seq = np.zeros((1, 1, self.num_decoder_tokens)) - self.target_seq[0, 0, self.sampled_token_index] = 1. - self.states_value = [self.h, self.c] - return self.decoded_sentence - def encode_input(self,name): - self.test_input = np.zeros((1, self.max_encoder_seq_length, self.num_encoder_tokens),dtype='float32') + while not self.__stop_condition: + self.__output_tokens, self.__h, self.__c = self.__decoder_model.predict( + [self.__target_seq] + self.__states_value + ) + self.__sampled_token_index = np.argmax(self.__output_tokens[0, -1, :]) + self.__sampled_char = self.__reverse_target_char_index[self.__sampled_token_index] + self.__decoded_sentence += self.__sampled_char + if ( + self.__sampled_char == "\n" + or len(self.__decoded_sentence) > self.__max_decoder_seq_length + ): + self.__stop_condition = True + self.__target_seq = np.zeros((1, 1, self.__num_decoder_tokens)) + self.__target_seq[0, 0, self.__sampled_token_index] = 1. + self.__states_value = [self.__h, self.__c] + return self.__decoded_sentence + + def __encode_input(self, name): + self.__test_input = np.zeros( + (1, self.__max_encoder_seq_length, self.__num_encoder_tokens), dtype="float32" + ) for t, char in enumerate(name): - self.test_input[0, t, self.input_token_index[char]] = 1. - return self.test_input - def romanization(self,text): - ''' + self.__test_input[0, t, self.__input_token_index[char]] = 1. 
+ return self.__test_input + + def romanize(self, text): + """ :param str text: Thai text to be romanized :return: English (more or less) text that spells out how the Thai text should read. - ''' - return self.decode_sequence(self.encode_input(text)) + """ + return self.__decode_sequence(self.__encode_input(text)) diff --git a/pythainlp/sentiment/__init__.py b/pythainlp/sentiment/__init__.py index af7995af2..a4fc92dee 100644 --- a/pythainlp/sentiment/__init__.py +++ b/pythainlp/sentiment/__init__.py @@ -1,50 +1,50 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals,print_function + +from __future__ import absolute_import, print_function, unicode_literals +import os +import dill + import pythainlp from pythainlp.corpus import stopwords -import os from pythainlp.tokenize import word_tokenize -import dill -templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'sentiment') -def sentiment(text, engine='old'): - """ - :param str text: thai text - :param str engine: sentiment analysis engine (old or ulmfit) - :return: pos or neg - - **Example**:: - >>> from pythainlp.sentiment import sentiment - >>> text="วันนี้อากาศดีจัง" - >>> sentiment(text) - 'pos' - >>> sentiment(text,'ulmfit') - 'pos' - >>> text="วันนี้อารมณ์เสียมาก" - >>> sentiment(text) - 'neg' - >>> sentiment(text,'ulmfit') - 'neg' - """ - if engine=='old': - with open(os.path.join(templates_dir, 'vocabulary.data'), 'rb') as in_strm: - vocabulary = dill.load(in_strm) - with open(os.path.join(templates_dir, 'sentiment.data'), 'rb') as in_strm: - classifier = dill.load(in_strm) - text=set(word_tokenize(text))-set(stopwords.words('thai')) - featurized_test_sentence = {i:(i in text) for i in vocabulary} - return classifier.classify(featurized_test_sentence) - elif engine=='ulmfit': - from pythainlp.sentiment import ulmfit_sent - tag=ulmfit_sent.get_sentiment(text) - sa="" - if tag==0: - sa="neg" - else: - sa="pos" - return sa - else: - raise Exception("error no have engine.") -if __name__ == '__main__': - d="เสียใจแย่มากเลย" - print(sentiment(d)) +templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "sentiment") + + +def sentiment(text, engine="old"): + """ + :param str text: thai text + :param str engine: sentiment analysis engine (old or ulmfit) + :return: pos or neg + + **Example**:: + >>> from pythainlp.sentiment import sentiment + >>> text="วันนี้อากาศดีจัง" + >>> sentiment(text) + 'pos' + >>> sentiment(text,'ulmfit') + 'pos' + >>> text="วันนี้อารมณ์เสียมาก" + >>> sentiment(text) + 'neg' + >>> sentiment(text,'ulmfit') + 'neg' + """ + if engine == "ulmfit": + from pythainlp.sentiment import ulmfit_sent + + tag = ulmfit_sent.get_sentiment(text) + return "pos" if tag else "neg" + else: # default, use "old" vocabulary-based engine + with open(os.path.join(templates_dir, "vocabulary.data"), "rb") as in_strm: + vocabulary = dill.load(in_strm) + with open(os.path.join(templates_dir, "sentiment.data"), "rb") as in_strm: + classifier = dill.load(in_strm) + text = set(word_tokenize(text)) - set(stopwords.words("thai")) + featurized_test_sentence = {i: (i in text) for i in vocabulary} + return classifier.classify(featurized_test_sentence) + + +if __name__ == "__main__": + text = "เสียใจแย่มากเลย" + print(sentiment(text)) diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py index 85bd7b790..1e56d923a 100644 --- a/pythainlp/sentiment/ulmfit_sent.py +++ b/pythainlp/sentiment/ulmfit_sent.py @@ -1,90 +1,95 @@ # -*- coding: utf-8 -*- -''' +""" +Sentiment analyzer 
based on thai2vec ("ulmfit" engine) Code by https://github.com/cstorm125/thai2vec/tree/master/notebook -''' -from __future__ import absolute_import,unicode_literals -import os +""" +from __future__ import absolute_import, unicode_literals + import sys from collections import defaultdict -#numpy and dill +from pythainlp.corpus import download, get_file +from pythainlp.tokenize import word_tokenize + try: import numpy as np import dill as pickle except ImportError: from pythainlp.tools import install_package - install_package('numpy') - install_package('dill') + + install_package("numpy") + install_package("dill") try: import numpy as np import dill as pickle except ImportError: - print("Error installing using 'pip install numpy dill'") + print("Error: Try 'pip install numpy dill'") sys.exit(0) -#import torch try: import torch + from torch import LongTensor + from torch.autograd import Variable except ImportError: - print('PyTorch required. See https://pytorch.org/.') -import torch -from torch.autograd import Variable -from torch import LongTensor + print("PyTorch required. See https://pytorch.org/.") -#import fastai for multiBatchRNN -try: - from fastai.text import * -except ImportError: - print( - """ - fastai required for multiBatchRNN. - Run 'pip install https://github.com/fastai/fastai/archive/master.zip' - """) +# try: +# from fastai.text import multiBatchRNN +# except ImportError: +# print( +# """ +# fastai required for multiBatchRNN. +# Run 'pip install https://github.com/fastai/fastai/archive/master.zip' +# """ +# ) -from pythainlp.tokenize import word_tokenize -from pythainlp.corpus import get_file -from pythainlp.corpus import download -MODEL_NAME = 'sent_model' -ITOS_NAME = 'itos_sent' +MODEL_NAME = "sent_model" +ITOS_NAME = "itos_sent" -#download pretrained model + +# download pretrained model def get_path(fname): - path = get_file(fname) - if path==None: - download(fname) - path = get_file(fname) - return(path) - -#load model -m = torch.load(get_path(MODEL_NAME)) -m.eval() -#load itos and stoi -itos = pickle.load(open(get_path(ITOS_NAME),'rb')) -stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)}) - - -#get sentiment; 1 for positive and 0 for negative -#or score if specified return_score=True -softmax = lambda x : np.exp(x)/np.sum(np.exp(x)) -def get_sentiment(ss,return_score=False): - s = word_tokenize(ss) - t = LongTensor([stoi[i] for i in s]).view(-1,1).cpu() - t = Variable(t,volatile=False) - m.reset() - pred,*_ = m(t) + path = get_file(fname) + if not path: + download(fname) + path = get_file(fname) + return path + + +# load model +model = torch.load(get_path(MODEL_NAME)) +model.eval() + +# load itos and stoi +itos = pickle.load(open(get_path(ITOS_NAME), "rb")) +stoi = defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)}) + +# get sentiment; 1 for positive and 0 for negative +# or score if specified return_score=True +softmax = lambda x: np.exp(x) / np.sum(np.exp(x)) + + +def get_sentiment(text, return_score=False): + words = word_tokenize(text) + tensor = LongTensor([stoi[word] for word in words]).view(-1, 1).cpu() + tensor = Variable(tensor, volatile=False) + model.reset() + pred, *_ = model(tensor) result = pred.data.cpu().numpy().reshape(-1) + if return_score: - return(softmax(result)) + return softmax(result) else: - return(np.argmax(result)) + return np.argmax(result) + def about(): - return ''' - Sentiment Analyzer based on thai2vec - Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge. 
+ return """ + Sentiment analyzer based on thai2vec + Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge. 89% accuracy based on 15% validation set compared to 72% of fastText and 52% most-frequent-class baseline. - - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec - ''' \ No newline at end of file + + Development: Charin Polpanumas + GitHub: https://github.com/cstorm125/thai2vec + """ diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py index df503e3d7..09fb8576a 100644 --- a/pythainlp/spell/__init__.py +++ b/pythainlp/spell/__init__.py @@ -1,15 +1,21 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals -def spell(word,engine='pn'): +""" +Spell checking +""" +from __future__ import absolute_import, unicode_literals + + +def spell(word, engine="pn"): """ - :param str word: the word to check spelling + :param str word: word to check spelling :param str engine: * pn - Peter Norvig's algorithm - * hunspell - uses hunspell's algorithm, which should already exist in linux - :return: list word + * hunspell - uses hunspell's algorithm, which should already exist in Linux + :return: list of words """ - if engine=='pn': - from .pn import spell as spell1 - elif engine=='hunspell': - from .hunspell import spell as spell1 - return spell1(word) + if engine == "hunspell": + from .hunspell import spell as _spell + else: # default, use "pn" engine + from .pn import spell as _spell + + return _spell(word) diff --git a/pythainlp/spell/hunspell.py b/pythainlp/spell/hunspell.py index c940126f5..718d7596b 100644 --- a/pythainlp/spell/hunspell.py +++ b/pythainlp/spell/hunspell.py @@ -1,40 +1,49 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,print_function,unicode_literals -from builtins import * +""" +Spell checking using hunspell +""" +from __future__ import absolute_import, print_function, unicode_literals + import subprocess import sys -def spel1(word,lang='th_TH'): - """เป็นคำสั่งตรวจคำผิดโดยใช้ hunspell - รับค่า str ส่งออกเป็น list - """ - try: - if sys.platform == 'win32': - cmd = "echo "+word+" | hunspell -d "+lang - else: - cmd = 'echo "'+word+'" | hunspell -d '+lang - getoutput = subprocess.getoutput(cmd) - del cmd - get = getoutput.split("\n") - del get[0] - if get[0] == '*': - getoutput = [] - else: - if get[1] == "": - del get[1] - get = get[0].split(":") - del get[0] - getoutput = get[0].replace(" ","") - getoutput = getoutput.split(",") - del get - return getoutput - except subprocess.CalledProcessError: - print('please install hunspell') - return None + +def spell(word, lang="th_TH"): + """เป็นคำสั่งตรวจคำผิดโดยใช้ hunspell + รับค่า str ส่งออกเป็น list + """ + try: + if sys.platform == "win32": + cmd = "echo " + word + " | hunspell -d " + lang + else: + cmd = 'echo "' + word + '" | hunspell -d ' + lang + getoutput = subprocess.getoutput(cmd) + del cmd + get = getoutput.split("\n") + del get[0] + if get[0] == "*": + getoutput = [] + else: + if get[1] == "": + del get[1] + get = get[0].split(":") + del get[0] + getoutput = get[0].replace(" ", "") + getoutput = getoutput.split(",") + del get + return getoutput + except subprocess.CalledProcessError: + print("Error: Please install hunspell.") + return None + except BaseException: + print("Errr: Other error.") + return None + + if __name__ == "__main__": - Input = spell("appoe","") - print(Input) - InputTH = spell("คลินิค","th_TH") - print(InputTH) - trueth = spell("สี่เหลียม","th_TH") - print(trueth) + 
input1 = spell("appoe", "") + print(input1) + input2 = spell("คลินิค", "th_TH") + print(input2) + input3 = spell("สี่เหลียม", "th_TH") + print(input3) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 272753dfc..4fd942aac 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -1,32 +1,122 @@ # -*- coding: utf-8 -*- """ -Fork from Peter Norvig's Python codes at http://norvig.com/spell-correct.html +Fork from Peter Norvig's Python code at http://norvig.com/spell-correct.html """ -from __future__ import absolute_import,print_function,unicode_literals -from builtins import * -from pythainlp.corpus.thaiword import get_data +from __future__ import absolute_import, print_function, unicode_literals + from collections import Counter +from pythainlp.corpus.thaiword import get_data + WORDS = Counter(get_data()) -def P(word, N=sum(WORDS.values())): - 'Probability of `word`.' - return WORDS[word] / N + + +def prob(word, n=sum(WORDS.values())): + "Probability of `word`." + return WORDS[word] / n + + def correction(word): - 'แสดงคำที่เป็นไปได้มากที่สุด' - return max(spell(word), key=P) + "แสดงคำที่เป็นไปได้มากที่สุด" + return max(spell(word), key=prob) + + def known(words): return list(w for w in words if w in WORDS) + + def edits1(word): - letters = ['ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', '\u0e3b', '\u0e3c', '\u0e3d', '\u0e3e', '฿', 'เ', 'แ', 'โ', 'ใ', 'ไ', 'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์'] - splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] - deletes = [L + R[1:] for L, R in splits if R] - transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] - replaces = [L + c + R[1:] for L, R in splits if R for c in letters] - inserts = [L + c + R for L, R in splits for c in letters] + letters = [ + "ก", + "ข", + "ฃ", + "ค", + "ฅ", + "ฆ", + "ง", + "จ", + "ฉ", + "ช", + "ซ", + "ฌ", + "ญ", + "ฎ", + "ฏ", + "ฐ", + "ฑ", + "ฒ", + "ณ", + "ด", + "ต", + "ถ", + "ท", + "ธ", + "น", + "บ", + "ป", + "ผ", + "ฝ", + "พ", + "ฟ", + "ภ", + "ม", + "ย", + "ร", + "ฤ", + "ล", + "ฦ", + "ว", + "ศ", + "ษ", + "ส", + "ห", + "ฬ", + "อ", + "ฮ", + "ฯ", + "ะ", + "ั", + "า", + "ำ", + "ิ", + "ี", + "ึ", + "ื", + "ุ", + "ู", + "ฺ", + "\u0e3b", + "\u0e3c", + "\u0e3d", + "\u0e3e", + "฿", + "เ", + "แ", + "โ", + "ใ", + "ไ", + "ๅ", + "ๆ", + "็", + "่", + "้", + "๊", + "๋", + "์", + ] + splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] + deletes = [L + R[1:] for L, R in splits if R] + transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] + replaces = [L + c + R[1:] for L, R in splits if R for c in letters] + inserts = [L + c + R for L, R in splits for c in letters] return set(deletes + transposes + replaces + inserts) + + def edits2(word): return (e2 for e1 in edits1(word) for e2 in edits1(e1)) + + def spell(word): - if word=='': - return '' + if not word: + return "" else: - return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word]) + return known([word]) or known(edits1(word)) or known(edits2(word)) or [word] diff --git a/tests/__init__.py b/tests/__init__.py index 354849d63..956a7de19 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -22,7 +22,7 @@ from pythainlp.ner import thainer from pythainlp.number import numtowords from pythainlp.rank import rank -from pythainlp.romanization import 
romanization
+from pythainlp.romanization import romanize
 from pythainlp.soundex import LK82, Udom83
 from pythainlp.spell import spell
 from pythainlp.summarize import summarize_text
@@ -139,16 +139,16 @@ def test_change(self):
 self.assertEqual(texttoeng('สวัสดีครับ'), 'l;ylfu8iy[')
 def test_romanization(self):
- self.assertEqual(romanization('แมว'), 'maeo')
- self.assertEqual(romanization('แมว', 'pyicu'), 'mæw')
+ self.assertEqual(romanize('แมว'), 'maeo')
+ self.assertEqual(romanize('แมว', 'pyicu'), 'mæw')
 def test_romanization_royin(self):
 engine = 'royin'
- self.assertEqual(romanization('แมว', engine=engine), 'maeo')
- self.assertEqual(romanization('เดือน', engine=engine), 'duean')
- self.assertEqual(romanization('ดู', engine=engine), 'du')
- self.assertEqual(romanization('ดำ', engine=engine), 'dam')
- self.assertEqual(romanization('บัว', engine=engine), 'bua')
+ self.assertEqual(romanize('แมว', engine=engine), 'maeo')
+ self.assertEqual(romanize('เดือน', engine=engine), 'duean')
+ self.assertEqual(romanize('ดู', engine=engine), 'du')
+ self.assertEqual(romanize('ดำ', engine=engine), 'dam')
+ self.assertEqual(romanize('บัว', engine=engine), 'bua')
 def test_number(self):
 self.assertEqual(

From 6f2d2560ab6c935e3ba068e7794c2b27c4a25cc8 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Sun, 21 Oct 2018 19:36:17 +0700
Subject: [PATCH 2/3] - Remove Python 2 condition check - make indentation
 consistent - make sure pos tagger will always return something (will use
 "unigram" and "pud" as default)

---
 pythainlp/romanization/__init__.py | 2 +-
 pythainlp/sentiment/__init__.py | 2 +-
 pythainlp/spell/__init__.py | 2 +-
 pythainlp/spell/hunspell.py | 4 +-
 pythainlp/tag/__init__.py | 47 +++---
 pythainlp/tag/old.py | 56 ++++---
 pythainlp/tag/perceptron.py | 54 ++++---
 pythainlp/ulmfit/utils.py | 6 +-
 pythainlp/util/__init__.py | 229 +++++++++++++++--------------
 pythainlp/word_vector/thai2vec.py | 112 ++++++++------
 10 files changed, 288 insertions(+), 226 deletions(-)

diff --git a/pythainlp/romanization/__init__.py b/pythainlp/romanization/__init__.py
index 3279adbc5..fe0f12290 100644
--- a/pythainlp/romanization/__init__.py
+++ b/pythainlp/romanization/__init__.py
@@ -8,7 +8,7 @@ def romanize(text, engine="royin"):
 """
 :param str data: Thai text to be romanized
- :param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization.
+ :param str engine: choose between 'royin' (default), 'pyicu', and 'thai2rom'. 'royin' will romanize according to the standard of the Thai Royal Institute. 'pyicu' will romanize according to the International Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization.
 :return: English (more or less) text that spells out how the Thai text should read. 
""" if engine == "pyicu": diff --git a/pythainlp/sentiment/__init__.py b/pythainlp/sentiment/__init__.py index a4fc92dee..7df87ef8e 100644 --- a/pythainlp/sentiment/__init__.py +++ b/pythainlp/sentiment/__init__.py @@ -14,7 +14,7 @@ def sentiment(text, engine="old"): """ :param str text: thai text - :param str engine: sentiment analysis engine (old or ulmfit) + :param str engine: sentiment analysis engine ("old" [default] or "ulmfit") :return: pos or neg **Example**:: diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py index 09fb8576a..876fc27ca 100644 --- a/pythainlp/spell/__init__.py +++ b/pythainlp/spell/__init__.py @@ -9,7 +9,7 @@ def spell(word, engine="pn"): """ :param str word: word to check spelling :param str engine: - * pn - Peter Norvig's algorithm + * pn - Peter Norvig's algorithm (default) * hunspell - uses hunspell's algorithm, which should already exist in Linux :return: list of words """ diff --git a/pythainlp/spell/hunspell.py b/pythainlp/spell/hunspell.py index 718d7596b..e5444e4b7 100644 --- a/pythainlp/spell/hunspell.py +++ b/pythainlp/spell/hunspell.py @@ -35,8 +35,8 @@ def spell(word, lang="th_TH"): except subprocess.CalledProcessError: print("Error: Please install hunspell.") return None - except BaseException: - print("Errr: Other error.") + except BaseException as exception: + print("Errr: Other error: {}".format(exception)) return None diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index c2c45f7c7..09447c253 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -1,13 +1,21 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,division,print_function,unicode_literals +""" +Part-Of-Speech Tagging +""" +from __future__ import absolute_import, division, print_function, unicode_literals + import sys -def pos_tag(list_text,engine='unigram',corpus='orchid'): + +ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip" + + +def pos_tag(texts, engine="unigram", corpus="orchid"): """ Part of Speech tagging function. - :param list list_text: takes in a list of tokenized words (put differently, a list of string) + :param list texts: takes in a list of tokenized words (put differently, a list of strings) :param str engine: - * unigram - unigram tagger + * unigram - unigram tagger (default) * perceptron - perceptron tagger * artagger - RDR POS tagger :param str corpus: @@ -15,29 +23,34 @@ def pos_tag(list_text,engine='unigram',corpus='orchid'): * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ - if engine=='old' or engine=='unigram': - from .old import tag - elif engine=='perceptron': + if engine == "perceptron": from .perceptron import tag - elif engine=='artagger': - def tag(text1): + elif engine == "artagger": + + def tag(text): try: from artagger import Tagger except ImportError: from pythainlp.tools import install_package - install_package('https://github.com/wannaphongcom/artagger/archive/master.zip') + + install_package(ARTAGGER_URL) try: from artagger import Tagger except ImportError: - print("Error ! 
using 'pip install https://github.com/wannaphongcom/artagger/archive/master.zip'") + print("Error: Try 'pip install " + ARTAGGER_URL + "'") sys.exit(0) - words = Tagger().tag(' '.join(text1)) - totag=[] + words = Tagger().tag(" ".join(text)) + totag = [] for word in words: totag.append((word.word, word.tag)) return totag - return tag(list_text) - return tag(list_text,corpus=corpus) -def pos_tag_sents(sentences,engine='unigram',corpus='orchid'): - return [pos_tag(i,engine=engine,corpus=corpus) for i in sentences] + return tag(texts) + else: # default, use "unigram" ("old") engine + from .old import tag + + return tag(texts, corpus=corpus) + + +def pos_tag_sents(sentences, engine="unigram", corpus="orchid"): + return [pos_tag(i, engine=engine, corpus=corpus) for i in sentences] diff --git a/pythainlp/tag/old.py b/pythainlp/tag/old.py index acaf72841..d5233fb9a 100644 --- a/pythainlp/tag/old.py +++ b/pythainlp/tag/old.py @@ -1,28 +1,40 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,division,unicode_literals +""" +Unigram Part-Of-Speech Tagger +""" +from __future__ import absolute_import, division, unicode_literals + import codecs -import os import json -import pythainlp -import nltk.tag +import os + import dill -templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus') +import nltk.tag +import pythainlp + +templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "corpus") + + def orchid_data(): - template_file = os.path.join(templates_dir, 'thaipos.json') - with codecs.open(template_file,'r',encoding='utf-8-sig') as handle: - model = json.load(handle) - return model + template_file = os.path.join(templates_dir, "thaipos.json") + with codecs.open(template_file, "r", encoding="utf-8-sig") as handle: + model = json.load(handle) + return model + + def pud_data(): - template_file = os.path.join(templates_dir, 'ud_thai-pud_unigram_tagger.dill') - with open(template_file,'rb') as handle: - model = dill.load(handle) - return model -def tag(text,corpus): - """ - รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]""" - if corpus=='orchid': - tagger = nltk.tag.UnigramTagger(model=orchid_data())# backoff=default_tagger) - return tagger.tag(text) - elif corpus=='pud': - tagger = pud_data() - return tagger.tag(text) + template_file = os.path.join(templates_dir, "ud_thai-pud_unigram_tagger.dill") + with open(template_file, "rb") as handle: + model = dill.load(handle) + return model + + +def tag(text, corpus): + """ + รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]""" + if corpus == "orchid": + tagger = nltk.tag.UnigramTagger(model=orchid_data()) # backoff=default_tagger) + return tagger.tag(text) + else: # default, use "pud" as a corpus + tagger = pud_data() + return tagger.tag(text) diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 16ce35969..a5806b6e8 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -1,27 +1,37 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,division,unicode_literals -import sys +""" +Perceptron Part-Of-Speech Tagger +""" +from __future__ import absolute_import, division, unicode_literals + import os -import pythainlp -import nltk.tag + import dill -templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus') +import pythainlp + +templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "corpus") + + def orchid_data(): - template_file = os.path.join(templates_dir, 'pt_tagger_1.dill') - with 
open(template_file,'rb') as handle: - model = dill.load(handle) - return model + template_file = os.path.join(templates_dir, "pt_tagger_1.dill") + with open(template_file, "rb") as handle: + model = dill.load(handle) + return model + + def pud_data(): - template_file = os.path.join(templates_dir, 'ud_thai-pud_pt_tagger.dill') - with open(template_file,'rb') as handle: - model = dill.load(handle) - return model -def tag(text,corpus): - """ - รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]""" - if corpus=='orchid': - tagger = orchid_data() - return tagger.tag(text) - elif corpus=='pud': - tagger = pud_data() - return tagger.tag(text) + template_file = os.path.join(templates_dir, "ud_thai-pud_pt_tagger.dill") + with open(template_file, "rb") as handle: + model = dill.load(handle) + return model + + +def tag(text, corpus): + """ + รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]""" + if corpus == "orchid": + tagger = orchid_data() + return tagger.tag(text) + else: # default, use "pud" as a corpus + tagger = pud_data() + return tagger.tag(text) diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py index dd5adaad4..242f4c229 100644 --- a/pythainlp/ulmfit/utils.py +++ b/pythainlp/ulmfit/utils.py @@ -28,7 +28,7 @@ from fastai.text import * import dill as pickle except ImportError: - print("Error installing using 'pip install fastai numpy dill'") + print("Error: Try 'pip install fastai numpy dill'") sys.exit(0) # import torch @@ -281,6 +281,6 @@ def about(): State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language. Created as part of PyThaiNLP with ULMFit implementation from fast.ai - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec + Development: Charin Polpanumas + GitHub: https://github.com/cstorm125/thai2vec """ diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 332ef853d..b8c996fd7 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -1,29 +1,40 @@ # -*- coding: utf-8 -*- +""" +Utility functions +""" import re -import six + from nltk.util import ngrams as ngramsdata -def ngrams(token,num): - ''' - ngrams สร้าง ngrams - ngrams(token,num) - - token คือ list - - num คือ จำนวน ngrams - ''' - return ngramsdata(token,int(num)) + + +def ngrams(token, num): + """ + ngrams สร้าง ngrams + ngrams(token,num) + - token คือ list + - num คือ จำนวน ngrams + """ + return ngramsdata(token, int(num)) + + def bigrams(sequence): - """ - bigrams ใน Python - bigrams(sequence) - """ - return ngrams(sequence,2) + """ + bigrams ใน Python + bigrams(sequence) + """ + return ngrams(sequence, 2) + + def trigram(token): - ''' - Trigram สร้าง trigram - trigram(token) - - token คือ list - ''' - return ngrams(token,3) -rule1=[ + """ + Trigram สร้าง trigram + trigram(token) + - token คือ list + """ + return ngrams(token, 3) + + +RULE1 = [ u"ะ", u"ั", u"็", @@ -38,7 +49,7 @@ def trigram(token): u"ใ", u"ไ", u"โ", - u"ื" + u"ื", u"่", u"้", u"๋", @@ -46,27 +57,21 @@ def trigram(token): u"ึ", u"์", u"๋", - u"ำ" -] # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา -if six.PY2: - rule2=[ - (u"เเ",u"แ"), - (u"ํ(t)า",u"\1ำ"), - (u"ํา(t)",u"\1ำ"), - (u"([่-๋])([ัิ-ื])",u"\2\1"), - (u"([่-๋])([ูุ])", u"\2\1"), - (u"ำ([่-๋])", u"\1ำ"), - (u"(์)([ัิ-ื])",u"\2\1") - ] # เก็บพวก พิมพ์ลำดับผิดหรือผิดแป้นแต่กลับแสดงผลถูกต้อง ให้ไปเป็นแป้นที่ถูกต้อง เช่น เ + เ ไปเป็น แ -else: - rule2=[ - (u"เเ",u"แ"), # เ เ -> แ - (u"ํ(t)า",u"\\1ำ"), - (u"ํา(t)",u"\\1ำ"), - 
(u"([่-๋])([ัิ-ื])",u"\\2\\1"), - (u"([่-๋])([ูุ])", u"\\2\\1"), - (u"ำ([่-๋])", u"\\1ำ"), - (u"(์)([ัิ-ื])",u"\\2\\1")] + u"ำ", +] # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา + + +RULE2 = [ + (u"เเ", u"แ"), # เ เ -> แ + (u"ํ(t)า", u"\\1ำ"), + (u"ํา(t)", u"\\1ำ"), + (u"([่-๋])([ัิ-ื])", u"\\2\\1"), + (u"([่-๋])([ูุ])", u"\\2\\1"), + (u"ำ([่-๋])", u"\\1ำ"), + (u"(์)([ัิ-ื])", u"\\2\\1"), +] # เก็บพวก พิมพ์ลำดับผิดหรือผิดแป้นแต่กลับแสดงผลถูกต้อง ให้ไปเป็นแป้นที่ถูกต้อง เช่น เ + เ ไปเป็น แ + + def normalize(text): """ จัดการกับข้อความภาษาไทยให้เป็นปกติ @@ -76,79 +81,87 @@ def normalize(text): >>> print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก กับ แปลก True """ - if six.PY2: - for data in rule2: - text=re.sub(data[0].replace(u"t",u"[่้๊๋]"),data[1],text,re.U) - else: - for data in rule2: - text=re.sub(data[0].replace("t","[่้๊๋]"),data[1],text,re.U) - for data in list(zip(rule1,rule1)): - text=re.sub(data[0].replace(u"t",u"[่้๊๋]")+"+",data[1],text,re.U) + for data in RULE2: + text = re.sub(data[0].replace("t", "[่้๊๋]"), data[1], text) + for data in list(zip(RULE1, RULE1)): + text = re.sub(data[0].replace(u"t", u"[่้๊๋]") + "+", data[1], text) return text + + def deletetone(data): - '''โค้ดส่วนตัดวรรณยุกต์ออก''' - for tone in ['่','้','๊','๋']: - if (re.search(tone,data)): - data = re.sub(tone,'',data) - if re.search(u'\w'+'์',data, re.U): - search=re.findall(u'\w'+'์',data, re.U) - for i in search: - data=re.sub(i,'',data,flags=re.U) - return data + """โค้ดส่วนตัดวรรณยุกต์ออก""" + for tone in ["่", "้", "๊", "๋"]: + if re.search(tone, data): + data = re.sub(tone, "", data) + if re.search(r"\w" + "์", data): + search = re.findall(r"\w" + "์", data) + for i in search: + data = re.sub(i, "", data) + return data + + # Notebook : https://colab.research.google.com/drive/148WNIeclf0kOU6QxKd6pcfwpSs8l-VKD#scrollTo=EuVDd0nNuI8Q # Cr. 
Korakot Chaovavanich -thaiword_nums = set('ศูนย์ หนึ่ง เอ็ด สอง ยี่ สาม สี่ ห้า หก เจ็ด แปด เก้า'.split()) -thaiword_units = set('สิบ ร้อย พัน หมื่น แสน ล้าน'.split()) +thaiword_nums = set("ศูนย์ หนึ่ง เอ็ด สอง ยี่ สาม สี่ ห้า หก เจ็ด แปด เก้า".split()) +thaiword_units = set("สิบ ร้อย พัน หมื่น แสน ล้าน".split()) thaiword_nums_units = thaiword_nums | thaiword_units thai_int_map = { - 'ศูนย์': 0, - 'หนึ่ง': 1, - 'เอ็ด': 1, - 'สอง': 2, - 'ยี่': 2, - 'สาม': 3, - 'สี่': 4, - 'ห้า': 5, - 'หก': 6, - 'เจ็ด': 7, - 'แปด': 8, - 'เก้า': 9, - 'สิบ': 10, - 'ร้อย': 100, - 'พัน': 1000, - 'หมื่น': 10000, - 'แสน': 100000, - 'ล้าน': 1000000, + "ศูนย์": 0, + "หนึ่ง": 1, + "เอ็ด": 1, + "สอง": 2, + "ยี่": 2, + "สาม": 3, + "สี่": 4, + "ห้า": 5, + "หก": 6, + "เจ็ด": 7, + "แปด": 8, + "เก้า": 9, + "สิบ": 10, + "ร้อย": 100, + "พัน": 1000, + "หมื่น": 10000, + "แสน": 100000, + "ล้าน": 1000000, } -nu_pat = re.compile('(.+)?(สิบ|ร้อย|พัน|หมื่น|แสน|ล้าน)(.+)?') # หกสิบ, ร้อยเอ็ด +nu_pat = re.compile("(.+)?(สิบ|ร้อย|พัน|หมื่น|แสน|ล้าน)(.+)?") # หกสิบ, ร้อยเอ็ด # assuming that the units are separated already + + def listtext_num2num_(tokens): - if len(tokens)==0: - return 0 - if len(tokens)==1: - return thai_int_map[tokens[0]] - if len(tokens)==2: - a, b = tokens + len_tokens = len(tokens) + + if len_tokens == 0: + return 0 + + if len_tokens == 1: + return thai_int_map[tokens[0]] + + if len_tokens == 2: + a, b = tokens + if b in thaiword_units: + return thai_int_map[a] * thai_int_map[b] + else: + return thai_int_map[a] + thai_int_map[b] + # longer case we use recursive + a, b = tokens[:2] + if a in thaiword_units and b != "ล้าน": # ร้อย แปด + return thai_int_map[a] + listtext_num2num_(tokens[1:]) + # most common case, a isa num, b isa unit if b in thaiword_units: - return thai_int_map[a]*thai_int_map[b] - else: - return thai_int_map[a]+thai_int_map[b] - # longer case we use recursive - a, b = tokens[:2] - if a in thaiword_units and b != 'ล้าน': # ร้อย แปด - return thai_int_map[a] + listtext_num2num_(tokens[1:]) - # most common case, a isa num, b isa unit - if b in thaiword_units: - return thai_int_map[a]*thai_int_map[b] + listtext_num2num_(tokens[2:]) + return thai_int_map[a] * thai_int_map[b] + listtext_num2num_(tokens[2:]) + + def listtext_num2num(tokens): - res = [] - for tok in tokens: - if tok in thaiword_nums_units: - res.append(tok) - else: - m = nu_pat.fullmatch(tok) - if m: - res.extend([t for t in m.groups() if t]) # ตัด None ทิ้ง - else: - pass # should not be here - return listtext_num2num_(res) \ No newline at end of file + res = [] + for tok in tokens: + if tok in thaiword_nums_units: + res.append(tok) + else: + m = nu_pat.fullmatch(tok) + if m: + res.extend([t for t in m.groups() if t]) # ตัด None ทิ้ง + else: + pass # should not be here + return listtext_num2num_(res) diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index 103e80b12..153183094 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -1,72 +1,86 @@ # -*- coding: utf-8 -*- -''' +""" +thai2vec - Thai word vector Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb -''' -from __future__ import absolute_import,unicode_literals -import six +""" +from __future__ import absolute_import, unicode_literals + import sys -if six.PY2: - print("Thai sentiment in pythainlp. 
Not support python 2.7") - sys.exit(0) + +from pythainlp.corpus import download as download_data +from pythainlp.corpus import get_file +from pythainlp.tokenize import word_tokenize + try: - from gensim.models import KeyedVectors - import numpy as np + from gensim.models import KeyedVectors + import numpy as np except ImportError: - from pythainlp.tools import install_package - install_package('gensim') - install_package('numpy') - try: - from gensim.models import KeyedVectors - import numpy as np - except ImportError: - print("Error ! using 'pip install gensim numpy'") - sys.exit(0) -from pythainlp.tokenize import word_tokenize -from pythainlp.corpus import get_file -from pythainlp.corpus import download as download_data -import os + from pythainlp.tools import install_package + + install_package("gensim") + install_package("numpy") + try: + from gensim.models import KeyedVectors + import numpy as np + except ImportError: + print("Error: Try 'pip install gensim numpy'") + sys.exit(0) + def download(): - path = get_file('thai2vec02') - if path==None: - download_data('thai2vec02') - path = get_file('thai2vec02') - return path + path = get_file("thai2vec02") + if not path: + download_data("thai2vec02") + path = get_file("thai2vec02") + return path + + def get_model(): - ''' - :return: Downloads the `gensim` model.''' - return KeyedVectors.load_word2vec_format(download(),binary=False) -def most_similar_cosmul(positive,negative): - ''' + """ + :return: Downloads the `gensim` model.""" + return KeyedVectors.load_word2vec_format(download(), binary=False) + + +def most_similar_cosmul(positive, negative): + """ การใช้งาน input list - ''' + """ return get_model().most_similar_cosmul(positive=positive, negative=negative) + + def doesnt_match(listdata): return get_model().doesnt_match(listdata) -def similarity(word1,word2): - ''' + + +def similarity(word1, word2): + """ :param str word1: first word :param str word2: second word :return: the cosine similarity between the two word vectors - ''' - return get_model().similarity(word1,word2) -def sentence_vectorizer(ss,dim=300,use_mean=False): - s = word_tokenize(ss) - vec = np.zeros((1,dim)) - for word in s: + """ + return get_model().similarity(word1, word2) + + +def sentence_vectorizer(text, dim=300, use_mean=False): + words = word_tokenize(text) + vec = np.zeros((1, dim)) + for word in words: if word in get_model().wv.index2word: - vec+= get_model().wv.word_vec(word) - else: pass - if use_mean: vec /= len(s) - return(vec) + vec += get_model().wv.word_vec(word) + else: + pass + if use_mean: + vec /= len(words) + return vec + def about(): - return ''' + return """ thai2vec State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language. 
Created as part of pyThaiNLP with ULMFit implementation from fast.ai - Development : Charin Polpanumas - GitHub : https://github.com/cstorm125/thai2vec - ''' + Development: Charin Polpanumas + GitHub: https://github.com/cstorm125/thai2vec + """ From f45afa11f7e66aca64926a12d563eb4b84499c35 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 21 Oct 2018 19:53:40 +0700 Subject: [PATCH 3/3] generalized code --- pythainlp/tag/__init__.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 09447c253..2f9d89ef2 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -9,11 +9,11 @@ ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip" -def pos_tag(texts, engine="unigram", corpus="orchid"): +def pos_tag(words, engine="unigram", corpus="orchid"): """ Part of Speech tagging function. - :param list texts: takes in a list of tokenized words (put differently, a list of strings) + :param list words: takes in a list of tokenized words (put differently, a list of strings) :param str engine: * unigram - unigram tagger (default) * perceptron - perceptron tagger @@ -24,10 +24,10 @@ def pos_tag(texts, engine="unigram", corpus="orchid"): :return: returns a list of labels regarding which part of speech it is """ if engine == "perceptron": - from .perceptron import tag + from .perceptron import tag as _tag elif engine == "artagger": - def tag(text): + def _tag(text, corpus=None): try: from artagger import Tagger except ImportError: @@ -39,18 +39,16 @@ def tag(text): except ImportError: print("Error: Try 'pip install " + ARTAGGER_URL + "'") sys.exit(0) + words = Tagger().tag(" ".join(text)) - totag = [] - for word in words: - totag.append((word.word, word.tag)) - return totag - return tag(texts) + return [(word.word, word.tag) for word in words] + else: # default, use "unigram" ("old") engine - from .old import tag + from .old import tag as _tag - return tag(texts, corpus=corpus) + return _tag(words, corpus=corpus) def pos_tag_sents(sentences, engine="unigram", corpus="orchid"): - return [pos_tag(i, engine=engine, corpus=corpus) for i in sentences] + return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
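Taken together, the three patches leave the public API as sketched below. This usage example is not part of the patch series itself: it assumes a PyThaiNLP installation with the default corpora and models available, and the expected values in the comments are taken from the docstrings and tests shown in the diffs above.

# -*- coding: utf-8 -*-
# Usage sketch (hypothetical session) for the API after this series.
# Assumes default corpora/models are installed; expected values per
# tests/__init__.py and the sentiment() docstring above.
from pythainlp.romanization import romanize
from pythainlp.sentiment import sentiment
from pythainlp.spell import spell
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

print(romanize("แมว"))    # "maeo" -- engine defaults to "royin"
print(romanize("เดือน"))   # "duean"
print(spell("สี่เหลียม"))   # list of suggested corrections, default engine "pn"
print(sentiment("วันนี้อากาศดีจัง"))  # "pos", default engine "old"

# pos_tag() takes a list of tokenized words. After these patches, an
# unrecognized engine or corpus falls back to a default ("unigram"/"pud")
# instead of returning None.
words = word_tokenize("วันนี้อากาศดีจัง")
print(pos_tag(words))     # list of (word, tag) tuples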