diff --git a/.travis.yml b/.travis.yml index a44fee55c..8f4edb93f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,12 +3,11 @@ language: python python: - - "3.4" - - "3.5" - "3.6" # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - - pip install -r requirements-travis.txt + - pip install -r requirements.txt + - pip install .[icu,ner,pos,tokenize,transliterate] - pip install coveralls os: diff --git a/README-pypi.md b/README-pypi.md index 65d12f03b..70a8a53c2 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -1,6 +1,6 @@ ![PyThaiNLP Logo](https://avatars0.githubusercontent.com/u/32934255?s=200&v=4) -# PyThaiNLP 1.7 +# PyThaiNLP 1.8.0 [![Codacy Badge](https://api.codacy.com/project/badge/Grade/cb946260c87a4cc5905ca608704406f7)](https://www.codacy.com/app/pythainlp/pythainlp_2?utm_source=github.com&utm_medium=referral&utm_content=PyThaiNLP/pythainlp&utm_campaign=Badge_Grade)[![pypi](https://img.shields.io/pypi/v/pythainlp.svg)](https://pypi.python.org/pypi/pythainlp) [![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp.svg?branch=develop)](https://travis-ci.org/PyThaiNLP/pythainlp) @@ -14,7 +14,7 @@ PyThaiNLP features include Thai word and subword segmentations, soundex, romaniz ## What's new in version 1.7 ? -- Deprecate Python 2 support +- Deprecate Python 2 support. (Python 2 compatibility code will be completely dropped in PyThaiNLP 1.8) - Refactor pythainlp.tokenize.pyicu for readability - Add Thai NER model to pythainlp.ner - thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset diff --git a/README.md b/README.md index 339666dbd..ef71bf205 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Python 2 users can still use PyThaiNLP 1.6. 
## Capabilities - Thai word segmentation (```word_tokenize```), including subword segmentation based on Thai Character Cluster (```tcc```) and ETCC (```etcc```) -- Thai romanization (```romanize```) +- Thai romanization and transliteration (```romanize```, ```transliterate```) - Thai part-of-speech taggers (```pos_tag```) - Read out number to Thai words (```bahttext```, ```num_to_thaiword```) - Thai collation (sort by dictionoary order) (```collate```) @@ -85,7 +85,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนเพื่ ## ความสามารถ - ตัดคำภาษาไทย (```word_tokenize```) และรองรับ Thai Character Clusters (```tcc```) และ ETCC (```etcc```) -- ถอดเสียงภาษาไทยเป็นอักษรละติน (```romanize```) +- ถอดเสียงภาษาไทยเป็นอักษรละตินและสัทอักษร (```romanize```, ```transliterate```) - ระบุชนิดคำ (part-of-speech) ภาษาไทย (```pos_tag```) - อ่านตัวเลขเป็นข้อความภาษาไทย (```bahttext```, ```num_to_thaiword```) - เรียงลำดับคำตามพจนานุกรมไทย (```collate```) diff --git a/appveyor.yml b/appveyor.yml index e66f97f1f..00b4e1ae2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,11 +2,6 @@ build: off environment: matrix: - - PYTHON: "C:/Python34" - PYTHON_VERSION: "3.4" - PYTHON_ARCH: "32" - PYICU_WHEEL: "https://get.openlp.org/win-sdk/PyICU-1.9.5-cp34-cp34m-win32.whl" - - PYTHON: "C:/Python36" PYTHON_VERSION: "3.6" PYTHON_ARCH: "32" @@ -37,7 +32,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install -e ." + - "%PYTHON%/python.exe -m pip install -e .[icu,ner,pos,tokenize,transliterate]" test_script: - "%PYTHON%/python.exe -m pip --version" diff --git a/docs/api/romanization.rst b/docs/api/romanization.rst index fbae60d77..e99fdd7b2 100644 --- a/docs/api/romanization.rst +++ b/docs/api/romanization.rst @@ -1,10 +1,10 @@ .. 
currentmodule:: pythainlp.romanization -pythainlp.romanization +pythainlp.transliterate ==================================== -The :class:`pythainlp.romanization` turns thai text into a romanized one (put simply, spelled with English). +The :class:`pythainlp.transliterate` turns Thai text into a romanized one (put simply, spelled with English). -.. autofunction:: romanization -.. currentmodule:: pythainlp.romanization.thai2rom +.. autofunction:: transliterate +.. currentmodule:: pythainlp.transliterate.thai2rom .. autoclass:: thai2rom :members: romanize diff --git a/docs/conf.py b/docs/conf.py index bc1b294f1..7e1440553 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,7 @@ # The short X.Y version version = '' # The full version, including alpha/beta/rc tags -release = '1.7' +release = '1.8.0' # -- General configuration --------------------------------------------------- diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md index e81012093..503705cba 100644 --- a/docs/pythainlp-dev-thai.md +++ b/docs/pythainlp-dev-thai.md @@ -256,12 +256,13 @@ lentext คือ จำนวนคำขั้นต่ำที่ต้อ คืนค่าเป็น dict -### romanization +### transliteration ```python -from pythainlp.romanization import romanize +from pythainlp.transliterate import romanize, transliterate romanize(str, engine="royin") +transliterate(str, engine="pyicu") ``` มี engine ดังนี้ @@ -275,9 +276,10 @@ romanize(str, engine="royin") **ตัวอย่าง** ```python -from pythainlp.romanization import romanize +from pythainlp.transliterate import romanize, transliterate romanize("แมว") # 'maew' +transliterate("นก") ``` ### spell diff --git a/examples/romanization.py b/examples/romanization.py deleted file mode 100644 index abbbd94fc..000000000 --- a/examples/romanization.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.romanization import romanize - -print(romanize("แมว")) diff --git a/examples/transliterate.py b/examples/transliterate.py new file mode 100644 index 
000000000..97fb4e7f1 --- /dev/null +++ b/examples/transliterate.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +from pythainlp.transliterate import romanize, transliterate + +print(romanize("แมว")) +print(transliterate("แมว")) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 5d01626cc..5a79f230b 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -__version__ = 1.7 +__version__ = 1.8 thai_alphabets = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars thai_vowels = "ฤฦะ\u0e31าำ\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39เแโใไ\u0e45\u0e47" # 19 @@ -24,7 +24,7 @@ from pythainlp.collation import collate from pythainlp.date import now -from pythainlp.romanization import romanize +from pythainlp.transliterate import romanize, transliterate from pythainlp.sentiment import sentiment from pythainlp.soundex import soundex from pythainlp.spell import spell diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 657e8ccd5..de7c8de0f 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -40,7 +40,7 @@ def word_freq(word, domain="all"): r = requests.post(url, data=data) - pat = re.compile('TOTAL(?s).*?#ffffff">(.*?)') + pat = re.compile(r'TOTAL(?s).*?#ffffff">(.*?)') match = pat.search(r.text) n = 0 diff --git a/pythainlp/g2p/__init__.py b/pythainlp/g2p/__init__.py deleted file mode 100644 index b289d1eb4..000000000 --- a/pythainlp/g2p/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -try: - import epitran -except ImportError: - from pythainlp.tools import install_package - - install_package("epitran") - try: - import epitran - except ImportError: - raise ImportError("ImportError: Try 'pip install epitran'") -epi = epitran.Epitran('tha-Thai') -class ipa: - def __init__(self,text=""): - self.text=text - def str(self): - return epi.transliterate(self.text) - def list(self): - return epi.trans_list(self.text) - def xsampa_list(self): - return epi.xsampa_list(self.text) 
diff --git a/pythainlp/ner/__init__.py b/pythainlp/ner/__init__.py index fbd70b5b8..164f4464b 100644 --- a/pythainlp/ner/__init__.py +++ b/pythainlp/ner/__init__.py @@ -4,23 +4,12 @@ """ __all__ = ["ThaiNameRecognizer"] +import sklearn_crfsuite from pythainlp.corpus import download, get_file, thai_stopwords from pythainlp.tag import pos_tag from pythainlp.tokenize import word_tokenize from pythainlp.util import is_thaiword -try: - import sklearn_crfsuite -except ImportError: - from pythainlp.tools import install_package - - install_package("sklearn-crfsuite") - try: - import sklearn_crfsuite - except ImportError: - raise ImportError("ImportError: Try 'pip install sklearn-crfsuite'") - - _WORD_TOKENIZER = "newmm" # ตัวตัดคำ _STOPWORDS = thai_stopwords() diff --git a/pythainlp/romanization/__init__.py b/pythainlp/romanization/__init__.py deleted file mode 100644 index 16de05d1b..000000000 --- a/pythainlp/romanization/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tokenize import word_tokenize - - -# ถอดเสียงภาษาไทยเป็นอักษรละติน -def romanize(text, engine="royin"): - """ - :param str data: Thai text to be romanized - :param str engine: 'royin' (default), 'pyicu', or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'pyicu' uses Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization. - :return: English (more or less) text that spells out how the Thai text should read. 
- """ - if engine == "pyicu": - from .pyicu import romanize - elif engine == "thai2rom": - from .thai2rom import ThaiTransliterator - - thai2rom = ThaiTransliterator() - return thai2rom.romanize(text) - else: # use default engine "royin" - from .royin import romanize - - words = word_tokenize(text) - romanized_words = [romanize(word) for word in words] - - return "".join(romanized_words) diff --git a/pythainlp/romanization/pyicu.py b/pythainlp/romanization/pyicu.py deleted file mode 100644 index 732db3e24..000000000 --- a/pythainlp/romanization/pyicu.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- - -try: - import icu -except ImportError: - from pythainlp.tools import install_package - - install_package("pyicu") - try: - import icu - except ImportError: - raise ImportError("ImportError: Try 'pip install pyicu'") - - -# ถอดเสียงภาษาไทยเป็นอักษรละติน -def romanize(data): - """ถอดเสียงภาษาไทยเป็นอักษรละติน รับค่า ''str'' ข้อความ คืนค่า ''str'' อักษรละติน""" - thai2latin = icu.Transliterator.createInstance("Thai-Latin") - return thai2latin.transliterate(data) diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py index 2fd6a1d27..19ca3368f 100644 --- a/pythainlp/sentiment/ulmfit_sent.py +++ b/pythainlp/sentiment/ulmfit_sent.py @@ -5,40 +5,15 @@ """ from collections import defaultdict +import dill as pickle +import numpy as np +import torch from pythainlp.corpus import download, get_file from pythainlp.tokenize import word_tokenize +from torch import LongTensor +from torch.autograd import Variable -try: - import numpy as np - import dill as pickle -except ImportError: - from pythainlp.tools import install_package - - install_package("numpy") - install_package("dill") - try: - import numpy as np - import dill as pickle - except ImportError: - raise ImportError("ImportError: Try 'pip install numpy dill'") - -try: - import torch - from torch import LongTensor - from torch.autograd import Variable -except ImportError: - 
print("PyTorch required. See https://pytorch.org/.") - -# try: -# from fastai.text import multiBatchRNN -# except ImportError: -# print( -# """ -# fastai required for multiBatchRNN. -# Run 'pip install https://github.com/fastai/fastai/archive/master.zip' -# """ -# ) - +# from fastai.text import multiBatchRNN MODEL_NAME = "sent_model" ITOS_NAME = "itos_sent" diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 6671a99e4..d60ee950f 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -25,19 +25,7 @@ def pos_tag(words, engine="unigram", corpus="orchid"): elif engine == "artagger": def _tag(text, corpus=None): - try: - from artagger import Tagger - except ImportError: - from pythainlp.tools import install_package - - install_package(_ARTAGGER_URL) - try: - from artagger import Tagger - except ImportError: - raise ImportError( - "ImportError: Try 'pip install " + _ARTAGGER_URL + "'" - ) - + from artagger import Tagger words = Tagger().tag(" ".join(text)) return [(word.word, word.tag) for word in words] diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 20f744f25..395e76583 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -3,17 +3,7 @@ Wrapper for deepcut Thai word segmentation """ -try: - import deepcut -except ImportError: - """ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ""" - from pythainlp.tools import install_package - - install_package("deepcut") - try: - import deepcut - except ImportError: - raise ImportError("ImportError: Try 'pip install deepcut'") +import deepcut def segment(text): diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py index 9a2ffa581..aefcc9311 100644 --- a/pythainlp/tokenize/pyicu.py +++ b/pythainlp/tokenize/pyicu.py @@ -4,20 +4,11 @@ """ import re -try: - import icu -except ImportError: - from pythainlp.tools import install_package - - install_package("pyicu") - try: - import icu - except ImportError: - raise 
ImportError("ImportError: Try 'pip install pyicu'") +from icu import BreakIterator, Locale def _gen_words(text): - bd = icu.BreakIterator.createWordInstance(icu.Locale("th")) + bd = BreakIterator.createWordInstance(Locale("th")) bd.setText(text) p = bd.first() for q in bd: diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py new file mode 100644 index 000000000..48bd5cfd2 --- /dev/null +++ b/pythainlp/transliterate/__init__.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- + +from pythainlp.tokenize import word_tokenize + + +# ถอดเสียงภาษาไทยเป็นอักษรละติน +def romanize(text, engine="royin"): + """ + :param str text: Thai text to be romanized + :param str engine: 'royin' (default) or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'thai2rom' is deep learning Thai romanization (requires keras). + :return: English (more or less) text that spells out how the Thai text should read. + """ + if engine == "thai2rom": + from .thai2rom import romanize + return romanize(text) + else: # use default engine "royin" + from .royin import romanize + words = word_tokenize(text) + romanized_words = [romanize(word) for word in words] + return "".join(romanized_words) + + +def transliterate(text, engine="ipa"): + """ + :param str text: Thai text to be transliterated + :param str engine: 'ipa' (default) or 'pyicu'. + :return: A string of International Phonetic Alphabets indicating how the text should read. 
+ """ + if engine == "pyicu": + from .pyicu import transliterate + else: + from .ipa import transliterate + + return transliterate(text) diff --git a/pythainlp/transliterate/ipa.py b/pythainlp/transliterate/ipa.py new file mode 100644 index 000000000..5fe18d24d --- /dev/null +++ b/pythainlp/transliterate/ipa.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +""" +Transliterating text to International Phonetic Alphabet (IPA) +""" +import epitran + +_EPI_THA = epitran.Epitran("tha-Thai") + + +def transliterate(text): + return _EPI_THA.transliterate(text) + + +def trans_list(text): + return _EPI_THA.trans_list(text) + + +def xsampa_list(text): + return _EPI_THA.xsampa_list(text) diff --git a/pythainlp/transliterate/pyicu.py b/pythainlp/transliterate/pyicu.py new file mode 100644 index 000000000..e34be0e16 --- /dev/null +++ b/pythainlp/transliterate/pyicu.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +from icu import Transliterator + + +_ICU_THAI_TO_LATIN = Transliterator.createInstance("Thai-Latin") + + +# ถอดเสียงภาษาไทยเป็นอักษรละติน +def transliterate(text): + """ + ถอดเสียงภาษาไทยเป็นอักษรละติน รับค่า ''str'' ข้อความ คืนค่า ''str'' อักษรละติน + """ + return _ICU_THAI_TO_LATIN.transliterate(text) diff --git a/pythainlp/romanization/royin.py b/pythainlp/transliterate/royin.py similarity index 100% rename from pythainlp/romanization/royin.py rename to pythainlp/transliterate/royin.py diff --git a/pythainlp/romanization/thai2rom.py b/pythainlp/transliterate/thai2rom.py similarity index 92% rename from pythainlp/romanization/thai2rom.py rename to pythainlp/transliterate/thai2rom.py index b23ef8a4f..daaf44088 100644 --- a/pythainlp/romanization/thai2rom.py +++ b/pythainlp/transliterate/thai2rom.py @@ -2,18 +2,11 @@ """ Romanization of Thai words based on machine-learnt engine ("thai2rom") """ +import numpy as np +from keras.layers import Input +from keras.models import Model, load_model from pythainlp.corpus import download, get_file -try: - import numpy as np - from 
keras.layers import Input - from keras.models import Model, load_model -except ImportError: - from pythainlp.tools import install_package - - install_package("keras") - install_package("numpy") - class ThaiTransliterator: def __init__(self): @@ -55,12 +48,8 @@ def __init__(self): self.__target_characters = sorted(list(self.__target_characters)) self.__num_encoder_tokens = len(self.__input_characters) self.__num_decoder_tokens = len(self.__target_characters) - self.__max_encoder_seq_length = max( - [len(text) for text in self.__input_texts] - ) - self.__max_decoder_seq_length = max( - [len(text) for text in self.__target_texts] - ) + self.__max_encoder_seq_length = max([len(text) for text in self.__input_texts]) + self.__max_decoder_seq_length = max([len(text) for text in self.__target_texts]) """print('Number of samples:', len(self.input_texts)) print('Number of unique input tokens:', self.num_encoder_tokens) print('Number of unique output tokens:', self.num_decoder_tokens) @@ -134,7 +123,9 @@ def __decode_sequence(self, input_seq): [self.__target_seq] + self.__states_value ) self.__sampled_token_index = np.argmax(self.__output_tokens[0, -1, :]) - self.__sampled_char = self.__reverse_target_char_index[self.__sampled_token_index] + self.__sampled_char = self.__reverse_target_char_index[ + self.__sampled_token_index + ] self.__decoded_sentence += self.__sampled_char if ( self.__sampled_char == "\n" @@ -148,7 +139,8 @@ def __decode_sequence(self, input_seq): def __encode_input(self, name): self.__test_input = np.zeros( - (1, self.__max_encoder_seq_length, self.__num_encoder_tokens), dtype="float32" + (1, self.__max_encoder_seq_length, self.__num_encoder_tokens), + dtype="float32", ) for t, char in enumerate(name): self.__test_input[0, t, self.__input_token_index[char]] = 1. @@ -160,3 +152,10 @@ def romanize(self, text): :return: English (more or less) text that spells out how the Thai text should read. 
""" return self.__decode_sequence(self.__encode_input(text)) + + +_THAI_TO_ROM = ThaiTransliterator() + + +def romanize(text): + return _THAI_TO_ROM.romanize(text) diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py index 17e26a0a6..6e0ccce24 100644 --- a/pythainlp/ulmfit/utils.py +++ b/pythainlp/ulmfit/utils.py @@ -5,30 +5,13 @@ """ import re +import dill as pickle +import numpy as np +import torch from pythainlp.corpus import download, get_file from pythainlp.tokenize import word_tokenize -try: - import numpy as np - from fastai.text import * - import dill as pickle -except ImportError: - from pythainlp.tools import install_package - - install_package("fastai==0.7.0") - install_package("numpy") - try: - import numpy as np - from fastai.text import * - import dill as pickle - except ImportError: - raise ImportError("ImportError: Try 'pip install fastai numpy dill'") - -try: - import torch -except ImportError: - print("PyTorch required. See https://pytorch.org/.") - +from fastai.text import * MODEL_NAME = "thwiki_model2" ITOS_NAME = "itos" diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index a390eae06..e2b4b1329 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -3,24 +3,12 @@ thai2vec - Thai word vector Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb """ +import numpy as np +from gensim.models import KeyedVectors from pythainlp.corpus import download as download_data from pythainlp.corpus import get_file from pythainlp.tokenize import word_tokenize -try: - from gensim.models import KeyedVectors - import numpy as np -except ImportError: - from pythainlp.tools import install_package - - install_package("gensim") - install_package("numpy") - try: - from gensim.models import KeyedVectors - import numpy as np - except ImportError: - raise ImportError("ImportError: Try 'pip install gensim numpy'") - def download(): path = get_file("thai2vec02") 
diff --git a/requirements-travis.txt b/requirements-travis.txt deleted file mode 100644 index 6073cba5f..000000000 --- a/requirements-travis.txt +++ /dev/null @@ -1,9 +0,0 @@ -dill -langdetect -marisa_trie -nltk>=3.2.2 -pyicu==1.9.3 -pytz -requests -tinydb -tqdm diff --git a/setup.cfg b/setup.cfg index 51cd41ab0..a686f87db 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.7.0.1 +current_version = 1.8.0 commit = True tag = True diff --git a/setup.py b/setup.py index 879015359..3fa7c5c18 100644 --- a/setup.py +++ b/setup.py @@ -4,14 +4,34 @@ with open("README-pypi.md", "r", encoding="utf-8") as readme_file: readme = readme_file.read() -readme_file.close() + with open("requirements.txt", "r", encoding="utf-8") as f: requirements = f.read().splitlines() +extras = { + "icu": ["pyicu"], + "ml": ["fastai==0.7.0", "keras", "numpy", "torch"], + "ner": ["sklearn_crfsuite"], + "pos": ["artagger"], + "tokenize": ["deepcut", "pyicu"], + "transliterate": ["epitran", "pyicu"], + "full": [ + "artagger", + "deepcut", + "epitran", + "fastai==0.7.0", + "keras", + "numpy", + "pyicu", + "sklearn_crfsuite", + "torch", + ], +} + setup( name="pythainlp", - version="1.7.0.1", - description="Thai natural language processing library", + version="1.8.0", + description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", author="PyThaiNLP", @@ -40,6 +60,7 @@ }, include_package_data=True, install_requires=requirements, + extras_require=extras, license="Apache Software License 2.0", zip_safe=False, keywords="pythainlp", diff --git a/tests/__init__.py b/tests/__init__.py index d80cc21d0..ec4a492d6 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -17,7 +17,6 @@ wordnet, ) from pythainlp.date import now, now_reign_year, reign_year_to_ad -from pythainlp.g2p import ipa from pythainlp.keywords import find_keyword from pythainlp.ner import ThaiNameRecognizer from pythainlp.ner.locations 
import tag_provinces @@ -32,13 +31,14 @@ thaiword_to_num, ) from pythainlp.rank import rank -from pythainlp.romanization import romanize from pythainlp.sentiment import sentiment from pythainlp.soundex import lk82, metasound, soundex, udom83 from pythainlp.spell import correct, spell from pythainlp.summarize import summarize from pythainlp.tag import pos_tag, pos_tag_sents from pythainlp.tokenize import etcc, syllable_tokenize, tcc, word_tokenize +from pythainlp.transliterate import romanize, transliterate +from pythainlp.transliterate.ipa import trans_list, xsampa_list from pythainlp.util import ( deletetone, eng_to_thai, @@ -101,14 +101,6 @@ def test_date(self): self.assertIsNotNone(reign_year_to_ad(2, 7)) self.assertIsNotNone(now_reign_year()) - # ### pythainlp.g2p - - def test_ipa(self): - t = ipa("คน") - self.assertEqual(t.str(), "kʰon") - self.assertIsNotNone(t.list()) - self.assertIsNotNone(t.xsampa_list()) - # ### pythainlp.keywords def test_keywords(self): @@ -208,21 +200,6 @@ def test_rank(self): self.assertEqual(rank(["แมว", "คน", "แมว"]), Counter({"แมว": 2, "คน": 1})) self.assertIsNotNone(rank(["แมว", "คน", "แมว"], stopword=True)) - # ### pythainlp.romanization - - def test_romanization(self): - self.assertEqual(romanize("แมว"), "maeo") - self.assertEqual(romanize("แมว", "pyicu"), "mæw") - - def test_romanization_royin(self): - engine = "royin" - self.assertIsNotNone(romanize("กก", engine=engine)) - self.assertEqual(romanize("แมว", engine=engine), "maeo") - self.assertEqual(romanize("เดือน", engine=engine), "duean") - self.assertEqual(romanize("ดู", engine=engine), "du") - self.assertEqual(romanize("ดำ", engine=engine), "dam") - self.assertEqual(romanize("บัว", engine=engine), "bua") - # ### pythainlp.sentiment def test_sentiment(self): @@ -258,7 +235,12 @@ def test_soundex(self): def test_spell(self): self.assertIsNotNone(spell("เน้ร")) + self.assertEqual(spell(""), "") + self.assertEqual(spell(None), "") + self.assertIsNotNone(correct("ทดสอง")) + 
self.assertEqual(correct(""), "") + self.assertEqual(correct(None), "") # ### pythainlp.summarize @@ -274,26 +256,25 @@ def test_summarize(self): summarize(text=text, n=1, engine="frequency"), ["อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย"], ) + self.assertIsNotNone(summarize(text, 1, engine="XX")) # ### pythainlp.tag def test_pos_tag(self): - tokens = ["คำ"] + tokens = ["ผม", "รัก", "คุณ"] self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud")) - self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid")) - self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud")) - self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="orchid")) - self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="pud")) - self.assertEqual( pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"), [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")], ) - self.assertEqual( - str(type(pos_tag(word_tokenize("ผมรักคุณ"), engine="artagger"))), - "", - ) + + self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid")) + self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud")) + + # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="orchid")) + # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="pud")) + self.assertEqual( pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]), [ @@ -357,6 +338,24 @@ def test_tcc(self): def test_etcc(self): self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") + # ### pythainlp.transliterate + + def test_romanize(self): + self.assertEqual(romanize("แมว"), "maeo") + self.assertIsNotNone(romanize("กก", engine="royin")) + self.assertEqual(romanize("แมว", engine="royin"), "maeo") + self.assertEqual(romanize("เดือน", engine="royin"), "duean") + self.assertEqual(romanize("ดู", engine="royin"), "du") + self.assertEqual(romanize("ดำ", engine="royin"), "dam") + 
self.assertEqual(romanize("บัว", engine="royin"), "bua") + # self.assertIsNotNone(romanize("บัว", engine="thai2rom")) + + def test_transliterate(self): + self.assertEqual(transliterate("แมว", "pyicu"), "mæw") + self.assertEqual(transliterate("คน", engine="ipa"), "kʰon") + self.assertIsNotNone(trans_list("คน")) + self.assertIsNotNone(xsampa_list("คน")) + # ### pythainlp.util def test_deletetone(self): diff --git a/tox.ini b/tox.ini index 37cf268e5..e5573e5b5 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27, py34, py35, flake8 +envlist = py36 flake8 [testenv:flake8] basepython = python