From 6717b06b105c45268c7f23b8f6c021d2384cff63 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 6 Nov 2018 23:35:06 +0700 Subject: [PATCH 01/12] - Propose to merge g2p and romanization modules to one common transliterate module - add more test cases --- README-pypi.md | 2 +- docs/pythainlp-dev-thai.md | 4 +- .../{romanization.py => transliterate.py} | 2 +- pythainlp/__init__.py | 2 +- pythainlp/g2p/__init__.py | 21 -------- .../__init__.py | 5 ++ pythainlp/transliterate/ipa.py | 31 +++++++++++ .../{romanization => transliterate}/pyicu.py | 0 .../{romanization => transliterate}/royin.py | 0 .../thai2rom.py | 0 tests/__init__.py | 53 ++++++++++--------- 11 files changed, 69 insertions(+), 51 deletions(-) rename examples/{romanization.py => transliterate.py} (55%) delete mode 100644 pythainlp/g2p/__init__.py rename pythainlp/{romanization => transliterate}/__init__.py (89%) create mode 100644 pythainlp/transliterate/ipa.py rename pythainlp/{romanization => transliterate}/pyicu.py (100%) rename pythainlp/{romanization => transliterate}/royin.py (100%) rename pythainlp/{romanization => transliterate}/thai2rom.py (100%) diff --git a/README-pypi.md b/README-pypi.md index 65d12f03b..ea200a76f 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -14,7 +14,7 @@ PyThaiNLP features include Thai word and subword segmentations, soundex, romaniz ## What's new in version 1.7 ? -- Deprecate Python 2 support +- Deprecate Python 2 support. (Python 2 compatibility code will be completely dropped in PyThaiNLP 1.8) - Refactor pythainlp.tokenize.pyicu for readability - Add Thai NER model to pythainlp.ner - thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md index e81012093..47b268457 100644 --- a/docs/pythainlp-dev-thai.md +++ b/docs/pythainlp-dev-thai.md @@ -259,7 +259,7 @@ lentext คือ จำนวนคำขั้นต่ำที่ต้อ ### romanization ```python -from pythainlp.romanization import romanize +from pythainlp.transliterate import romanize romanize(str, engine="royin") ``` @@ -275,7 +275,7 @@ romanize(str, engine="royin") **ตัวอย่าง** ```python -from pythainlp.romanization import romanize +from pythainlp.transliterate import romanize romanize("แมว") # 'maew' ``` diff --git a/examples/romanization.py b/examples/transliterate.py similarity index 55% rename from examples/romanization.py rename to examples/transliterate.py index abbbd94fc..90ec7be80 100644 --- a/examples/romanization.py +++ b/examples/transliterate.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from pythainlp.romanization import romanize +from pythainlp.transliterate import romanize print(romanize("แมว")) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 5d01626cc..12065e6bc 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -24,7 +24,7 @@ from pythainlp.collation import collate from pythainlp.date import now -from pythainlp.romanization import romanize +from pythainlp.transliterate import romanize from pythainlp.sentiment import sentiment from pythainlp.soundex import soundex from pythainlp.spell import spell diff --git a/pythainlp/g2p/__init__.py b/pythainlp/g2p/__init__.py deleted file mode 100644 index b289d1eb4..000000000 --- a/pythainlp/g2p/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -try: - import epitran -except ImportError: - from pythainlp.tools import install_package - - install_package("epitran") - try: - import epitran - except ImportError: - raise ImportError("ImportError: Try 'pip install epitran'") -epi = epitran.Epitran('tha-Thai') -class ipa: - def __init__(self,text=""): - self.text=text - def str(self): - return epi.transliterate(self.text) - def list(self): - return epi.trans_list(self.text) - def xsampa_list(self): - return epi.xsampa_list(self.text) diff --git a/pythainlp/romanization/__init__.py b/pythainlp/transliterate/__init__.py similarity index 89% rename from pythainlp/romanization/__init__.py rename to pythainlp/transliterate/__init__.py index 16de05d1b..35e07a743 100644 --- a/pythainlp/romanization/__init__.py +++ b/pythainlp/transliterate/__init__.py @@ -2,6 +2,11 @@ from pythainlp.tokenize import word_tokenize +from .ipa import IPA +from .thai2rom import ThaiTransliterator + +__all__ = ["IPA", "romanize", "ThaiTransliterator"] + # ถอดเสียงภาษาไทยเป็นอักษรละติน def romanize(text, engine="royin"): diff --git a/pythainlp/transliterate/ipa.py b/pythainlp/transliterate/ipa.py new file mode 100644 index 000000000..2b1652f14 --- /dev/null +++ b/pythainlp/transliterate/ipa.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +""" +Transliterating text to International Phonetic Alphabet (IPA) +""" + +try: + import epitran +except ImportError: + from pythainlp.tools import install_package + + install_package("epitran") + try: + import epitran + except ImportError: + raise ImportError("ImportError: Try 'pip install epitran'") + +epi = epitran.Epitran("tha-Thai") + + +class IPA: + def __init__(self, text=""): + self.text = text + + def str(self): + return epi.transliterate(self.text) + + def list(self): + return epi.trans_list(self.text) + + def xsampa_list(self): + return epi.xsampa_list(self.text) diff --git a/pythainlp/romanization/pyicu.py b/pythainlp/transliterate/pyicu.py similarity index 100% rename from pythainlp/romanization/pyicu.py rename to pythainlp/transliterate/pyicu.py diff --git a/pythainlp/romanization/royin.py b/pythainlp/transliterate/royin.py similarity index 100% rename from pythainlp/romanization/royin.py rename to pythainlp/transliterate/royin.py diff --git a/pythainlp/romanization/thai2rom.py b/pythainlp/transliterate/thai2rom.py similarity index 100% rename from pythainlp/romanization/thai2rom.py rename to pythainlp/transliterate/thai2rom.py diff --git a/tests/__init__.py b/tests/__init__.py index d80cc21d0..43983a5a4 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -17,7 +17,6 @@ wordnet, ) from pythainlp.date import now, now_reign_year, reign_year_to_ad -from pythainlp.g2p import ipa from pythainlp.keywords import find_keyword from pythainlp.ner import ThaiNameRecognizer from pythainlp.ner.locations import tag_provinces @@ -32,13 +31,13 @@ thaiword_to_num, ) from pythainlp.rank import rank -from pythainlp.romanization import romanize from pythainlp.sentiment import sentiment from pythainlp.soundex import lk82, metasound, soundex, udom83 from pythainlp.spell import correct, spell from pythainlp.summarize import summarize from pythainlp.tag import pos_tag, pos_tag_sents from pythainlp.tokenize import etcc, syllable_tokenize, tcc, word_tokenize +from pythainlp.transliterate import IPA, ThaiTransliterator, romanize from pythainlp.util import ( deletetone, eng_to_thai, @@ -101,14 +100,6 @@ def test_date(self): self.assertIsNotNone(reign_year_to_ad(2, 7)) self.assertIsNotNone(now_reign_year()) - # ### pythainlp.g2p - - def test_ipa(self): - t = ipa("คน") - self.assertEqual(t.str(), "kʰon") - self.assertIsNotNone(t.list()) - self.assertIsNotNone(t.xsampa_list()) - # ### pythainlp.keywords def test_keywords(self): @@ -208,21 +199,6 @@ def test_rank(self): self.assertEqual(rank(["แมว", "คน", "แมว"]), Counter({"แมว": 2, "คน": 1})) self.assertIsNotNone(rank(["แมว", "คน", "แมว"], stopword=True)) - # ### pythainlp.romanization - - def test_romanization(self): - self.assertEqual(romanize("แมว"), "maeo") - self.assertEqual(romanize("แมว", "pyicu"), "mæw") - - def test_romanization_royin(self): - engine = "royin" - self.assertIsNotNone(romanize("กก", engine=engine)) - self.assertEqual(romanize("แมว", engine=engine), "maeo") - self.assertEqual(romanize("เดือน", engine=engine), "duean") - self.assertEqual(romanize("ดู", engine=engine), "du") - self.assertEqual(romanize("ดำ", engine=engine), "dam") - self.assertEqual(romanize("บัว", engine=engine), "bua") - # ### pythainlp.sentiment def test_sentiment(self): @@ -258,7 +234,12 @@ def test_soundex(self): def test_spell(self): self.assertIsNotNone(spell("เน้ร")) + self.assertEqual(spell(""), "") + self.assertEqual(spell(None), "") + self.assertIsNotNone(correct("ทดสอง")) + self.assertEqual(correct(""), "") + self.assertEqual(correct(None), "") # ### pythainlp.summarize @@ -274,6 +255,7 @@ def test_summarize(self): summarize(text=text, n=1, engine="frequency"), ["อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย"], ) + self.assertIsNotNone(summarize(text, 1, engine="XX")) # ### pythainlp.tag @@ -357,6 +339,27 @@ def test_tcc(self): def test_etcc(self): self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") + # ### pythainlp.transliterate + + def test_ipa(self): + t = IPA("คน") + self.assertEqual(t.str(), "kʰon") + self.assertIsNotNone(t.list()) + self.assertIsNotNone(t.xsampa_list()) + + def test_romanize(self): + self.assertEqual(romanize("แมว"), "maeo") + self.assertEqual(romanize("แมว", "pyicu"), "mæw") + + def test_romanizeroyin(self): + engine = "royin" + self.assertIsNotNone(romanize("กก", engine=engine)) + self.assertEqual(romanize("แมว", engine=engine), "maeo") + self.assertEqual(romanize("เดือน", engine=engine), "duean") + self.assertEqual(romanize("ดู", engine=engine), "du") + self.assertEqual(romanize("ดำ", engine=engine), "dam") + self.assertEqual(romanize("บัว", engine=engine), "bua") + # ### pythainlp.util def test_deletetone(self): From 9707c6797f8403f6f6ddf3174709500ff7dbfe30 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Wed, 7 Nov 2018 00:58:41 +0700 Subject: [PATCH 02/12] fix imports --- pythainlp/tokenize/pyicu.py | 13 ++----------- pythainlp/transliterate/__init__.py | 5 ----- pythainlp/transliterate/ipa.py | 12 +----------- pythainlp/transliterate/pyicu.py | 19 ++++++------------- requirements-travis.txt | 1 + tests/__init__.py | 5 +++-- 6 files changed, 13 insertions(+), 42 deletions(-) diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py index 9a2ffa581..aefcc9311 100644 --- a/pythainlp/tokenize/pyicu.py +++ b/pythainlp/tokenize/pyicu.py @@ -4,20 +4,11 @@ """ import re -try: - import icu -except ImportError: - from pythainlp.tools import install_package - - install_package("pyicu") - try: - import icu - except ImportError: - raise ImportError("ImportError: Try 'pip install pyicu'") +from icu import BreakIterator, Locale def _gen_words(text): - bd = icu.BreakIterator.createWordInstance(icu.Locale("th")) + bd = BreakIterator.createWordInstance(Locale("th")) bd.setText(text) p = bd.first() for q in bd: diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py index 35e07a743..16de05d1b 100644 --- a/pythainlp/transliterate/__init__.py +++ b/pythainlp/transliterate/__init__.py @@ -2,11 +2,6 @@ from pythainlp.tokenize import word_tokenize -from .ipa import IPA -from .thai2rom import ThaiTransliterator - -__all__ = ["IPA", "romanize", "ThaiTransliterator"] - # ถอดเสียงภาษาไทยเป็นอักษรละติน def romanize(text, engine="royin"): diff --git a/pythainlp/transliterate/ipa.py b/pythainlp/transliterate/ipa.py index 2b1652f14..ed460e8c9 100644 --- a/pythainlp/transliterate/ipa.py +++ b/pythainlp/transliterate/ipa.py @@ -2,17 +2,7 @@ """ Transliterating text to International Phonetic Alphabet (IPA) """ - -try: - import epitran -except ImportError: - from pythainlp.tools import install_package - - install_package("epitran") - try: - import epitran - except ImportError: - raise ImportError("ImportError: Try 'pip install epitran'") +import epitran epi = epitran.Epitran("tha-Thai") diff --git a/pythainlp/transliterate/pyicu.py b/pythainlp/transliterate/pyicu.py index 732db3e24..77b74c6ad 100644 --- a/pythainlp/transliterate/pyicu.py +++ b/pythainlp/transliterate/pyicu.py @@ -1,19 +1,12 @@ # -*- coding: utf-8 -*- - -try: - import icu -except ImportError: - from pythainlp.tools import install_package - - install_package("pyicu") - try: - import icu - except ImportError: - raise ImportError("ImportError: Try 'pip install pyicu'") +from icu import Transliterator # ถอดเสียงภาษาไทยเป็นอักษรละติน def romanize(data): - """ถอดเสียงภาษาไทยเป็นอักษรละติน รับค่า ''str'' ข้อความ คืนค่า ''str'' อักษรละติน""" - thai2latin = icu.Transliterator.createInstance("Thai-Latin") + """ + ถอดเสียงภาษาไทยเป็นอักษรละติน รับค่า ''str'' ข้อความ คืนค่า ''str'' อักษรละติน + """ + thai2latin = Transliterator.createInstance("Thai-Latin") + return thai2latin.transliterate(data) diff --git a/requirements-travis.txt b/requirements-travis.txt index 6073cba5f..c0aebf0f5 100644 --- a/requirements-travis.txt +++ b/requirements-travis.txt @@ -1,4 +1,5 @@ dill +epitran langdetect marisa_trie nltk>=3.2.2 diff --git a/tests/__init__.py b/tests/__init__.py index 43983a5a4..27d675e65 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -37,7 +37,8 @@ from pythainlp.summarize import summarize from pythainlp.tag import pos_tag, pos_tag_sents from pythainlp.tokenize import etcc, syllable_tokenize, tcc, word_tokenize -from pythainlp.transliterate import IPA, ThaiTransliterator, romanize +from pythainlp.transliterate import romanize +from pythainlp.transliterate.ipa import IPA from pythainlp.util import ( deletetone, eng_to_thai, @@ -351,7 +352,7 @@ def test_romanize(self): self.assertEqual(romanize("แมว"), "maeo") self.assertEqual(romanize("แมว", "pyicu"), "mæw") - def test_romanizeroyin(self): + def test_romanize_royin(self): engine = "royin" self.assertIsNotNone(romanize("กก", engine=engine)) self.assertEqual(romanize("แมว", engine=engine), "maeo") From 529d657db768b8f7780ef48443ef6d36310e2b8a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Wed, 7 Nov 2018 01:04:09 +0700 Subject: [PATCH 03/12] install epitrain for Windows test in AppVeyor --- appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/appveyor.yml b/appveyor.yml index e66f97f1f..17ac9497a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -37,6 +37,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" + - "%PYTHON%/python.exe -m pip install epitrain" - "%PYTHON%/python.exe -m pip install -e ." test_script: From b67465a51e32f4dbee51e8bdcb3745c248a94e3b Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 08:00:59 +0700 Subject: [PATCH 04/12] - remove try imports - add extras_require to setup.py --- .travis.yml | 3 ++- appveyor.yml | 3 +-- pythainlp/ner/__init__.py | 13 +--------- pythainlp/sentiment/ulmfit_sent.py | 37 +++++------------------------ pythainlp/tag/__init__.py | 14 +---------- pythainlp/tokenize/deepcut.py | 12 +--------- pythainlp/transliterate/thai2rom.py | 13 +++------- pythainlp/ulmfit/utils.py | 25 ++++--------------- pythainlp/word_vector/thai2vec.py | 16 ++----------- requirements-travis.txt | 10 -------- setup.py | 15 +++++++++--- 11 files changed, 33 insertions(+), 128 deletions(-) delete mode 100644 requirements-travis.txt diff --git a/.travis.yml b/.travis.yml index a44fee55c..c39e96dd1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,8 @@ python: - "3.6" # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - - pip install -r requirements-travis.txt + - pip install -r requirements.txt + - pip install .[icu,ml,pos,tokenize,transliterate] - pip install coveralls os: diff --git a/appveyor.yml b/appveyor.yml index 17ac9497a..17d3256a7 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -37,8 +37,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install epitrain" - - "%PYTHON%/python.exe -m pip install -e ." + - "%PYTHON%/python.exe -m pip install -e .[icu,ml,pos,tokenize,transliterate]" test_script: - "%PYTHON%/python.exe -m pip --version" diff --git a/pythainlp/ner/__init__.py b/pythainlp/ner/__init__.py index fbd70b5b8..164f4464b 100644 --- a/pythainlp/ner/__init__.py +++ b/pythainlp/ner/__init__.py @@ -4,23 +4,12 @@ """ __all__ = ["ThaiNameRecognizer"] +import sklearn_crfsuite from pythainlp.corpus import download, get_file, thai_stopwords from pythainlp.tag import pos_tag from pythainlp.tokenize import word_tokenize from pythainlp.util import is_thaiword -try: - import sklearn_crfsuite -except ImportError: - from pythainlp.tools import install_package - - install_package("sklearn-crfsuite") - try: - import sklearn_crfsuite - except ImportError: - raise ImportError("ImportError: Try 'pip install sklearn-crfsuite'") - - _WORD_TOKENIZER = "newmm" # ตัวตัดคำ _STOPWORDS = thai_stopwords() diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py index 2fd6a1d27..19ca3368f 100644 --- a/pythainlp/sentiment/ulmfit_sent.py +++ b/pythainlp/sentiment/ulmfit_sent.py @@ -5,40 +5,15 @@ """ from collections import defaultdict +import dill as pickle +import numpy as np +import torch from pythainlp.corpus import download, get_file from pythainlp.tokenize import word_tokenize +from torch import LongTensor +from torch.autograd import Variable -try: - import numpy as np - import dill as pickle -except ImportError: - from pythainlp.tools import install_package - - install_package("numpy") - install_package("dill") - try: - import numpy as np - import dill as pickle - except ImportError: - raise ImportError("ImportError: Try 'pip install numpy dill'") - -try: - import torch - from torch import LongTensor - from torch.autograd import Variable -except ImportError: - print("PyTorch required. See https://pytorch.org/.") - -# try: -# from fastai.text import multiBatchRNN -# except ImportError: -# print( -# """ -# fastai required for multiBatchRNN. -# Run 'pip install https://github.com/fastai/fastai/archive/master.zip' -# """ -# ) - +# from fastai.text import multiBatchRNN MODEL_NAME = "sent_model" ITOS_NAME = "itos_sent" diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 6671a99e4..d60ee950f 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -25,19 +25,7 @@ def pos_tag(words, engine="unigram", corpus="orchid"): elif engine == "artagger": def _tag(text, corpus=None): - try: - from artagger import Tagger - except ImportError: - from pythainlp.tools import install_package - - install_package(_ARTAGGER_URL) - try: - from artagger import Tagger - except ImportError: - raise ImportError( - "ImportError: Try 'pip install " + _ARTAGGER_URL + "'" - ) - + from artagger import Tagger words = Tagger().tag(" ".join(text)) return [(word.word, word.tag) for word in words] diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 20f744f25..395e76583 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -3,17 +3,7 @@ Wrapper for deepcut Thai word segmentation """ -try: - import deepcut -except ImportError: - """ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ""" - from pythainlp.tools import install_package - - install_package("deepcut") - try: - import deepcut - except ImportError: - raise ImportError("ImportError: Try 'pip install deepcut'") +import deepcut def segment(text): diff --git a/pythainlp/transliterate/thai2rom.py b/pythainlp/transliterate/thai2rom.py index b23ef8a4f..f35db6b12 100644 --- a/pythainlp/transliterate/thai2rom.py +++ b/pythainlp/transliterate/thai2rom.py @@ -2,18 +2,11 @@ """ Romanization of Thai words based on machine-learnt engine ("thai2rom") """ +import numpy as np +from keras.layers import Input +from keras.models import Model, load_model from pythainlp.corpus import download, get_file -try: - import numpy as np - from keras.layers import Input - from keras.models import Model, load_model -except ImportError: - from pythainlp.tools import install_package - - install_package("keras") - install_package("numpy") - class ThaiTransliterator: def __init__(self): diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py index 17e26a0a6..6e0ccce24 100644 --- a/pythainlp/ulmfit/utils.py +++ b/pythainlp/ulmfit/utils.py @@ -5,30 +5,13 @@ """ import re +import dill as pickle +import numpy as np +import torch from pythainlp.corpus import download, get_file from pythainlp.tokenize import word_tokenize -try: - import numpy as np - from fastai.text import * - import dill as pickle -except ImportError: - from pythainlp.tools import install_package - - install_package("fastai==0.7.0") - install_package("numpy") - try: - import numpy as np - from fastai.text import * - import dill as pickle - except ImportError: - raise ImportError("ImportError: Try 'pip install fastai numpy dill'") - -try: - import torch -except ImportError: - print("PyTorch required. See https://pytorch.org/.") - +from fastai.text import * MODEL_NAME = "thwiki_model2" ITOS_NAME = "itos" diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index a390eae06..e2b4b1329 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -3,24 +3,12 @@ thai2vec - Thai word vector Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb """ +import numpy as np +from gensim.models import KeyedVectors from pythainlp.corpus import download as download_data from pythainlp.corpus import get_file from pythainlp.tokenize import word_tokenize -try: - from gensim.models import KeyedVectors - import numpy as np -except ImportError: - from pythainlp.tools import install_package - - install_package("gensim") - install_package("numpy") - try: - from gensim.models import KeyedVectors - import numpy as np - except ImportError: - raise ImportError("ImportError: Try 'pip install gensim numpy'") - def download(): path = get_file("thai2vec02") diff --git a/requirements-travis.txt b/requirements-travis.txt deleted file mode 100644 index c0aebf0f5..000000000 --- a/requirements-travis.txt +++ /dev/null @@ -1,10 +0,0 @@ -dill -epitran -langdetect -marisa_trie -nltk>=3.2.2 -pyicu==1.9.3 -pytz -requests -tinydb -tqdm diff --git a/setup.py b/setup.py index 879015359..141f4317b 100644 --- a/setup.py +++ b/setup.py @@ -4,14 +4,22 @@ with open("README-pypi.md", "r", encoding="utf-8") as readme_file: readme = readme_file.read() -readme_file.close() + with open("requirements.txt", "r", encoding="utf-8") as f: requirements = f.read().splitlines() +extras = { + "icu": ["pyicu"], + "ml": ["fastai", "numpy", "sklearn_crfsuite", "torch"], + "pos": ["artagger"], + "tokenize": ["deepcut", "pyicu"], + "transliterate": ["epitran", "pyicu"], +} + setup( name="pythainlp", - version="1.7.0.1", - description="Thai natural language processing library", + version="1.8.0", + description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", author="PyThaiNLP", @@ -40,6 +48,7 @@ }, include_package_data=True, install_requires=requirements, + extras_require=extras, license="Apache Software License 2.0", zip_safe=False, keywords="pythainlp", From baa85ac682ffc52a68ecb042c6be75671e200782 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 08:06:59 +0700 Subject: [PATCH 05/12] torch on Python 3.4 does not has PyPI package --- .travis.yml | 2 -- appveyor.yml | 5 ----- 2 files changed, 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index c39e96dd1..0f709b651 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,6 @@ language: python python: - - "3.4" - - "3.5" - "3.6" # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: diff --git a/appveyor.yml b/appveyor.yml index 17d3256a7..084f18ecb 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,11 +2,6 @@ build: off environment: matrix: - - PYTHON: "C:/Python34" - PYTHON_VERSION: "3.4" - PYTHON_ARCH: "32" - PYICU_WHEEL: "https://get.openlp.org/win-sdk/PyICU-1.9.5-cp34-cp34m-win32.whl" - - PYTHON: "C:/Python36" PYTHON_VERSION: "3.6" PYTHON_ARCH: "32" From 806ad545e1419b42a4449e87532c09b5aef7c1c8 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 08:11:57 +0700 Subject: [PATCH 06/12] remove [ml] from install --- .travis.yml | 2 +- appveyor.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0f709b651..fa69b3b7a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt - - pip install .[icu,ml,pos,tokenize,transliterate] + - pip install .[icu,pos,tokenize,transliterate] - pip install coveralls os: diff --git a/appveyor.yml b/appveyor.yml index 084f18ecb..a573bb0f8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,7 +32,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install -e .[icu,ml,pos,tokenize,transliterate]" + - "%PYTHON%/python.exe -m pip install -e .[icu,pos,tokenize,transliterate]" test_script: - "%PYTHON%/python.exe -m pip --version" From b31bc701a38fe94d2cf63ba309d1e0336ec36b3a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 08:16:25 +0700 Subject: [PATCH 07/12] add "ner" to extras_require --- .travis.yml | 2 +- appveyor.yml | 2 +- setup.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index fa69b3b7a..8f4edb93f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt - - pip install .[icu,pos,tokenize,transliterate] + - pip install .[icu,ner,pos,tokenize,transliterate] - pip install coveralls os: diff --git a/appveyor.yml b/appveyor.yml index a573bb0f8..00b4e1ae2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,7 +32,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install -e .[icu,pos,tokenize,transliterate]" + - "%PYTHON%/python.exe -m pip install -e .[icu,ner,pos,tokenize,transliterate]" test_script: - "%PYTHON%/python.exe -m pip --version" diff --git a/setup.py b/setup.py index 141f4317b..38c4c9967 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,8 @@ extras = { "icu": ["pyicu"], - "ml": ["fastai", "numpy", "sklearn_crfsuite", "torch"], + "ml": ["fastai", "numpy", "torch"], + "ner": ["sklearn_crfsuite"], "pos": ["artagger"], "tokenize": ["deepcut", "pyicu"], "transliterate": ["epitran", "pyicu"], From 5faabef5b1e9bad038c4a27b55c2f0e15e3e076b Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 11:35:11 +0700 Subject: [PATCH 08/12] Update test cases --- pythainlp/corpus/tnc.py | 2 +- pythainlp/transliterate/__init__.py | 32 +++++++++++++++---------- pythainlp/transliterate/ipa.py | 18 +++++++------- pythainlp/transliterate/pyicu.py | 9 +++---- pythainlp/transliterate/thai2rom.py | 22 ++++++++++------- setup.cfg | 2 +- setup.py | 2 +- tests/__init__.py | 37 +++++++++++++---------------- tox.ini | 2 +- 9 files changed, 68 insertions(+), 58 deletions(-) diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 657e8ccd5..de7c8de0f 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -40,7 +40,7 @@ def word_freq(word, domain="all"): r = requests.post(url, data=data) - pat = re.compile('TOTAL(?s).*?#ffffff">(.*?)') + pat = re.compile(r'TOTAL(?s).*?#ffffff">(.*?)') match = pat.search(r.text) n = 0 diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py index 16de05d1b..48bd5cfd2 100644 --- a/pythainlp/transliterate/__init__.py +++ b/pythainlp/transliterate/__init__.py @@ -6,21 +6,29 @@ # ถอดเสียงภาษาไทยเป็นอักษรละติน def romanize(text, engine="royin"): """ - :param str data: Thai text to be romanized - :param str engine: 'royin' (default), 'pyicu', or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'pyicu' uses Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization. + :param str text: Thai text to be romanized + :param str engine: 'royin' (default) or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'thai2rom' is deep learning Thai romanization (require keras). :return: English (more or less) text that spells out how the Thai text should read. """ - if engine == "pyicu": - from .pyicu import romanize - elif engine == "thai2rom": - from .thai2rom import ThaiTransliterator - - thai2rom = ThaiTransliterator() - return thai2rom.romanize(text) + if engine == "thai2rom": + from .thai2rom import romanize + return romanize(text) else: # use default engine "royin" from .royin import romanize + words = word_tokenize(text) + romanized_words = [romanize(word) for word in words] + return "".join(romanized_words) + - words = word_tokenize(text) - romanized_words = [romanize(word) for word in words] +def transliterate(text, engine="ipa"): + """ + :param str text: Thai text to be transliterated + :param str engine: 'ipa' (default) or 'pyicu'. + :return: A string of Internaitonal Phonetic Alphabets indicating how the text should read. + """ + if engine == "pyicu": + from .pyicu import transliterate + else: + from .ipa import transliterate - return "".join(romanized_words) + return transliterate(text) diff --git a/pythainlp/transliterate/ipa.py b/pythainlp/transliterate/ipa.py index ed460e8c9..5fe18d24d 100644 --- a/pythainlp/transliterate/ipa.py +++ b/pythainlp/transliterate/ipa.py @@ -4,18 +4,16 @@ """ import epitran -epi = epitran.Epitran("tha-Thai") +_EPI_THA = epitran.Epitran("tha-Thai") -class IPA: - def __init__(self, text=""): - self.text = text +def transliterate(text): + return _EPI_THA.transliterate(text) - def str(self): - return epi.transliterate(self.text) - def list(self): - return epi.trans_list(self.text) +def trans_list(text): + return _EPI_THA.trans_list(text) - def xsampa_list(self): - return epi.xsampa_list(self.text) + +def xsampa_list(text): + return _EPI_THA.xsampa_list(text) diff --git a/pythainlp/transliterate/pyicu.py b/pythainlp/transliterate/pyicu.py index 77b74c6ad..e34be0e16 100644 --- a/pythainlp/transliterate/pyicu.py +++ b/pythainlp/transliterate/pyicu.py @@ -2,11 +2,12 @@ from icu import Transliterator +_ICU_THAI_TO_LATIN = Transliterator.createInstance("Thai-Latin") + + # ถอดเสียงภาษาไทยเป็นอักษรละติน -def romanize(data): +def transliterate(text): """ ถอดเสียงภาษาไทยเป็นอักษรละติน รับค่า ''str'' ข้อความ คืนค่า ''str'' อักษรละติน """ - thai2latin = Transliterator.createInstance("Thai-Latin") - - return thai2latin.transliterate(data) + return _ICU_THAI_TO_LATIN.transliterate(text) diff --git a/pythainlp/transliterate/thai2rom.py b/pythainlp/transliterate/thai2rom.py index f35db6b12..daaf44088 100644 --- a/pythainlp/transliterate/thai2rom.py +++ b/pythainlp/transliterate/thai2rom.py @@ -48,12 +48,8 @@ def __init__(self): self.__target_characters = sorted(list(self.__target_characters)) self.__num_encoder_tokens = len(self.__input_characters) self.__num_decoder_tokens = len(self.__target_characters) - self.__max_encoder_seq_length = max( - [len(text) for text in self.__input_texts] - ) - self.__max_decoder_seq_length = max( - [len(text) for text in self.__target_texts] - ) + self.__max_encoder_seq_length = max([len(text) for text in self.__input_texts]) + self.__max_decoder_seq_length = max([len(text) for text in self.__target_texts]) """print('Number of samples:', len(self.input_texts)) print('Number of unique input tokens:', self.num_encoder_tokens) print('Number of unique output tokens:', self.num_decoder_tokens) @@ -127,7 +123,9 @@ def __decode_sequence(self, input_seq): [self.__target_seq] + self.__states_value ) self.__sampled_token_index = np.argmax(self.__output_tokens[0, -1, :]) - self.__sampled_char = self.__reverse_target_char_index[self.__sampled_token_index] + self.__sampled_char = self.__reverse_target_char_index[ + self.__sampled_token_index + ] self.__decoded_sentence += self.__sampled_char if ( self.__sampled_char == "\n" @@ -141,7 +139,8 @@ def __decode_sequence(self, input_seq): def __encode_input(self, name): self.__test_input = np.zeros( - (1, self.__max_encoder_seq_length, self.__num_encoder_tokens), dtype="float32" + (1, self.__max_encoder_seq_length, self.__num_encoder_tokens), + dtype="float32", ) for t, char in enumerate(name): self.__test_input[0, t, self.__input_token_index[char]] = 1. @@ -153,3 +152,10 @@ def romanize(self, text): :return: English (more or less) text that spells out how the Thai text should read. """ return self.__decode_sequence(self.__encode_input(text)) + + +_THAI_TO_ROM = ThaiTransliterator() + + +def romanize(text): + return _THAI_TO_ROM.romanize(text) diff --git a/setup.cfg b/setup.cfg index 51cd41ab0..a686f87db 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.7.0.1 +current_version = 1.8.0 commit = True tag = True diff --git a/setup.py b/setup.py index 38c4c9967..90f17af09 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ extras = { "icu": ["pyicu"], - "ml": ["fastai", "numpy", "torch"], + "ml": ["fastai", "keras", "numpy", "torch"], "ner": ["sklearn_crfsuite"], "pos": ["artagger"], "tokenize": ["deepcut", "pyicu"], diff --git a/tests/__init__.py b/tests/__init__.py index 27d675e65..1f313e4aa 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -37,8 +37,8 @@ from pythainlp.summarize import summarize from pythainlp.tag import pos_tag, pos_tag_sents from pythainlp.tokenize import etcc, syllable_tokenize, tcc, word_tokenize -from pythainlp.transliterate import romanize -from pythainlp.transliterate.ipa import IPA +from pythainlp.transliterate import romanize, transliterate +from pythainlp.transliterate.ipa import trans_list, xsampa_list from pythainlp.util import ( deletetone, eng_to_thai, @@ -266,8 +266,8 @@ def test_pos_tag(self): self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud")) self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud")) - self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="orchid")) - self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="pud")) + # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="orchid")) + # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="pud")) self.assertEqual( pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"), @@ -342,24 +342,21 @@ def test_etcc(self): # ### pythainlp.transliterate - def test_ipa(self): - t = IPA("คน") - self.assertEqual(t.str(), "kʰon") - self.assertIsNotNone(t.list()) - self.assertIsNotNone(t.xsampa_list()) - def test_romanize(self): self.assertEqual(romanize("แมว"), "maeo") - self.assertEqual(romanize("แมว", "pyicu"), "mæw") - - def test_romanize_royin(self): - engine = "royin" - self.assertIsNotNone(romanize("กก", engine=engine)) - self.assertEqual(romanize("แมว", engine=engine), "maeo") - self.assertEqual(romanize("เดือน", engine=engine), "duean") - self.assertEqual(romanize("ดู", engine=engine), "du") - self.assertEqual(romanize("ดำ", engine=engine), "dam") - self.assertEqual(romanize("บัว", engine=engine), "bua") + self.assertIsNotNone(romanize("กก", engine="royin")) + self.assertEqual(romanize("แมว", engine="royin"), "maeo") + self.assertEqual(romanize("เดือน", engine="royin"), "duean") + self.assertEqual(romanize("ดู", engine="royin"), "du") + self.assertEqual(romanize("ดำ", engine="royin"), "dam") + self.assertEqual(romanize("บัว", engine="royin"), "bua") + # self.assertIsNotNone(romanize("บัว", engine="thai2rom")) + + def test_transliterate(self): + self.assertEqual(transliterate("แมว", "pyicu"), "mæw") + self.assertEqual(transliterate("คน", engine="ipa"), "kʰon") + self.assertIsNotNone(trans_list("คน")) + self.assertIsNotNone(xsampa_list("คน")) # ### pythainlp.util diff --git a/tox.ini b/tox.ini index 37cf268e5..e5573e5b5 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27, py34, py35, flake8 +envlist = py36 flake8 [testenv:flake8] basepython = python From 02a05f69e955040a69245b3b5e34a771f5162828 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 11:38:07 +0700 Subject: [PATCH 09/12] Update version number --- README-pypi.md | 2 +- docs/conf.py | 2 +- pythainlp/__init__.py | 2 +- tests/__init__.py | 16 +++++++--------- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/README-pypi.md b/README-pypi.md index ea200a76f..70a8a53c2 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -1,6 +1,6 @@ ![PyThaiNLP Logo](https://avatars0.githubusercontent.com/u/32934255?s=200&v=4) -# PyThaiNLP 1.7 +# PyThaiNLP 1.8.0 [![Codacy Badge](https://api.codacy.com/project/badge/Grade/cb946260c87a4cc5905ca608704406f7)](https://www.codacy.com/app/pythainlp/pythainlp_2?utm_source=github.com&utm_medium=referral&utm_content=PyThaiNLP/pythainlp&utm_campaign=Badge_Grade)[![pypi](https://img.shields.io/pypi/v/pythainlp.svg)](https://pypi.python.org/pypi/pythainlp) [![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp.svg?branch=develop)](https://travis-ci.org/PyThaiNLP/pythainlp) diff --git a/docs/conf.py b/docs/conf.py index bc1b294f1..7e1440553 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,7 @@ # The short X.Y version version = '' # The full version, including alpha/beta/rc tags -release = '1.7' +release = '1.8.0' # -- General configuration --------------------------------------------------- diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 12065e6bc..02da9f4e0 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -__version__ = 1.7 +__version__ = 1.8.0 thai_alphabets = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars thai_vowels = "ฤฦะ\u0e31าำ\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39เแโใไ\u0e45\u0e47" # 19 diff --git a/tests/__init__.py b/tests/__init__.py index 1f313e4aa..ec4a492d6 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -261,22 +261,20 @@ def test_summarize(self): # ### pythainlp.tag def test_pos_tag(self): - tokens = ["คำ"] + tokens = ["ผม", "รัก", "คุณ"] self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud")) + self.assertEqual( + pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"), + [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")], + ) + self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud")) + # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="orchid")) # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="pud")) - self.assertEqual( - pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"), - [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")], - ) - self.assertEqual( - str(type(pos_tag(word_tokenize("ผมรักคุณ"), engine="artagger"))), - "", - ) self.assertEqual( pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]), [ From 75e8971a7c649c3d59428c91c0fa1349f2249484 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 11:52:12 +0700 Subject: [PATCH 10/12] Fix version syntax --- pythainlp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 02da9f4e0..fde65698f 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -__version__ = 1.8.0 +__version__ = 1.8 thai_alphabets = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars thai_vowels = "ฤฦะ\u0e31าำ\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39เแโใไ\u0e45\u0e47" # 19 From f8626904122822a7714ac6b18a9f0dd070903fca Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 12:04:03 +0700 Subject: [PATCH 11/12] update docs for transliterate --- README.md | 4 ++-- docs/api/romanization.rst | 8 ++++---- docs/pythainlp-dev-thai.md | 8 +++++--- examples/transliterate.py | 3 ++- pythainlp/__init__.py | 2 +- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 339666dbd..ef71bf205 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Python 2 users can still use PyThaiNLP 1.6. ## Capabilities - Thai word segmentation (```word_tokenize```), including subword segmentation based on Thai Character Cluster (```tcc```) and ETCC (```etcc```) -- Thai romanization (```romanize```) +- Thai romanization and transliteration (```romanize```, ```transliterate```) - Thai part-of-speech taggers (```pos_tag```) - Read out number to Thai words (```bahttext```, ```num_to_thaiword```) - Thai collation (sort by dictionoary order) (```collate```) @@ -85,7 +85,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนเพื่ ## ความสามารถ - ตัดคำภาษาไทย (```word_tokenize```) และรองรับ Thai Character Clusters (```tcc```) และ ETCC (```etcc```) -- ถอดเสียงภาษาไทยเป็นอักษรละติน (```romanize```) +- ถอดเสียงภาษาไทยเป็นอักษรละตินและสัทอักษร (```romanize```, ```transliterate```) - ระบุชนิดคำ (part-of-speech) ภาษาไทย (```pos_tag```) - อ่านตัวเลขเป็นข้อความภาษาไทย (```bahttext```, ```num_to_thaiword```) - เรียงลำดับคำตามพจนานุกรมไทย (```collate```) diff --git a/docs/api/romanization.rst b/docs/api/romanization.rst index fbae60d77..e99fdd7b2 100644 --- a/docs/api/romanization.rst +++ b/docs/api/romanization.rst @@ -1,10 +1,10 @@ .. currentmodule:: pythainlp.romanization -pythainlp.romanization +pythainlp.transliterate ==================================== -The :class:`pythainlp.romanization` turns thai text into a romanized one (put simply, spelled with English). +The :class:`pythainlp.transliterate` turns Thai text into a romanized one (put simply, spelled with English). -.. autofunction:: romanization -.. currentmodule:: pythainlp.romanization.thai2rom +.. autofunction:: transliterate +.. currentmodule:: pythainlp.transliterate.thai2rom .. autoclass:: thai2rom :members: romanize diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md index 47b268457..503705cba 100644 --- a/docs/pythainlp-dev-thai.md +++ b/docs/pythainlp-dev-thai.md @@ -256,12 +256,13 @@ lentext คือ จำนวนคำขั้นต่ำที่ต้อ คืนค่าเป็น dict -### romanization +### transliteration ```python -from pythainlp.transliterate import romanize +from pythainlp.transliterate import romanize, transliterate romanize(str, engine="royin") +transliterate(str, engine="pyicu") ``` มี engine ดังนี้ @@ -275,9 +276,10 @@ romanize(str, engine="royin") **ตัวอย่าง** ```python -from pythainlp.transliterate import romanize +from pythainlp.transliterate import romanize, transliterate romanize("แมว") # 'maew' +transliterate("นก") ``` ### spell diff --git a/examples/transliterate.py b/examples/transliterate.py index 90ec7be80..97fb4e7f1 100644 --- a/examples/transliterate.py +++ b/examples/transliterate.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- -from pythainlp.transliterate import romanize +from pythainlp.transliterate import romanize, transliterate print(romanize("แมว")) +print(transliterate("แมว")) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index fde65698f..5a79f230b 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -24,7 +24,7 @@ from pythainlp.collation import collate from pythainlp.date import now -from pythainlp.transliterate import romanize +from pythainlp.transliterate import romanize, transliterate from pythainlp.sentiment import sentiment from pythainlp.soundex import soundex from pythainlp.spell import spell From a47d297c31874a9494c56baa5789c5a2bb3b9971 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 13:11:13 +0700 Subject: [PATCH 12/12] - add "full" option in extras_require - fastai==0.7.0 --- setup.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 90f17af09..3fa7c5c18 100644 --- a/setup.py +++ b/setup.py @@ -10,11 +10,22 @@ extras = { "icu": ["pyicu"], - "ml": ["fastai", "keras", "numpy", "torch"], + "ml": ["fastai==0.7.0", "keras", "numpy", "torch"], "ner": ["sklearn_crfsuite"], "pos": ["artagger"], "tokenize": ["deepcut", "pyicu"], "transliterate": ["epitran", "pyicu"], + "full": [ + "artagger", + "deepcut", + "epitran", + "fastai==0.7.0", + "keras", + "numpy", + "pyicu", + "sklearn_crfsuite", + "torch", + ], } setup(