5 changes: 3 additions & 2 deletions docker_requirements.txt
@@ -23,5 +23,6 @@ fairseq==0.10.2
pyicu==2.6
deepcut==0.7.0.0
h5py==2.10.0
-tensorflow==2.4.2
-pandas==0.24
+tensorflow==2.4.0
+pandas==0.24
+tltk==1.3.8
1 change: 1 addition & 0 deletions docs/api/tag.rst
@@ -232,6 +232,7 @@ Modules
.. autofunction:: chunk_parse
.. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
:members: get_ner
.. autofunction:: pythainlp.tag.tltk.get_ner

Tagger Engines
--------------
1 change: 1 addition & 0 deletions docs/notes/installation.rst
@@ -27,6 +27,7 @@ where ``extras`` can be
- ``mt5`` (to support mt5 models for the Thai text summarizer)
- ``wordnet`` (to support wordnet)
- ``spell`` (to support phunspell & symspellpy)
- ``tltk`` (to support tltk)
- ``full`` (install everything)

For dependency details, look at the `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
7 changes: 7 additions & 0 deletions pythainlp/spell/core.py
@@ -22,6 +23,7 @@ def spell(word: str, engine: str = "pn") -> List[str]:
* *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
* *phunspell* - a spell checker utilizing spylls, a port of Hunspell.
* *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
* *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.

:return: list of possible correct words within an edit distance of 1 or 2,
sorted by frequency of word occurrence in the spelling dictionary
@@ -39,6 +40,9 @@ def spell(word: str, engine: str = "pn") -> List[str]:
spell("เส้นตรบ")
# output: ['เส้นตรง']

spell("เส้นตรบ", engine="tltk")
# output: ['เส้นตรง']

spell("ครัช")
# output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน', 'วรัช', 'ครัส',
# 'ปรัช', 'บรัช', 'ครัง', 'คัช', 'คลัช', 'ครัย', 'ครัด']
@@ -58,6 +62,9 @@ def spell(word: str, engine: str = "pn") -> List[str]:
elif engine == "symspellpy":
from pythainlp.spell.symspellpy import spell as SPELL_CHECKER
text_correct = SPELL_CHECKER(word)
elif engine == "tltk":
from pythainlp.spell.tltk import spell as SPELL_CHECKER
text_correct = SPELL_CHECKER(word)
else:
text_correct = DEFAULT_SPELL_CHECKER.spell(word)

6 changes: 6 additions & 0 deletions pythainlp/spell/tltk.py
@@ -0,0 +1,6 @@
from tltk.nlp import spell_candidates
from typing import List


def spell(text: str) -> List[str]:
    return spell_candidates(text)
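A minimal usage sketch of the new engine through the public API (assumes the `tltk` package is installed; the expected output is taken from the docstring example in pythainlp/spell/core.py above):

from pythainlp.spell import spell

# Dispatches through pythainlp/spell/core.py to the new tltk wrapper,
# which returns tltk's spell_candidates() list unchanged.
print(spell("เส้นตรบ", engine="tltk"))
# expected, per the docstring example above: ['เส้นตรง']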
23 changes: 21 additions & 2 deletions pythainlp/tag/pos_tag.py
@@ -15,6 +15,8 @@ def pos_tag(
* *wangchanberta* - wangchanberta model (supports the lst20 corpus only \
and accepts a string only; if you input a list of words, \
it will be joined into a single string)
* *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only; \
if another corpus is chosen, it falls back to TNC)
:param str corpus:
the corpus that is used to create the language model for the tagger
* *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
@@ -28,6 +30,7 @@
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
* *tnc* - Thai National Corpus (supported by the tltk engine only)
:return: a list of tuples (word, POS tag)
:rtype: list[tuple[str, str]]

@@ -89,13 +92,25 @@ def pos_tag(
    if not words:
        return []

-    if engine == "perceptron":
+    _support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"]
+
+    if engine == "perceptron" and corpus in _support_corpus:
        from pythainlp.tag.perceptron import tag as tag_
    elif engine == "wangchanberta" and corpus == "lst20":
        from pythainlp.wangchanberta.postag import pos_tag as tag_
        words = ''.join(words)
-    else:  # default, use "unigram" ("old") engine
+    elif engine == "tltk":
+        from pythainlp.tag.tltk import pos_tag as tag_
+        corpus = "tnc"
+    elif engine == "unigram" and corpus in _support_corpus:  # default
        from pythainlp.tag.unigram import tag as tag_
+    else:
+        raise ValueError(
+            "pos_tag does not support the {0} engine or the {1} corpus.".format(
+                engine, corpus
+            )
+        )

    word_tags = tag_(words, corpus=corpus)

@@ -114,6 +129,9 @@ def pos_tag_sents(
:param str engine:
* *perceptron* - perceptron tagger (default)
* *unigram* - unigram tagger
* *wangchanberta* - wangchanberta model (supports the lst20 corpus only)
* *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only; \
if another corpus is chosen, it falls back to TNC)
:param str corpus:
the corpus that is used to create the language model for the tagger
* *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
@@ -127,6 +145,7 @@
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
* *tnc* - Thai National Corpus (supported by the tltk engine only)
:return: a list of lists of tuples (word, POS tag)
:rtype: list[list[tuple[str, str]]]

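A short usage sketch of the new dispatch path (assumes the `tltk` package is installed; note that `corpus` is forced to "tnc" for this engine, so any corpus argument is ignored):

from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

words = word_tokenize("เขาเรียนที่โรงเรียนนางรอง", engine="tltk")
tags = pos_tag(words, engine="tltk")  # corpus is overridden to "tnc" internally
# returns a list of (word, POS tag) tuples, per the contract documented above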
93 changes: 93 additions & 0 deletions pythainlp/tag/tltk.py
@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple, Union
from tltk import nlp
from pythainlp.tokenize import word_tokenize

nlp.pos_load()
nlp.ner_load()


def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
    if corpus != "tnc":
        raise ValueError("tltk does not support the {0} corpus.".format(corpus))
    return nlp.pos_tag_wordlist(words)


def _post_process(text: str) -> str:
    return text.replace("<s/>", " ")


def get_ner(
    text: str,
    pos: bool = True,
    tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
    """
    Named-entity recognizer from **TLTK**

    This function tags named entities in text, in IOB format.

    :param str text: text in Thai to be tagged
    :param bool pos: include POS tags in the results (`True`) or
        exclude them (`False`). The default value is `True`
    :param bool tag: if `True`, return the output as an HTML-like tagged string
    :return: a list of tuples associated with tokenized words, POS tags
        (if the parameter `pos` is specified as `True`), and NER tags;
        or an HTML-like tagged string (if the parameter `tag` is
        specified as `True`).
        Otherwise, returns a list of tuples associated with tokenized
        words and NER tags
    :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]], str]

    :Example:

        >>> from pythainlp.tag.tltk import get_ner
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง")
        [('เขา', 'PRON', 'O'),
        ('เรียน', 'VERB', 'O'),
        ('ที่', 'SCONJ', 'O'),
        ('โรงเรียน', 'NOUN', 'B-L'),
        ('นางรอง', 'VERB', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", pos=False)
        [('เขา', 'O'),
        ('เรียน', 'O'),
        ('ที่', 'O'),
        ('โรงเรียน', 'B-L'),
        ('นางรอง', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True)
        'เขาเรียนที่<L>โรงเรียนนางรอง</L>'
    """
    if not text:
        return []
    list_word = []
    for i in word_tokenize(text, engine="tltk"):
        if i == " ":
            i = "<s/>"
        list_word.append(i)
    _pos = nlp.pos_tag_wordlist(list_word)
    sent_ner = [
        (_post_process(word), pos, ner) for word, pos, ner in nlp.ner(_pos)
    ]
    if tag:
        temp = ""
        sent = ""
        for idx, (word, pos, ner) in enumerate(sent_ner):
            if ner.startswith("B-") and temp != "":
                sent += "</" + temp + ">"
                temp = ner[2:]
                sent += "<" + temp + ">"
            elif ner.startswith("B-"):
                temp = ner[2:]
                sent += "<" + temp + ">"
            elif ner == "O" and temp != "":
                sent += "</" + temp + ">"
                temp = ""
            sent += word

            if idx == len(sent_ner) - 1 and temp != "":
                sent += "</" + temp + ">"

        return sent
    if pos is False:
        return [(word, ner) for word, pos, ner in sent_ner]
    return sent_ner
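To make the tag-building loop in get_ner easier to follow, here is a standalone sketch of the same IOB-to-inline-tag conversion, run on hand-written tuples mirroring the docstring example (no tltk install required):

from typing import List, Tuple

def iob_to_tagged_string(sent_ner: List[Tuple[str, str, str]]) -> str:
    # Same logic as the `if tag:` branch above: open a tag on B-*,
    # close the previous tag on a new B-* or on O, and close any
    # tag still open after the last token.
    temp = ""
    sent = ""
    for idx, (word, pos, ner) in enumerate(sent_ner):
        if ner.startswith("B-"):
            if temp != "":
                sent += "</" + temp + ">"
            temp = ner[2:]
            sent += "<" + temp + ">"
        elif ner == "O" and temp != "":
            sent += "</" + temp + ">"
            temp = ""
        sent += word
        if idx == len(sent_ner) - 1 and temp != "":
            sent += "</" + temp + ">"
    return sent

demo = [
    ("เขา", "PRON", "O"), ("เรียน", "VERB", "O"), ("ที่", "SCONJ", "O"),
    ("โรงเรียน", "NOUN", "B-L"), ("นางรอง", "VERB", "I-L"),
]
print(iob_to_tagged_string(demo))  # เขาเรียนที่<L>โรงเรียนนางรอง</L>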
14 changes: 14 additions & 0 deletions pythainlp/tokenize/core.py
@@ -86,6 +86,8 @@ def word_tokenize(
and combining tokens that are parts of the same named-entity.
* *sefr_cut* - wrapper for
`SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.
* *tltk* - wrapper for
`TLTK <https://pypi.org/project/tltk/>`_.

:Note:
- The parameter **custom_dict** can be provided as an argument \
@@ -182,6 +184,10 @@ def word_tokenize(
elif engine == "sefr_cut":
from pythainlp.tokenize.sefr_cut import segment

segments = segment(text)
elif engine == "tltk":
from pythainlp.tokenize.tltk import segment

segments = segment(text)
else:
raise ValueError(
@@ -215,6 +221,7 @@ def sent_tokenize(
* *whitespace+newline* - split by whitespaces and newlines.
* *whitespace* - split by whitespaces. Specifically, with \
:class:`regex` pattern ``r" +"``
* *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.
:Example:

Split the text based on *whitespace*::
@@ -271,6 +278,10 @@
        segments = re.split(r" +", text, re.U)
    elif engine == "whitespace+newline":
        segments = text.split()
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import sent_tokenize as segment

        segments = segment(text)
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
@@ -314,6 +325,7 @@ def subword_tokenize(
* *wangchanberta* - SentencePiece from wangchanberta model.
* *dict* - newmm word tokenizer with a syllable dictionary
* *ssg* - CRF syllable segmenter for Thai
* *tltk* - syllable tokenizer from tltk

:Example:

@@ -376,6 +388,8 @@
        )
    elif engine == "ssg":
        from pythainlp.tokenize.ssg import segment
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import syllable_tokenize as segment
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
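A combined usage sketch for the three new tltk branches (assumes the `tltk` package is installed; exact outputs depend on tltk's models, so none are shown):

from pythainlp.tokenize import sent_tokenize, subword_tokenize, word_tokenize

text = "เขาเรียนที่โรงเรียนนางรอง"
words = word_tokenize(text, engine="tltk")         # pythainlp.tokenize.tltk.segment
sentences = sent_tokenize(text, engine="tltk")     # pythainlp.tokenize.tltk.sent_tokenize
syllables = subword_tokenize(text, engine="tltk")  # pythainlp.tokenize.tltk.syllable_tokenize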
34 changes: 34 additions & 0 deletions pythainlp/tokenize/tltk.py
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
from typing import List
from tltk.nlp import word_segment as tltk_segment
from tltk.nlp import syl_segment


def segment(text: str) -> List[str]:
    if not text or not isinstance(text, str):
        return []
    text = text.replace(" ", "<u/>")
    _temp = tltk_segment(text).replace("<u/>", " ").replace("<s/>", "")
    _temp = _temp.split('|')
    if _temp[-1] == "":
        del _temp[-1]
    return _temp


def syllable_tokenize(text: str) -> List[str]:
    if not text or not isinstance(text, str):
        return []
    _temp = syl_segment(text)
    _temp = _temp.split('~')
    if _temp[-1] == "<s/>":
        del _temp[-1]
    return _temp


def sent_tokenize(text: str) -> List[str]:
    text = text.replace(" ", "<u/>")
    _temp = tltk_segment(text).replace("<u/>", " ").replace("|", "")
    _temp = _temp.split('<s/>')
    if _temp[-1] == "":
        del _temp[-1]
    return _temp
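All three helpers above share one pattern: protect literal spaces as <u/>, run the tltk segmenter, then split on its delimiters and drop the empty trailing field. Here is a standalone sketch of just that post-processing step, fed a hand-written string in the delimiter format the wrapper expects ('|' between words, '<s/>' at sentence ends; this format is mirrored from the code above, not captured tltk output):

from typing import List

def split_tltk_words(raw: str) -> List[str]:
    # Mirrors segment(): restore spaces, drop sentence markers, split on '|'.
    cleaned = raw.replace("<u/>", " ").replace("<s/>", "")
    words = cleaned.split("|")
    if words and words[-1] == "":  # a '|' before '<s/>' leaves an empty field
        del words[-1]
    return words

print(split_tltk_words("เขา|เรียน|ที่|โรงเรียน|นางรอง|<s/>"))
# ['เขา', 'เรียน', 'ที่', 'โรงเรียน', 'นางรอง']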
23 changes: 21 additions & 2 deletions pythainlp/transliterate/core.py
@@ -23,6 +23,7 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
Transcription issued by Royal Institute of Thailand.
* *thai2rom* - a deep learning-based Thai romanization engine
(requires PyTorch).
* *tltk* - TLTK: Thai Language Toolkit

:Example:
::
@@ -35,6 +36,9 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
romanize("สามารถ", engine="thai2rom")
# output: 'samat'

romanize("สามารถ", engine="tltk")
# output: 'samat'

romanize("ภาพยนตร์", engine="royin")
# output: 'phapn'

@@ -47,6 +51,8 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:

if engine == "thai2rom":
from pythainlp.transliterate.thai2rom import romanize
elif engine == "tltk":
from pythainlp.transliterate.tltk import romanize
else: # use default engine "royin"
from pythainlp.transliterate.royin import romanize

@@ -67,10 +73,13 @@ def transliterate(
:rtype: str

:Options for engines:
- * *icu* - pyicu, based on International Components for Unicode (ICU)
- * *ipa* - epitran, output is International Phonetic Alphabet (IPA)
* *thaig2p* - (default) Thai Grapheme-to-Phoneme,
output is IPA (requires PyTorch)
+ * *icu* - pyicu, based on International Components for Unicode (ICU)
+ * *ipa* - epitran, output is International Phonetic Alphabet (IPA)
+ * *tltk_g2p* - Thai Grapheme-to-Phoneme from\
+ `TLTK <https://pypi.org/project/tltk/>`_.
+ * *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA)

:Example:
::
@@ -86,6 +95,12 @@
transliterate("สามารถ", engine="thaig2p")
# output: 's aː ˩˩˦ . m aː t̚ ˥˩'

transliterate("สามารถ", engine="tltk_ipa")
# output: 'saː5.maːt3'

transliterate("สามารถ", engine="tltk_g2p")
# output: 'saa4~maat2'

transliterate("ภาพยนตร์", engine="icu")
# output: 'p̣hāphyntr̒'

@@ -103,6 +118,10 @@
        from pythainlp.transliterate.pyicu import transliterate
    elif engine == "ipa":
        from pythainlp.transliterate.ipa import transliterate
    elif engine == "tltk_g2p":
        from pythainlp.transliterate.tltk import tltk_g2p as transliterate
    elif engine == "tltk_ipa":
        from pythainlp.transliterate.tltk import tltk_ipa as transliterate
    else:  # use default engine: "thaig2p"
        from pythainlp.transliterate.thaig2p import transliterate

17 changes: 17 additions & 0 deletions pythainlp/transliterate/tltk.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
from tltk.nlp import g2p, th2ipa, th2roman


def romanize(text: str) -> str:
    _temp = th2roman(text)
    return _temp[:_temp.rfind(" <s/>")].replace("<s/>", "")


def tltk_g2p(text: str) -> str:
    _temp = g2p(text).split("<tr/>")[1].replace("|<s/>", "").replace("|", " ")
    return _temp.replace("<s/>", "")


def tltk_ipa(text: str) -> str:
    _temp = th2ipa(text)
    return _temp[:_temp.rfind(" <s/>")].replace("<s/>", "")
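The romanize and tltk_ipa wrappers share one cleanup idiom: tltk appends a trailing sentence marker, and the wrapper cuts the string at the last " <s/>" and strips any markers that remain. A sketch on a hand-written sample (the raw string is a hypothetical illustration of the output shape of th2roman/th2ipa, not captured output):

def strip_sentence_marker(raw: str) -> str:
    # Cut at the last " <s/>", then drop any remaining markers.
    return raw[:raw.rfind(" <s/>")].replace("<s/>", "")

print(strip_sentence_marker("samat <s/>"))  # -> 'samat'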