5 changes: 3 additions & 2 deletions docker_requirements.txt
@@ -23,5 +23,6 @@ fairseq==0.10.2
pyicu==2.6
deepcut==0.7.0.0
h5py==2.10.0
-tensorflow==2.4.2
-pandas==0.24
+tensorflow==2.4.0
+pandas==0.24
+tltk==1.3.8
1 change: 1 addition & 0 deletions docs/api/tag.rst
@@ -232,6 +232,7 @@ Modules
.. autofunction:: chunk_parse
.. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
:members: get_ner
.. autofunction:: pythainlp.tag.tltk.get_ner

Tagger Engines
--------------
1 change: 1 addition & 0 deletions docs/notes/installation.rst
@@ -27,6 +27,7 @@ where ``extras`` can be
- ``mt5`` (to support mt5 models for the Thai text summarizer)
- ``wordnet`` (to support wordnet)
- ``spell`` (to support phunspell & symspellpy)
- ``tltk`` (to support tltk)
- ``full`` (install everything)

For dependency details, look at the `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
7 changes: 7 additions & 0 deletions pythainlp/spell/core.py
@@ -22,6 +23,7 @@ def spell(word: str, engine: str = "pn") -> List[str]:
* *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default)
* *phunspell* - a spell checker utilizing spylls, a port of Hunspell.
* *symspellpy* - symspellpy is a Python port of SymSpell v6.5.
* *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.

:return: list of possible correct words within an edit distance of 1 or 2,
sorted by frequency of word occurrence in the spelling dictionary
@@ -39,6 +40,9 @@ def spell(word: str, engine: str = "pn") -> List[str]:
spell("เส้นตรบ")
# output: ['เส้นตรง']

spell("เส้นตรบ", engine="tltk")
# output: ['เส้นตรง']

spell("ครัช")
# output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน', 'วรัช', 'ครัส',
# 'ปรัช', 'บรัช', 'ครัง', 'คัช', 'คลัช', 'ครัย', 'ครัด']
@@ -58,6 +62,9 @@ def spell(word: str, engine: str = "pn") -> List[str]:
elif engine == "symspellpy":
from pythainlp.spell.symspellpy import spell as SPELL_CHECKER
text_correct = SPELL_CHECKER(word)
elif engine == "tltk":
from pythainlp.spell.tltk import spell as SPELL_CHECKER
text_correct = SPELL_CHECKER(word)
else:
text_correct = DEFAULT_SPELL_CHECKER.spell(word)

6 changes: 6 additions & 0 deletions pythainlp/spell/tltk.py
@@ -0,0 +1,6 @@
from tltk.nlp import spell_candidates
from typing import List


def spell(text: str) -> List[str]:
    return spell_candidates(text)
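A minimal usage sketch of the new engine through the public API (assumes the `tltk` package is installed; the expected output is taken from the docstring example in pythainlp/spell/core.py above):

from pythainlp.spell import spell

# Dispatches through pythainlp/spell/core.py to the new tltk wrapper,
# which returns tltk's spell_candidates() list unchanged.
print(spell("เส้นตรบ", engine="tltk"))
# expected, per the docstring example above: ['เส้นตรง']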
23 changes: 21 additions & 2 deletions pythainlp/tag/pos_tag.py
@@ -15,6 +15,8 @@ def pos_tag(
* *wangchanberta* - wangchanberta model (supports the lst20 corpus only \
and accepts a string only; if you input a list of words, \
it will be joined into a single string)
* *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only; \
if another corpus is chosen, it falls back to TNC)
:param str corpus:
the corpus that is used to create the language model for the tagger
* *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
@@ -28,6 +30,7 @@
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
* *tnc* - Thai National Corpus (supported by the tltk engine only)
:return: a list of tuples (word, POS tag)
:rtype: list[tuple[str, str]]

@@ -89,13 +92,25 @@ def pos_tag(
    if not words:
        return []

-    if engine == "perceptron":
+    _support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"]
+
+    if engine == "perceptron" and corpus in _support_corpus:
        from pythainlp.tag.perceptron import tag as tag_
    elif engine == "wangchanberta" and corpus == "lst20":
        from pythainlp.wangchanberta.postag import pos_tag as tag_
        words = ''.join(words)
-    else:  # default, use "unigram" ("old") engine
+    elif engine == "tltk":
+        from pythainlp.tag.tltk import pos_tag as tag_
+        corpus = "tnc"
+    elif engine == "unigram" and corpus in _support_corpus:  # default
        from pythainlp.tag.unigram import tag as tag_
+    else:
+        raise ValueError(
+            "pos_tag does not support the {0} engine or the {1} corpus.".format(
+                engine, corpus
+            )
+        )

    word_tags = tag_(words, corpus=corpus)

@@ -114,6 +129,9 @@ def pos_tag_sents(
:param str engine:
* *perceptron* - perceptron tagger (default)
* *unigram* - unigram tagger
* *wangchanberta* - wangchanberta model (supports the lst20 corpus only)
* *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only; \
if another corpus is chosen, it falls back to TNC)
:param str corpus:
the corpus that is used to create the language model for the tagger
* *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
@@ -127,6 +145,7 @@
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
* *tnc* - Thai National Corpus (supported by the tltk engine only)
:return: a list of lists of tuples (word, POS tag)
:rtype: list[list[tuple[str, str]]]

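A short usage sketch of the new dispatch path (assumes the `tltk` package is installed; note that `corpus` is forced to "tnc" for this engine, so any corpus argument is ignored):

from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

words = word_tokenize("เขาเรียนที่โรงเรียนนางรอง", engine="tltk")
tags = pos_tag(words, engine="tltk")  # corpus is overridden to "tnc" internally
# returns a list of (word, POS tag) tuples, per the contract documented above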
93 changes: 93 additions & 0 deletions pythainlp/tag/tltk.py
@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple, Union
from tltk import nlp
from pythainlp.tokenize import word_tokenize

nlp.pos_load()
nlp.ner_load()


def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
    if corpus != "tnc":
        raise ValueError("tltk does not support the {0} corpus.".format(corpus))
    return nlp.pos_tag_wordlist(words)


def _post_process(text: str) -> str:
    return text.replace("<s/>", " ")


def get_ner(
    text: str,
    pos: bool = True,
    tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
    """
    Named-entity recognizer from **TLTK**

    This function tags named entities in text, in IOB format.

    :param str text: text in Thai to be tagged
    :param bool pos: include POS tags in the results (`True`) or
        exclude them (`False`). The default value is `True`
    :param bool tag: if `True`, return the output as an HTML-like tagged string
    :return: a list of tuples associated with tokenized words, POS tags
        (if the parameter `pos` is specified as `True`), and NER tags;
        or an HTML-like tagged string (if the parameter `tag` is
        specified as `True`).
        Otherwise, returns a list of tuples associated with tokenized
        words and NER tags
    :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]], str]

    :Example:

        >>> from pythainlp.tag.tltk import get_ner
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง")
        [('เขา', 'PRON', 'O'),
        ('เรียน', 'VERB', 'O'),
        ('ที่', 'SCONJ', 'O'),
        ('โรงเรียน', 'NOUN', 'B-L'),
        ('นางรอง', 'VERB', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", pos=False)
        [('เขา', 'O'),
        ('เรียน', 'O'),
        ('ที่', 'O'),
        ('โรงเรียน', 'B-L'),
        ('นางรอง', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True)
        'เขาเรียนที่<L>โรงเรียนนางรอง</L>'
    """
    if not text:
        return []
    list_word = []
    for i in word_tokenize(text, engine="tltk"):
        if i == " ":
            i = "<s/>"
        list_word.append(i)
    _pos = nlp.pos_tag_wordlist(list_word)
    sent_ner = [
        (_post_process(word), pos, ner) for word, pos, ner in nlp.ner(_pos)
    ]
    if tag:
        temp = ""
        sent = ""
        for idx, (word, pos, ner) in enumerate(sent_ner):
            if ner.startswith("B-") and temp != "":
                sent += "</" + temp + ">"
                temp = ner[2:]
                sent += "<" + temp + ">"
            elif ner.startswith("B-"):
                temp = ner[2:]
                sent += "<" + temp + ">"
            elif ner == "O" and temp != "":
                sent += "</" + temp + ">"
                temp = ""
            sent += word

            if idx == len(sent_ner) - 1 and temp != "":
                sent += "</" + temp + ">"

        return sent
    if pos is False:
        return [(word, ner) for word, pos, ner in sent_ner]
    return sent_ner
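To make the tag-building loop in get_ner easier to follow, here is a standalone sketch of the same IOB-to-inline-tag conversion, run on hand-written tuples mirroring the docstring example (no tltk install required):

from typing import List, Tuple

def iob_to_tagged_string(sent_ner: List[Tuple[str, str, str]]) -> str:
    # Same logic as the `if tag:` branch above: open a tag on B-*,
    # close the previous tag on a new B-* or on O, and close any
    # tag still open after the last token.
    temp = ""
    sent = ""
    for idx, (word, pos, ner) in enumerate(sent_ner):
        if ner.startswith("B-"):
            if temp != "":
                sent += "</" + temp + ">"
            temp = ner[2:]
            sent += "<" + temp + ">"
        elif ner == "O" and temp != "":
            sent += "</" + temp + ">"
            temp = ""
        sent += word
        if idx == len(sent_ner) - 1 and temp != "":
            sent += "</" + temp + ">"
    return sent

demo = [
    ("เขา", "PRON", "O"), ("เรียน", "VERB", "O"), ("ที่", "SCONJ", "O"),
    ("โรงเรียน", "NOUN", "B-L"), ("นางรอง", "VERB", "I-L"),
]
print(iob_to_tagged_string(demo))  # เขาเรียนที่<L>โรงเรียนนางรอง</L>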
14 changes: 14 additions & 0 deletions pythainlp/tokenize/core.py
@@ -86,6 +86,8 @@ def word_tokenize(
and combining tokens that are parts of the same named-entity.
* *sefr_cut* - wrapper for
`SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_.
* *tltk* - wrapper for
`TLTK <https://pypi.org/project/tltk/>`_.

:Note:
- The parameter **custom_dict** can be provided as an argument \
@@ -182,6 +184,10 @@ def word_tokenize(
elif engine == "sefr_cut":
from pythainlp.tokenize.sefr_cut import segment

segments = segment(text)
elif engine == "tltk":
from pythainlp.tokenize.tltk import segment

segments = segment(text)
else:
raise ValueError(
@@ -215,6 +221,7 @@ def sent_tokenize(
* *whitespace+newline* - split by whitespaces and newlines.
* *whitespace* - split by whitespaces. Specifically, with \
:class:`regex` pattern ``r" +"``
* *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.
:Example:

Split the text based on *whitespace*::
@@ -271,6 +278,10 @@
        segments = re.split(r" +", text, re.U)
    elif engine == "whitespace+newline":
        segments = text.split()
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import sent_tokenize as segment

        segments = segment(text)
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
@@ -314,6 +325,7 @@ def subword_tokenize(
* *wangchanberta* - SentencePiece from wangchanberta model.
* *dict* - newmm word tokenizer with a syllable dictionary
* *ssg* - CRF syllable segmenter for Thai
* *tltk* - syllable tokenizer from tltk

:Example:

@@ -376,6 +388,8 @@
        )
    elif engine == "ssg":
        from pythainlp.tokenize.ssg import segment
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import syllable_tokenize as segment
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
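A combined usage sketch for the three new tltk branches (assumes the `tltk` package is installed; exact outputs depend on tltk's models, so none are shown):

from pythainlp.tokenize import sent_tokenize, subword_tokenize, word_tokenize

text = "เขาเรียนที่โรงเรียนนางรอง"
words = word_tokenize(text, engine="tltk")         # pythainlp.tokenize.tltk.segment
sentences = sent_tokenize(text, engine="tltk")     # pythainlp.tokenize.tltk.sent_tokenize
syllables = subword_tokenize(text, engine="tltk")  # pythainlp.tokenize.tltk.syllable_tokenize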
34 changes: 34 additions & 0 deletions pythainlp/tokenize/tltk.py
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
from typing import List
from tltk.nlp import word_segment as tltk_segment
from tltk.nlp import syl_segment


def segment(text: str) -> List[str]:
    if not text or not isinstance(text, str):
        return []
    text = text.replace(" ", "<u/>")
    _temp = tltk_segment(text).replace("<u/>", " ").replace("<s/>", "")
    _temp = _temp.split('|')
    if _temp[-1] == "":
        del _temp[-1]
    return _temp


def syllable_tokenize(text: str) -> List[str]:
    if not text or not isinstance(text, str):
        return []
    _temp = syl_segment(text)
    _temp = _temp.split('~')
    if _temp[-1] == "<s/>":
        del _temp[-1]
    return _temp


def sent_tokenize(text: str) -> List[str]:
    text = text.replace(" ", "<u/>")
    _temp = tltk_segment(text).replace("<u/>", " ").replace("|", "")
    _temp = _temp.split('<s/>')
    if _temp[-1] == "":
        del _temp[-1]
    return _temp
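All three helpers above share one pattern: protect literal spaces as <u/>, run the tltk segmenter, then split on its delimiters and drop the empty trailing field. Here is a standalone sketch of just that post-processing step, fed a hand-written string in the delimiter format the wrapper expects ('|' between words, '<s/>' at sentence ends; this format is mirrored from the code above, not captured tltk output):

from typing import List

def split_tltk_words(raw: str) -> List[str]:
    # Mirrors segment(): restore spaces, drop sentence markers, split on '|'.
    cleaned = raw.replace("<u/>", " ").replace("<s/>", "")
    words = cleaned.split("|")
    if words and words[-1] == "":  # a '|' before '<s/>' leaves an empty field
        del words[-1]
    return words

print(split_tltk_words("เขา|เรียน|ที่|โรงเรียน|นางรอง|<s/>"))
# ['เขา', 'เรียน', 'ที่', 'โรงเรียน', 'นางรอง']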
23 changes: 21 additions & 2 deletions pythainlp/transliterate/core.py
@@ -23,6 +23,7 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
Transcription issued by Royal Institute of Thailand.
* *thai2rom* - a deep learning-based Thai romanization engine
(requires PyTorch).
* *tltk* - TLTK: Thai Language Toolkit

:Example:
::
@@ -35,6 +36,9 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
romanize("สามารถ", engine="thai2rom")
# output: 'samat'

romanize("สามารถ", engine="tltk")
# output: 'samat'

romanize("ภาพยนตร์", engine="royin")
# output: 'phapn'

@@ -47,6 +51,8 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:

if engine == "thai2rom":
from pythainlp.transliterate.thai2rom import romanize
elif engine == "tltk":
from pythainlp.transliterate.tltk import romanize
else: # use default engine "royin"
from pythainlp.transliterate.royin import romanize

@@ -67,10 +73,13 @@ def transliterate(
:rtype: str

:Options for engines:
- * *icu* - pyicu, based on International Components for Unicode (ICU)
- * *ipa* - epitran, output is International Phonetic Alphabet (IPA)
* *thaig2p* - (default) Thai Grapheme-to-Phoneme,
output is IPA (requires PyTorch)
+ * *icu* - pyicu, based on International Components for Unicode (ICU)
+ * *ipa* - epitran, output is International Phonetic Alphabet (IPA)
+ * *tltk_g2p* - Thai Grapheme-to-Phoneme from\
+ `TLTK <https://pypi.org/project/tltk/>`_.
+ * *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA)

:Example:
::
@@ -86,6 +95,12 @@
transliterate("สามารถ", engine="thaig2p")
# output: 's aː ˩˩˦ . m aː t̚ ˥˩'

transliterate("สามารถ", engine="tltk_ipa")
# output: 'saː5.maːt3'

transliterate("สามารถ", engine="tltk_g2p")
# output: 'saa4~maat2'

transliterate("ภาพยนตร์", engine="icu")
# output: 'p̣hāphyntr̒'

@@ -103,6 +118,10 @@
        from pythainlp.transliterate.pyicu import transliterate
    elif engine == "ipa":
        from pythainlp.transliterate.ipa import transliterate
    elif engine == "tltk_g2p":
        from pythainlp.transliterate.tltk import tltk_g2p as transliterate
    elif engine == "tltk_ipa":
        from pythainlp.transliterate.tltk import tltk_ipa as transliterate
    else:  # use default engine: "thaig2p"
        from pythainlp.transliterate.thaig2p import transliterate

17 changes: 17 additions & 0 deletions pythainlp/transliterate/tltk.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
from tltk.nlp import g2p, th2ipa, th2roman


def romanize(text: str) -> str:
    _temp = th2roman(text)
    return _temp[:_temp.rfind(" <s/>")].replace("<s/>", "")


def tltk_g2p(text: str) -> str:
    _temp = g2p(text).split("<tr/>")[1].replace("|<s/>", "").replace("|", " ")
    return _temp.replace("<s/>", "")


def tltk_ipa(text: str) -> str:
    _temp = th2ipa(text)
    return _temp[:_temp.rfind(" <s/>")].replace("<s/>", "")
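The romanize and tltk_ipa wrappers share one cleanup idiom: tltk appends a trailing sentence marker, and the wrapper cuts the string at the last " <s/>" and strips any markers that remain. A sketch on a hand-written sample (the raw string is a hypothetical illustration of the output shape of th2roman/th2ipa, not captured output):

def strip_sentence_marker(raw: str) -> str:
    # Cut at the last " <s/>", then drop any remaining markers.
    return raw[:raw.rfind(" <s/>")].replace("<s/>", "")

print(strip_sentence_marker("samat <s/>"))  # -> 'samat'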