48 changes: 34 additions & 14 deletions docs/api/tokenize.rst
@@ -19,34 +19,54 @@ Modules
 Tokenization Engines
 --------------------
 
-newmm
-+++++
-.. automodule:: pythainlp.tokenize.newmm
-.. autofunction:: pythainlp.tokenize.newmm.segment
+Word level
+----------
 
-longest
+attacut
 +++++++
-.. automodule:: pythainlp.tokenize.longest
+.. automodule:: pythainlp.tokenize.attacut
 
-pyicu
-+++++
-.. automodule:: pythainlp.tokenize.pyicu
+.. autoclass:: pythainlp.tokenize.attacut.AttacutTokenizer
+    :members:
 
 deepcut
 +++++++
 .. automodule:: pythainlp.tokenize.deepcut
 
-attacut
-+++++++
-.. automodule:: pythainlp.tokenize.attacut
+multi_cut
++++++++++
+.. automodule:: pythainlp.tokenize.multi_cut
 
-.. autoclass:: pythainlp.tokenize.attacut.AttacutTokenizer
-    :members:
+.. autofunction:: pythainlp.tokenize.multi_cut.segment
+.. autofunction:: pythainlp.tokenize.multi_cut.find_all_segment
+
+longest
++++++++
+.. automodule:: pythainlp.tokenize.longest
+
+.. autofunction:: pythainlp.tokenize.longest.segment
+
+pyicu
++++++
+.. automodule:: pythainlp.tokenize.pyicu
+
+nercut
+++++++
+.. automodule:: pythainlp.tokenize.nercut
+
+.. autofunction:: pythainlp.tokenize.nercut.segment
+
+newmm
++++++
+
+The default word tokenization engine.
+
+.. automodule:: pythainlp.tokenize.newmm
+
+.. autofunction:: pythainlp.tokenize.newmm.segment
 
 Subword level
 -------------
 
 tcc
 +++
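As a quick orientation to the reorganized engine list above, here is a minimal usage sketch (assuming a PyThaiNLP install that includes this change); any word-level engine is selected by name through word_tokenize:

from pythainlp.tokenize import word_tokenize

text = "ผมรักภาษาไทย"  # sample Thai input, "I love the Thai language"

# newmm is the default engine, so these two calls are equivalent
print(word_tokenize(text))
print(word_tokenize(text, engine="newmm"))

# the other documented engines are selected the same way
print(word_tokenize(text, engine="longest"))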
10 changes: 10 additions & 0 deletions pythainlp/tokenize/core.py
@@ -36,6 +36,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
     ['และ', 'คุณ', 'เล่น', 'มือถือ'],
     ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
     """
+    if not doc or not isinstance(doc, list):
+        return []
+
     from .crfcls import segment
 
     return segment(doc)
@@ -74,6 +77,9 @@ def word_tokenize(
     * *deepcut* - wrapper for
       `DeepCut <https://github.com/rkcosmos/deepcut>`_,
       learning-based approach
+    * *nercut* - Dictionary-based maximal matching word segmentation,
+      constrained by Thai Character Cluster (TCC) boundaries,
+      combining tokens that are parts of the same named entity
 
     :Note:
         - The parameter **custom_dict** can be provided as an argument \
@@ -162,6 +168,10 @@ def word_tokenize(
     elif engine == "icu":
         from .pyicu import segment
 
+        segments = segment(text)
+    elif engine == "nercut":
+        from .nercut import segment
+
         segments = segment(text)
     else:
         raise ValueError(
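A short sketch of the new engine value added above; the exact tokens depend on the ThaiNER model behind nercut, so the behavior described in the comment is illustrative rather than guaranteed:

from pythainlp.tokenize import word_tokenize

text = "วันที่ 5 มกราคม 2564"  # "on 5 January 2021"

# "nercut" dispatches to pythainlp.tokenize.nercut.segment; tokens that
# the tagger marks as one DATE entity come back merged into a single token
print(word_tokenize(text, engine="nercut"))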
77 changes: 77 additions & 0 deletions pythainlp/tokenize/nercut.py
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
"""
nercut 0.1

Dictionary-based maximal matching word segmentation, constrained by
Thai Character Cluster (TCC) boundaries, combining tokens that are
parts of the same named entity.

Code by Wannaphong Phatthiyaphaibun
"""
from typing import Iterable, List

from pythainlp.tag.named_entity import ThaiNameTagger

_thainer = ThaiNameTagger()


def segment(
text: str,
taglist: Iterable[str] = [
"ORGANIZATION",
"PERSON",
"PHONE",
"EMAIL",
"DATE",
"TIME",
],
) -> List[str]:
"""
    Dictionary-based maximal matching word segmentation, constrained by
    Thai Character Cluster (TCC) boundaries, combining tokens that are
    parts of the same named entity.

    :param str text: text to be tokenized into words
    :param list taglist: a list of named-entity tags to be used
    :return: list of words, tokenized from the text
"""
if not text or not isinstance(text, str):
return []

    tagged_words = _thainer.get_ner(text, pos=False)

    words = []
    combining_word = ""  # accumulates tokens of the entity in progress
    for curr_word, curr_tag in tagged_words:
        if curr_tag != "O":
            tag = curr_tag[2:]  # strip the "B-"/"I-" prefix
        else:
            tag = "O"

        if curr_tag.startswith("B-") and tag in taglist:
            # start of a new entity; flush any entity in progress
            if combining_word != "":
                words.append(combining_word)
            combining_word = curr_word
        elif (
            curr_tag.startswith("I-")
            and combining_word != ""
            and tag in taglist
        ):
            # continuation of the current entity
            combining_word += curr_word
        elif (
            curr_tag == "O"
            and combining_word != ""
        ):
            # entity ended; flush it before adding the outside token
            words.append(combining_word)
            combining_word = ""
            words.append(curr_word)
        else:
            # token outside any tracked entity; flush a pending entity
            # first so its words are not silently dropped
            if combining_word != "":
                words.append(combining_word)
            combining_word = ""
            words.append(curr_word)

if combining_word != "":
words.append(combining_word)

return words
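A usage sketch for calling the module directly with a narrowed taglist; entity boundaries come from ThaiNameTagger, so the actual grouping varies with the model:

from pythainlp.tokenize.nercut import segment

# merge only PERSON spans; DATE, TIME, and other entities stay split
print(segment("นายวรรณพงษ์เขียนโปรแกรม", taglist=["PERSON"]))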