Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions pythainlp/tokenize/attacut.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,36 @@
"""
from typing import List

from attacut import tokenize
from attacut import Tokenizer


def segment(text: str) -> List[str]:
class AttacutTokenizer:
    """Thin wrapper around :class:`attacut.Tokenizer`.

    Holds a single pre-loaded AttaCut model so repeated calls to
    :meth:`tokenize` reuse the same underlying tokenizer instance.
    """

    def __init__(self, model="attacut-sc"):
        # Only "attacut-c" is accepted as an alternative; any other
        # value silently falls back to the default "attacut-sc" model.
        self._MODEL_NAME = "attacut-c" if model == "attacut-c" else "attacut-sc"
        self._tokenizer = Tokenizer(model=self._MODEL_NAME)

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into a list of word tokens using the loaded model."""
        return self._tokenizer.tokenize(text)


def segment(text: str, model: str = "attacut-sc") -> List[str]:
    """
    Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
    :param str text: text to be tokenized to words
    :param str model: word tokenizer model to be tokenized to words
    :return: list of words, tokenized from the text
    :rtype: list[str]
    **Options for model**
        * *attacut-sc* (default) using both syllable and character features
        * *attacut-c* using only character feature
    """
    # Guard clause: non-string or empty input yields no tokens.
    if not isinstance(text, str) or not text:
        return []

    return AttacutTokenizer(model).tokenize(text)