diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py
index 05be19fdd..0c42130eb 100644
--- a/pythainlp/tokenize/attacut.py
+++ b/pythainlp/tokenize/attacut.py
@@ -7,16 +7,36 @@
 """
 from typing import List
 
-from attacut import tokenize
+from attacut import Tokenizer
 
 
-def segment(text: str) -> List[str]:
+class AttacutTokenizer:
+    def __init__(self, model="attacut-sc"):
+        self._MODEL_NAME = "attacut-sc"
+
+        if model == "attacut-c":
+            self._MODEL_NAME = "attacut-c"
+
+        self._tokenizer = Tokenizer(model=self._MODEL_NAME)
+
+    def tokenize(self, text: str) -> List[str]:
+        return self._tokenizer.tokenize(text)
+
+
+def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
     Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
     :param str text: text to be tokenized to words
+    :param str model: word tokenizer model to be used
     :return: list of words, tokenized from the text
+    :rtype: list[str]
+    **Options for model**
+        * *attacut-sc* (default) using both syllable and character features
+        * *attacut-c* using only character features
     """
     if not text or not isinstance(text, str):
         return []
 
-    return tokenize(text)
+    _tokenizer = AttacutTokenizer(model)
+
+    return _tokenizer.tokenize(text)
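
For context, a minimal usage sketch of the patched module follows. It is not part of the patch itself; it assumes the attacut package is installed (e.g. via pip) and that the module is importable as pythainlp.tokenize.attacut. The Thai sample strings are arbitrary placeholders, not examples from the source.

```python
# Minimal usage sketch for the new API introduced by this patch.
# Assumes the attacut package is installed; the Thai strings below
# are arbitrary sample inputs, not taken from the patch.
from pythainlp.tokenize.attacut import AttacutTokenizer, segment

# One-off tokenization via the module-level wrapper. Note that each
# call to segment() constructs a fresh AttacutTokenizer, which loads
# the underlying model again.
print(segment("ทดสอบการตัดคำ"))                      # default "attacut-sc" model
print(segment("ทดสอบการตัดคำ", model="attacut-c"))   # character-features-only model

# For repeated calls, reusing one AttacutTokenizer instance avoids
# reloading the model on every invocation.
tokenizer = AttacutTokenizer(model="attacut-sc")
for line in ["ทดสอบการตัดคำ", "ตัดคำภาษาไทย"]:
    print(tokenizer.tokenize(line))
```

This split is the main design point of the patch: segment() stays a convenient one-shot function, while the new AttacutTokenizer class lets callers cache the loaded model across many tokenization calls.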