From bbd5ba998ed09d81ca52db919a3b0354cd70d170 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 9 Oct 2020 20:31:49 +0700 Subject: [PATCH 1/7] Update attacut.py --- pythainlp/tokenize/attacut.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 05be19fdd..c36284c5f 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -7,16 +7,22 @@ """ from typing import List -from attacut import tokenize +from attacut import Tokenizer +_MODEL_NAME = "attacut-sc" +_tokenize = Tokenizer(model=_MODEL_NAME) - -def segment(text: str) -> List[str]: +def segment(text: str, model: str = "attacut-sc") -> List[str]: """ Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai :param str text: text to be tokenized to words + :param str model: word tokenizer model to be tokenized to words :return: list of words, tokenized from the text """ if not text or not isinstance(text, str): return [] - return tokenize(text) + if model != _MODEL_NAME: + _MODEL_NAME = model + _tokenize = Tokenizer(model=_MODEL_NAME) + + return _tokenize.tokenize(text) From 3939dcc1dbd0c1b98e45a488ed532dadd4600085 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 10 Oct 2020 01:37:02 +0700 Subject: [PATCH 2/7] Update Options for model (attacut) --- pythainlp/tokenize/attacut.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index c36284c5f..258bfa260 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -17,6 +17,10 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]: :param str text: text to be tokenized to words :param str model: word tokenizer model to be tokenized to words :return: list of words, tokenized from the text + :rtype: list[str] + **Options for model** + * *attacut-sc* (default) + * *attacut-c* """ if not text or not isinstance(text, str): return [] From 68cdd963e57e735e28ecbea8cdb8aa0d634ac834 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 10 Oct 2020 01:39:54 +0700 Subject: [PATCH 3/7] fixed PEP8 --- pythainlp/tokenize/attacut.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 258bfa260..9441a47a7 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -11,6 +11,7 @@ _MODEL_NAME = "attacut-sc" _tokenize = Tokenizer(model=_MODEL_NAME) + def segment(text: str, model: str = "attacut-sc") -> List[str]: """ Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai From e82f769734040f5d6bd1ec935b8c5e654d47843a Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 10 Oct 2020 01:46:32 +0700 Subject: [PATCH 4/7] Update attacut.py --- pythainlp/tokenize/attacut.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 9441a47a7..44971e5f1 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -23,6 +23,7 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]: * *attacut-sc* (default) * *attacut-c* """ + global _MODEL_NAME, _tokenize if not text or not isinstance(text, str): return [] From c4576b9585a6ed2da49ffd1a33ac19d809a688fa Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Oct 2020 07:51:13 +0100 Subject: [PATCH 5/7] Add more info about model options --- pythainlp/tokenize/attacut.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 44971e5f1..48a3e2c25 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -8,8 +8,9 @@ from typing import List from attacut import Tokenizer + _MODEL_NAME = "attacut-sc" -_tokenize = Tokenizer(model=_MODEL_NAME) +_tokenizer = Tokenizer(model=_MODEL_NAME) def segment(text: str, model: str = "attacut-sc") -> List[str]: @@ -20,15 +21,15 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]: :return: list of words, tokenized from the text :rtype: list[str] **Options for model** - * *attacut-sc* (default) - * *attacut-c* + * *attacut-sc* (default) using both syllable and character features + * *attacut-c* using only character feature """ - global _MODEL_NAME, _tokenize + global _MODEL_NAME, _tokenizer if not text or not isinstance(text, str): return [] if model != _MODEL_NAME: _MODEL_NAME = model - _tokenize = Tokenizer(model=_MODEL_NAME) + _tokenizer = Tokenizer(model=_MODEL_NAME) - return _tokenize.tokenize(text) + return _tokenizer.tokenize(text) From c0831698e2e127d8d437a2c0aceda0dd091b70bd Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 20 Oct 2020 21:54:49 +0700 Subject: [PATCH 6/7] Update attacut.py --- pythainlp/tokenize/attacut.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 48a3e2c25..0c4881689 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -9,8 +9,21 @@ from attacut import Tokenizer -_MODEL_NAME = "attacut-sc" -_tokenizer = Tokenizer(model=_MODEL_NAME) + +class attacut: + def __init__(self, model= "attacut-sc"): + if model == "attacut-sc": + self.load_attacut_sc() + else: + self.load_attacut_c() + def tokenize(self,text:str) -> List[str]: + return self._tokenizer.tokenize(text) + def load_attacut_sc(self): + self._MODEL_NAME = "attacut-sc" + self._tokenizer = Tokenizer(model=self._MODEL_NAME) + def load_attacut_c(self): + self._MODEL_NAME = "attacut-c" + self._tokenizer = Tokenizer(model=self._MODEL_NAME) def segment(text: str, model: str = "attacut-sc") -> List[str]: @@ -24,12 +37,9 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]: * *attacut-sc* (default) using both syllable and character features * *attacut-c* using only character feature """ - global _MODEL_NAME, _tokenizer if not text or not isinstance(text, str): return [] - if model != _MODEL_NAME: - _MODEL_NAME = model - _tokenizer = Tokenizer(model=_MODEL_NAME) + _tokenizer = attacut(model) return _tokenizer.tokenize(text) From 2b618716a3a2985aa959a598f2b8ef700b560d20 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 7 Dec 2020 12:18:04 +0700 Subject: [PATCH 7/7] Simplify attacut class --- pythainlp/tokenize/attacut.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 0c4881689..0c42130eb 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -10,20 +10,17 @@ from attacut import Tokenizer -class attacut: - def __init__(self, model= "attacut-sc"): - if model == "attacut-sc": - self.load_attacut_sc() - else: - self.load_attacut_c() - def tokenize(self,text:str) -> List[str]: - return self._tokenizer.tokenize(text) - def load_attacut_sc(self): +class AttacutTokenizer: + def __init__(self, model="attacut-sc"): self._MODEL_NAME = "attacut-sc" + + if model == "attacut-c": + self._MODEL_NAME = "attacut-c" + self._tokenizer = Tokenizer(model=self._MODEL_NAME) - def load_attacut_c(self): - self._MODEL_NAME = "attacut-c" - self._tokenizer = Tokenizer(model=self._MODEL_NAME) + + def tokenize(self, text: str) -> List[str]: + return self._tokenizer.tokenize(text) def segment(text: str, model: str = "attacut-sc") -> List[str]: @@ -40,6 +37,6 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]: if not text or not isinstance(text, str): return [] - _tokenizer = attacut(model) + _tokenizer = AttacutTokenizer(model) return _tokenizer.tokenize(text)