diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb index 2364a87c6..07ffbf589 100644 --- a/notebooks/test_wsd.ipynb +++ b/notebooks/test_wsd.ipynb @@ -30,7 +30,7 @@ } ], "source": [ - "print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))" + "print(get_sense(\"เขากำลังอบขนมคุกกี้\", \"คุกกี้\"))" ] }, { @@ -50,7 +50,7 @@ } ], "source": [ - "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))" + "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\", \"คุกกี้\"))" ] }, { @@ -68,7 +68,7 @@ } ], "source": [ - "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))" + "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\", \"คน\"))" ] }, { @@ -92,7 +92,7 @@ }, "outputs": [], "source": [ - "_w=thai_wsd_dict()" + "w = thai_wsd_dict()" ] }, { @@ -115,7 +115,7 @@ } ], "source": [ - "_w.keys()" + "w.keys()" ] }, { @@ -138,7 +138,7 @@ } ], "source": [ - "_w[\"word\"][0],_w[\"meaning\"][0]" + "w[\"word\"][0], w[\"meaning\"][0]" ] }, { diff --git a/pythainlp/ancient/aksonhan.py b/pythainlp/ancient/aksonhan.py index 5d22fba3f..0ec28b621 100644 --- a/pythainlp/ancient/aksonhan.py +++ b/pythainlp/ancient/aksonhan.py @@ -2,22 +2,22 @@ # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 from pythainlp.util import Trie -from pythainlp import thai_consonants,thai_tonemarks +from pythainlp import thai_consonants, thai_tonemarks from pythainlp.tokenize import Tokenizer from pythainlp.corpus import thai_orst_words _dict_aksonhan = {} for i in list(thai_consonants): - if i=="ร": + if i == "ร": continue for j in list(thai_tonemarks): - _dict_aksonhan[i+j+i] = "ั"+j+i - _dict_aksonhan[i+i+j+i] = i+"ั"+j+i - _dict_aksonhan[i+i] = "ั"+i + _dict_aksonhan[i + j + i] = "ั" + j + i + _dict_aksonhan[i + i + j + i] = i + "ั" + j + i + _dict_aksonhan[i + i] = "ั" + i _set_aksonhan = set(_dict_aksonhan.keys()) -_trie = Trie(list(_dict_aksonhan.keys())+list(thai_consonants)) -_tokenizer = Tokenizer(custom_dict=_trie,engine="mm") +_trie = Trie(list(_dict_aksonhan.keys()) + list(thai_consonants)) +_tokenizer = Tokenizer(custom_dict=_trie, engine="mm") _dict_thai = set(thai_orst_words()) # call Thai words @@ -52,8 +52,9 @@ def aksonhan_to_current(word: str) -> str: return word elif word in _set_aksonhan: return _dict_aksonhan[word] - elif word in _dict_thai: # word in Thai words + elif word in _dict_thai: # word in Thai words return word + _seg = _tokenizer.word_tokenize(word) _w = [] for i in _seg: @@ -61,4 +62,4 @@ def aksonhan_to_current(word: str) -> str: _w.append(_dict_aksonhan[i]) else: _w.append(i) - return ''.join(_w) + return "".join(_w) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index fce5b0032..a901102fc 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -10,12 +10,12 @@ "Tokenizer", "Trie", "clause_tokenize", + "paragraph_tokenize", "sent_tokenize", "subword_tokenize", "syllable_tokenize", - "word_tokenize", "word_detokenize", - "paragraph_tokenize", + "word_tokenize", ] from pythainlp.corpus import thai_syllables, thai_words @@ -33,12 +33,12 @@ from pythainlp.tokenize.core import ( Tokenizer, clause_tokenize, + paragraph_tokenize, sent_tokenize, subword_tokenize, syllable_tokenize, - word_tokenize, word_detokenize, - paragraph_tokenize, + word_tokenize, ) from pythainlp.corpus import get_corpus as _get_corpus diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index e59f18a42..f1854fd67 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -48,7 +48,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]: def word_detokenize( segments: Union[List[List[str]], List[str]], output: str = "str" -) -> Union[str, List[str]]: +) -> Union[List[str], str]: """ Word detokenizer. @@ -65,16 +65,18 @@ def word_detokenize( print(word_detokenize(["เรา", "เล่น"])) # output: เราเล่น """ - _list_all = [] + list_all = [] + if isinstance(segments[0], str): segments = [segments] + from pythainlp import thai_characters for i, s in enumerate(segments): - _list_sents = [] - _add_index = [] - _space_index = [] - _mark_index = [] + list_sents = [] + add_index = [] + space_index = [] + mark_index = [] for j, w in enumerate(s): if j > 0: # previous word @@ -85,35 +87,36 @@ def word_detokenize( and not w.isspace() and not p_w.isspace() ): - _list_sents.append(" ") - _add_index.append(j) + list_sents.append(" ") + add_index.append(j) # if previous word is number or other language and is not space elif p_w[0] not in thai_characters and not p_w.isspace(): - _list_sents.append(" ") - _add_index.append(j) + list_sents.append(" ") + add_index.append(j) # if word is Thai iteration mark elif w == "ๆ": if not p_w.isspace(): - _list_sents.append(" ") - _mark_index.append(j) - elif w.isspace() and j - 1 not in _space_index: - _space_index.append(j) - elif j - 1 in _mark_index: - _list_sents.append(" ") - _list_sents.append(w) - _list_all.append(_list_sents) + list_sents.append(" ") + mark_index.append(j) + elif w.isspace() and j - 1 not in space_index: + space_index.append(j) + elif j - 1 in mark_index: + list_sents.append(" ") + list_sents.append(w) + list_all.append(list_sents) + if output == "list": - return _list_all - else: - _text = [] - for i in _list_all: - _text.append("".join(i)) - return " ".join(_text) + return list_all + + text = [] + for i in list_all: + text.append("".join(i)) + return " ".join(text) def word_tokenize( text: str, - custom_dict: Trie = None, + custom_dict: Trie = Trie([]), engine: str = DEFAULT_WORD_TOKENIZE_ENGINE, keep_whitespace: bool = True, join_broken_num: bool = True, @@ -290,7 +293,7 @@ def word_tokenize( if isinstance(custom_dict, str): segments = segment(text, custom_dict=custom_dict) - elif not isinstance(custom_dict, str) and custom_dict is not None: + elif not isinstance(custom_dict, str) and not custom_dict: raise ValueError( f"""Tokenizer \"{engine}\": custom_dict must be a str. @@ -415,11 +418,12 @@ def sent_tokenize( segments = segment.split_into_sentences(text) elif engine.startswith("wtp"): if "-" not in engine: - _size="mini" + _size = "mini" else: _size = engine.split("-")[-1] from pythainlp.tokenize.wtsplit import tokenize as segment - segments = segment(text,size=_size,tokenize="sentence") + + segments = segment(text, size=_size, tokenize="sentence") else: raise ValueError( f"""Tokenizer \"{engine}\" not found. @@ -435,8 +439,8 @@ def sent_tokenize( def paragraph_tokenize( text: str, engine: str = "wtp-mini", - paragraph_threshold:float=0.5, - style:str='newline', + paragraph_threshold: float = 0.5, + style: str = "newline", ) -> List[List[str]]: """ Paragraph tokenizer. @@ -479,23 +483,25 @@ def paragraph_tokenize( """ if engine.startswith("wtp"): if "-" not in engine: - _size="mini" + size = "mini" else: - _size = engine.split("-")[-1] + size = engine.split("-")[-1] + from pythainlp.tokenize.wtsplit import tokenize as segment - segments = segment( - text, - size=_size, - tokenize="paragraph", - paragraph_threshold=paragraph_threshold, - style=style, - ) + segments = segment( + text, + size=size, + tokenize="paragraph", + paragraph_threshold=paragraph_threshold, + style=style, + ) else: raise ValueError( f"""Tokenizer \"{engine}\" not found. It might be a typo; if not, please consult our document.""" ) + return segments @@ -622,7 +628,7 @@ def subword_tokenize( def syllable_tokenize( text: str, - engine: str=DEFAULT_SYLLABLE_TOKENIZE_ENGINE, + engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE, keep_whitespace: bool = True, ) -> List[str]: """ @@ -652,9 +658,7 @@ def syllable_tokenize( It might be a typo; if not, please consult our document.""" ) return subword_tokenize( - text=text, - engine=engine, - keep_whitespace=keep_whitespace + text=text, engine=engine, keep_whitespace=keep_whitespace ) @@ -727,7 +731,7 @@ class Tokenizer: def __init__( self, - custom_dict: Union[Trie, Iterable[str], str] = None, + custom_dict: Union[Trie, Iterable[str], str] = [], engine: str = "newmm", keep_whitespace: bool = True, join_broken_num: bool = True, @@ -743,7 +747,7 @@ def __init__( :param bool keep_whitespace: True to keep whitespace, a common mark for end of phrase in Thai """ - self.__trie_dict = None + self.__trie_dict = Trie([]) if custom_dict: self.__trie_dict = dict_trie(custom_dict) else: diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index a8992c5c9..91adab9a0 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -21,7 +21,7 @@ def segment( - text: str, custom_dict: Union[Trie, List[str], str] = None + text: str, custom_dict: Union[Trie, List[str], str] = [] ) -> List[str]: if not text or not isinstance(text, str): return [] diff --git a/pythainlp/tokenize/han_solo.py b/pythainlp/tokenize/han_solo.py index 6bef737fc..3547b9bf4 100644 --- a/pythainlp/tokenize/han_solo.py +++ b/pythainlp/tokenize/han_solo.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-FileCopyrightText: Copyright 2019 Ponrawee Prasertsom # SPDX-License-Identifier: Apache-2.0 """ 🪿 Han-solo: Thai syllable segmenter @@ -8,54 +9,45 @@ """ from typing import List from pythainlp.corpus import path_pythainlp_corpus + try: import pycrfsuite except ImportError: - raise ImportError("ImportError; Install pycrfsuite by pip install python-crfsuite") + raise ImportError( + "ImportError; Install pycrfsuite by pip install python-crfsuite" + ) tagger = pycrfsuite.Tagger() -tagger.open(path_pythainlp_corpus('han_solo.crfsuite')) +tagger.open(path_pythainlp_corpus("han_solo.crfsuite")) class Featurizer: -# This class from ssg at https://github.com/ponrawee/ssg. -# Copyright 2019 Ponrawee Prasertsom - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# { -# "0 (current anchor)|+1 (the character on the right from anchor)|A (character)" : 1 -# } + # This class from ssg at https://github.com/ponrawee/ssg. def __init__(self, N=2, sequence_size=1, delimiter=None): self.N = N self.delimiter = delimiter self.radius = N + sequence_size - def pad(self, sentence, padder='#'): + def pad(self, sentence, padder="#"): return padder * (self.radius) + sentence + padder * (self.radius) - def featurize(self, sentence, padding=True, indiv_char=True, return_type='list'): + def featurize( + self, sentence, padding=True, indiv_char=True, return_type="list" + ): if padding: sentence = self.pad(sentence) all_features = [] all_labels = [] skip_next = False - for current_position in range(self.radius, len(sentence) - self.radius + 1): + for current_position in range( + self.radius, len(sentence) - self.radius + 1 + ): if skip_next: skip_next = False continue features = {} - if return_type == 'list': + if return_type == "list": features = [] cut = 0 char = sentence[current_position] @@ -63,13 +55,15 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list') cut = 1 skip_next = True counter = 0 - chars_left = '' - chars_right = '' - chars = '' - abs_index_left = current_position # left start at -1 - abs_index_right = current_position - 1 # right start at 0 + chars_left = "" + chars_right = "" + chars = "" + abs_index_left = current_position # left start at -1 + abs_index_right = current_position - 1 # right start at 0 while counter < self.radius: - abs_index_left -= 1 # สมมุติตำแหน่งที่ 0 จะได้ -1, -2, -3, -4, -5 (radius = 5) + abs_index_left -= ( + 1 # สมมุติตำแหน่งที่ 0 จะได้ -1, -2, -3, -4, -5 (radius = 5) + ) char_left = sentence[abs_index_left] while char_left == self.delimiter: abs_index_left -= 1 @@ -79,13 +73,15 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list') chars_left = char_left + chars_left # ใส่ลง feature if indiv_char: - left_key = '|'.join([str(relative_index_left), char_left]) - if return_type == 'dict': + left_key = "|".join([str(relative_index_left), char_left]) + if return_type == "dict": features[left_key] = 1 else: features.append(left_key) - abs_index_right += 1 # สมมุติคือตำแหน่งที่ 0 จะได้ 0, 1, 2, 3, 4 (radius = 5) + abs_index_right += ( + 1 # สมมุติคือตำแหน่งที่ 0 จะได้ 0, 1, 2, 3, 4 (radius = 5) + ) char_right = sentence[abs_index_right] while char_right == self.delimiter: abs_index_right += 1 @@ -93,8 +89,10 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list') relative_index_right = counter chars_right += char_right if indiv_char: - right_key = '|'.join([str(relative_index_right), char_right]) - if return_type == 'dict': + right_key = "|".join( + [str(relative_index_right), char_right] + ) + if return_type == "dict": features[right_key] = 1 else: features.append(right_key) @@ -103,31 +101,30 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list') chars = chars_left + chars_right for i in range(0, len(chars) - self.N + 1): - ngram = chars[i:i + self.N] - ngram_key = '|'.join([str(i - self.radius), ngram]) - if return_type == 'dict': + ngram = chars[i : i + self.N] + ngram_key = "|".join([str(i - self.radius), ngram]) + if return_type == "dict": features[ngram_key] = 1 else: features.append(ngram_key) all_features.append(features) - if return_type == 'list': + if return_type == "list": cut = str(cut) all_labels.append(cut) - return { - 'X': all_features, - 'Y': all_labels - } + return {"X": all_features, "Y": all_labels} + + _to_feature = Featurizer() def segment(text: str) -> List[str]: - x=_to_feature.featurize(text)["X"] + x = _to_feature.featurize(text)["X"] y_pred = tagger.tag(x) list_cut = [] - for j,k in zip(list(text),y_pred): - if k=="1": + for j, k in zip(list(text), y_pred): + if k == "1": list_cut.append(j) else: - list_cut[-1]+=j + list_cut[-1] += j return list_cut diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py index 722c47d99..4acecd61f 100644 --- a/pythainlp/tokenize/longest.py +++ b/pythainlp/tokenize/longest.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 """ Dictionary-based longest-matching Thai word segmentation. Implementation is based on the codes from Patorn Utenpattanun. @@ -40,7 +42,7 @@ _UNKNOWN = False -class LongestMatchTokenizer(): +class LongestMatchTokenizer: def __init__(self, trie: Trie): self.__trie = trie diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 7b318575f..63f958db0 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -28,7 +28,7 @@ # `|` is used as like "early return", # which divides "abc123" to "abc", "123" for example. _PAT_NONTHAI = re.compile( -r"""(?x) + r"""(?x) [-a-zA-Z]+| # Latin characters \d+([,\.]\d+)*| # numbers [ \t]+| # spaces diff --git a/pythainlp/tokenize/nlpo3.py b/pythainlp/tokenize/nlpo3.py index 3eaf75684..c5fa52b5c 100644 --- a/pythainlp/tokenize/nlpo3.py +++ b/pythainlp/tokenize/nlpo3.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -#SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 from sys import stderr from typing import List diff --git a/pythainlp/tokenize/thaisumcut.py b/pythainlp/tokenize/thaisumcut.py index eb12144e7..16d4da9e5 100644 --- a/pythainlp/tokenize/thaisumcut.py +++ b/pythainlp/tokenize/thaisumcut.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-FileCopyrightText: Copyright 2020 Nakhun Chumpolsathien +# SPDX-License-Identifier: Apache-2.0 """ The implementation of sentence segmentator from Nakhun Chumpolsathien, 2020 original codes are from: https://github.com/nakhunchumpolsathien/ThaiSum @@ -10,22 +13,6 @@ author={Chumpolsathien, Nakhun}, year={2020}, school={Beijing Institute of Technology} - -**ThaiSum License** - - Copyright [2020 [Nakhun Chumpolsathien] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. """ import re diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py index 111531d65..dcfaf61d0 100644 --- a/pythainlp/tokenize/wtsplit.py +++ b/pythainlp/tokenize/wtsplit.py @@ -14,28 +14,30 @@ def _tokenize( - text:str, - lang_code:str="th", - model:str="wtp-bert-mini", - tokenize:str="sentence", - paragraph_threshold:float=0.5, - style:str='newline', - )-> List[str]: - global _MODEL_NAME,_MODEL + text: str, + lang_code: str = "th", + model: str = "wtp-bert-mini", + tokenize: str = "sentence", + paragraph_threshold: float = 0.5, + style: str = "newline", +) -> List[str]: + global _MODEL_NAME, _MODEL + if _MODEL_NAME != model: _MODEL = WtP(model_name_or_model=model) _MODEL_NAME = model - if tokenize=="sentence": - return _MODEL.split(text,lang_code=lang_code) - else: # Paragraph - if style=='newline': + + if tokenize == "sentence": + return _MODEL.split(text, lang_code=lang_code) + else: # Paragraph + if style == "newline": return _MODEL.split( text, lang_code=lang_code, do_paragraph_segmentation=True, - paragraph_threshold=paragraph_threshold + paragraph_threshold=paragraph_threshold, ) - elif style=='opus100': + elif style == "opus100": return _MODEL.split( text, lang_code=lang_code, @@ -45,26 +47,28 @@ def _tokenize( ) else: raise ValueError( - f"""Segmentation style \"{style}\" not found. + f"""Segmentation style \"{style}\" not found. It might be a typo; if not, please consult our document.""" - ) + ) + def tokenize( - text:str, - size:str="mini", - tokenize:str="sentence", - paragraph_threshold:float=0.5, - style:str='newline', - )-> List[str]: - _model_load="" - if size=="tiny": - _model_load="wtp-bert-tiny" - elif size=="base": - _model_load="wtp-canine-s-1l" - elif size=="large": - _model_load="wtp-canine-s-12l" + text: str, + size: str = "mini", + tokenize: str = "sentence", + paragraph_threshold: float = 0.5, + style: str = "newline", +) -> List[str]: + _model_load = "" + if size == "tiny": + _model_load = "wtp-bert-tiny" + elif size == "base": + _model_load = "wtp-canine-s-1l" + elif size == "large": + _model_load = "wtp-canine-s-12l" else: # mini - _model_load="wtp-bert-mini" + _model_load = "wtp-bert-mini" + return _tokenize( text, model=_model_load, diff --git a/pythainlp/util/phoneme.py b/pythainlp/util/phoneme.py index 40aed32b1..d709cbb1c 100644 --- a/pythainlp/util/phoneme.py +++ b/pythainlp/util/phoneme.py @@ -9,72 +9,80 @@ from pythainlp.tokenize import Tokenizer consonants_ipa_nectec = [ - ("k","k","k^"), - ("kʰ","kh"), - ("ŋ","ng","ng^"), - ("tɕ","c"), - ("tɕʰ","ch"), - ("s","s"), - ("j","j","j^"), - ("d","d"), - ("t","y","t^"), - ("tʰ","th"), - ("n","n","n^"), - ("b","b"), - ("p","p","p^"), - ("pʰ","ph"), - ("f","f"), - ("m","m","m^"), - ("r","r"), - ("l","l"), - ("w","w","w^"), - ("h","h"), - ("?","z","z^") + ("k", "k", "k^"), + ("kʰ", "kh"), + ("ŋ", "ng", "ng^"), + ("tɕ", "c"), + ("tɕʰ", "ch"), + ("s", "s"), + ("j", "j", "j^"), + ("d", "d"), + ("t", "y", "t^"), + ("tʰ", "th"), + ("n", "n", "n^"), + ("b", "b"), + ("p", "p", "p^"), + ("pʰ", "ph"), + ("f", "f"), + ("m", "m", "m^"), + ("r", "r"), + ("l", "l"), + ("w", "w", "w^"), + ("h", "h"), + ("?", "z", "z^"), ] # ipa, initial, final monophthong_ipa_nectec = [ - ("i","i"), - ("e","e"), - ("ɛ","x"), - ("ɤ","q"), - ("a","a"), - ("am","am^"), - ("aj","aj^"), - ("aw","aw^"), - ("u","u"), - ("o","o"), - ("ɔ","@"), - ("ii","ii"), - ("ee","ee"), - ("ɛɛ","xx"), - ("ɯɯ","vv"), - ("ɤɤ","qq"), - ("aa","aa"), - ("uu","uu"), - ("oo","oo"), - ("","@@"), #-อ long + ("i", "i"), + ("e", "e"), + ("ɛ", "x"), + ("ɤ", "q"), + ("a", "a"), + ("am", "am^"), + ("aj", "aj^"), + ("aw", "aw^"), + ("u", "u"), + ("o", "o"), + ("ɔ", "@"), + ("ii", "ii"), + ("ee", "ee"), + ("ɛɛ", "xx"), + ("ɯɯ", "vv"), + ("ɤɤ", "qq"), + ("aa", "aa"), + ("uu", "uu"), + ("oo", "oo"), + ("", "@@"), # -อ long ] diphthong_ipa_nectec = [ - ("ia","ia"), - ("ɯa","va"), - ("ua","ua"), - ("iia","iia"), - ("ɯɯa","vva"), - ("uua","uua"), + ("ia", "ia"), + ("ɯa", "va"), + ("ua", "ua"), + ("iia", "iia"), + ("ɯɯa", "vva"), + ("uua", "uua"), ] tones_ipa_nectec = [ - ("˧","0"), - ("˨˩","1"), - ("˥˩","2"), - ("˦˥","3"), - ("˩˩˦","4"), + ("˧", "0"), + ("˨˩", "1"), + ("˥˩", "2"), + ("˦˥", "3"), + ("˩˩˦", "4"), ] -dict_nectec_to_ipa = {i[1]:i[0] for i in consonants_ipa_nectec+monophthong_ipa_nectec+diphthong_ipa_nectec+tones_ipa_nectec} -dict_nectec_to_ipa.update({i[2]:i[0] for i in consonants_ipa_nectec if len(i)>2}) +dict_nectec_to_ipa = { + i[1]: i[0] + for i in consonants_ipa_nectec + + monophthong_ipa_nectec + + diphthong_ipa_nectec + + tones_ipa_nectec +} +dict_nectec_to_ipa.update( + {i[2]: i[0] for i in consonants_ipa_nectec if len(i) > 2} +) def nectec_to_ipa(pronunciation: str) -> str: @@ -89,7 +97,7 @@ def nectec_to_ipa(pronunciation: str) -> str: :: from pythainlp.util import nectec_to_ipa - + print(nectec_to_ipa("kl-uua-j^-2")) # output : 'kl uua j ˥˩' @@ -97,23 +105,25 @@ def nectec_to_ipa(pronunciation: str) -> str: References ---------- - Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech processing and Speech corpus. In: Handbook of Thai Electronic Corpus. 1st ed. p. 122–56. + Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech processing \ + and Speech corpus. In: Handbook of Thai Electronic Corpus. \ + 1st ed. p. 122–56. """ - pronunciation = pronunciation.split("-") - _temp = [] - for i in pronunciation: - if i in dict_nectec_to_ipa.keys(): - _temp.append(dict_nectec_to_ipa[i]) + parts = pronunciation.split("-") + ipa = [] + for part in parts: + if part in dict_nectec_to_ipa.keys(): + ipa.append(dict_nectec_to_ipa[part]) else: - _temp.append(i) - return ' '.join(_temp) + ipa.append(part) + return " ".join(ipa) dict_ipa_rtgs = { - "b":"b", - "d":"d", - "f":"f", - "h":"h", + "b": "b", + "d": "d", + "f": "f", + "h": "h", # The conversion of j depends on its position in the syllable. # But, unfortunately, the current implementation cannot handle both cases. # To remove confusions without changing the behavior and breaking existing codes, @@ -121,68 +131,66 @@ def nectec_to_ipa(pronunciation: str) -> str: # as it would be overridden by the second one and thus never take effect from the beginning. # See #846 for a more detailed discussion: https://github.com/PyThaiNLP/pythainlp/issues/846 # "j":"y", - "k":"k", - "kʰ":"kh", - "l":"l", - "m":"m", - "n":"n", - "ŋ":"ng", - "p":"p", - "pʰ":"ph", - "r":"r", - "s":"s", - "t":"t", - "tʰ":"th", - "tɕ":"ch", - "tɕʰ":"ch", - "w":"w", - "ʔ":"", - "j":"i", - "a":"a", - "e":"e", - "ɛ":"ae", - "i":"i", - "o":"o", - "ɔ":"o", - "u":"u", - "ɯ":"ue", - "ɤ":"oe", - "aː":"a", - "eː":"e", - "ɛː":"ae", - "iː":"i", - "oː":"o", - "ɔː":"o", - "uː":"u", - "ɯː":"ue", - "ɤː":"oe", - "ia":"ia", - "ua":"ua", - "ɯa":"uea", - "aj":"ai", - "aw":"ao", - "ew":"eo", - "ɛw":"aeo", - "iw":"io", - "ɔj":"io", - "uj":"ui", - "aːj":"ai", - "aːw":"ao", - "eːw":"eo", - "ɛːw":"aeo", - "oːj":"oi", - "ɔːj":"oi", - "ɤːj":"oei", - "iaw":"iao", - "uaj":"uai", - "ɯaj":"ueai", - ".":".", + "k": "k", + "kʰ": "kh", + "l": "l", + "m": "m", + "n": "n", + "ŋ": "ng", + "p": "p", + "pʰ": "ph", + "r": "r", + "s": "s", + "t": "t", + "tʰ": "th", + "tɕ": "ch", + "tɕʰ": "ch", + "w": "w", + "ʔ": "", + "j": "i", + "a": "a", + "e": "e", + "ɛ": "ae", + "i": "i", + "o": "o", + "ɔ": "o", + "u": "u", + "ɯ": "ue", + "ɤ": "oe", + "aː": "a", + "eː": "e", + "ɛː": "ae", + "iː": "i", + "oː": "o", + "ɔː": "o", + "uː": "u", + "ɯː": "ue", + "ɤː": "oe", + "ia": "ia", + "ua": "ua", + "ɯa": "uea", + "aj": "ai", + "aw": "ao", + "ew": "eo", + "ɛw": "aeo", + "iw": "io", + "ɔj": "io", + "uj": "ui", + "aːj": "ai", + "aːw": "ao", + "eːw": "eo", + "ɛːw": "aeo", + "oːj": "oi", + "ɔːj": "oi", + "ɤːj": "oei", + "iaw": "iao", + "uaj": "uai", + "ɯaj": "ueai", + ".": ".", } -dict_ipa_rtgs_final = { - "w":"o" -} -trie = Trie(list(dict_ipa_rtgs.keys())+list(dict_ipa_rtgs_final.keys())) +dict_ipa_rtgs_final = {"w": "o"} +trie = Trie(list(dict_ipa_rtgs.keys()) + list(dict_ipa_rtgs_final.keys())) ipa_cut = Tokenizer(custom_dict=trie, engine="newmm") @@ -200,23 +208,30 @@ def ipa_to_rtgs(ipa: str) -> str: :: from pythainlp.util import ipa_to_rtgs - + print(ipa_to_rtgs("kluaj")) # output : 'kluai' """ - _temp = [] - _list_ipa = ipa_cut.word_tokenize(ipa) - for i,p in enumerate(_list_ipa): - if i == len(_list_ipa) -1 and p in list(dict_ipa_rtgs_final): - _temp.append(dict_ipa_rtgs_final[p]) - elif p in list(dict_ipa_rtgs): - _temp.append(dict_ipa_rtgs[p]) + rtgs_parts = [] + + ipa_parts = ipa_cut.word_tokenize(ipa) + for i, ipa_part in enumerate(ipa_parts): + if i == len(ipa_parts) - 1 and ipa_part in list(dict_ipa_rtgs_final): + rtgs_parts.append(dict_ipa_rtgs_final[ipa_part]) + elif ipa_part in list(dict_ipa_rtgs): + rtgs_parts.append(dict_ipa_rtgs[ipa_part]) else: - _temp.append(p) - _text = ''.join(_temp) - _text = unicodedata.normalize('NFKD', _text).encode('ascii', 'ignore') - return _text.decode("utf-8") + rtgs_parts.append(ipa_part) + + rtgs = "".join(rtgs_parts) + rtgs = ( + unicodedata.normalize("NFKD", rtgs) + .encode("ascii", "ignore") + .decode("utf-8") + ) + + return rtgs def remove_tone_ipa(ipa: str) -> str: @@ -231,7 +246,7 @@ def remove_tone_ipa(ipa: str) -> str: :: from pythainlp.util import remove_tone_ipa - + print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦")) # output : laː.sa.maj diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py index 0dc29d77a..11a826c00 100644 --- a/pythainlp/util/remove_trailing_repeat_consonants.py +++ b/pythainlp/util/remove_trailing_repeat_consonants.py @@ -7,7 +7,7 @@ from pythainlp.corpus import thai_words from pythainlp.util.trie import Trie from pythainlp import thai_consonants as consonants -from typing import Tuple, List +from typing import Iterable, List, Tuple # used by remove_trailing_repeat_consonants() # contains all words that has repeating consonants at the end @@ -19,7 +19,9 @@ def remove_trailing_repeat_consonants( - text: str, dictionary: Trie = None, has_dictionary_updated: bool = True + text: str, + custom_dict: Iterable[str] = [], + has_dictionary_updated: bool = True, ) -> str: """ Remove repeating consonants at the last of the sentence. @@ -58,8 +60,8 @@ def remove_trailing_repeat_consonants( # "อืมมม" is in the default dictionary # use custom dictionary - custom_dictionary = dict_trie(["อืมมมมม"]) - remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary) + custom_dict = dict_trie(["อืมมมมม"]) + remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dict) # output: อืมมมมม # long text @@ -69,12 +71,12 @@ def remove_trailing_repeat_consonants( # นี่เป็นความลับ """ # use default dictionary if not given - if dictionary is None: - dictionary = thai_words() + if not custom_dict: + custom_dict = thai_words() # update repeaters dictionary if not updated if has_dictionary_updated: - _update_consonant_repeaters(dictionary) + _update_consonant_repeaters(custom_dict) # seperate by newline modified_lines = [] @@ -167,7 +169,7 @@ def _remove_all_last_consonants(text: str, dup: str) -> str: return removed -def _update_consonant_repeaters(dictionary: Trie) -> None: +def _update_consonant_repeaters(custom_dict: Iterable[str]) -> None: """ Update dictionary of all words that has repeating consonants at the end from the dictionary. @@ -184,7 +186,7 @@ def _update_consonant_repeaters(dictionary: Trie) -> None: last_consonants_repeaters[consonant] = [] # register - for word in dictionary: + for word in custom_dict: if _is_last_consonant_repeater(word): last_consonants_repeaters[word[-1]].append(word) diff --git a/pythainlp/util/spell_words.py b/pythainlp/util/spell_words.py index 6305cd025..c2b83bbda 100644 --- a/pythainlp/util/spell_words.py +++ b/pythainlp/util/spell_words.py @@ -4,62 +4,79 @@ import re from typing import List from pythainlp import ( - thai_letters, - thai_consonants, - thai_lead_vowels, - thai_follow_vowels, thai_above_vowels, thai_below_vowels, - thai_tonemarks + thai_consonants, + thai_follow_vowels, + thai_lead_vowels, + thai_letters, + thai_tonemarks, ) -from pythainlp.tokenize import Tokenizer -from pythainlp.tokenize import subword_tokenize - - -_r1=["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"] -_r2=["–ั:วะ", "เ–ี:ยะ", "เ–ือะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"] -tonemarks={i: "ไม้"+j for i, j in zip(list(thai_tonemarks), ["เอก", "โท", "ตรี", "จัตวา"])} - -rule1=[i.replace("-", f"([{thai_letters}](thai_tonemarks)?)") for i in _r1] -rule2=[i.replace("–", f"([{thai_letters}])").replace(":", "") for i in _r2] -rule3=[i.replace("–", f"([{thai_letters}])").replace(":", f"([{thai_tonemarks}])") for i in _r2] -dict_vowel_ex={} -for i in _r1+_r2: - dict_vowel_ex[i.replace("-", "อ").replace("–", "อ").replace(":", "")]=i.replace("-", "อ").replace(":", "").replace("–", "อ") -dict_vowel={} -for i in _r1+_r2: - dict_vowel[i.replace("-", "อ").replace("–", "อ").replace(":", "")]=i.replace("-", "อ").replace(":", "").replace("–", "อ") +from pythainlp.tokenize import subword_tokenize, Tokenizer + + +_r1 = ["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"] +_r2 = ["–ั:วะ", "เ–ี:ยะ", "เ–ือะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"] +tonemarks = { + i: "ไม้" + j + for i, j in zip(list(thai_tonemarks), ["เอก", "โท", "ตรี", "จัตวา"]) +} + +rule1 = [i.replace("-", f"([{thai_letters}](thai_tonemarks)?)") for i in _r1] +rule2 = [i.replace("–", f"([{thai_letters}])").replace(":", "") for i in _r2] +rule3 = [ + i.replace("–", f"([{thai_letters}])").replace(":", f"([{thai_tonemarks}])") + for i in _r2 +] +dict_vowel_ex = {} +for i in _r1 + _r2: + dict_vowel_ex[i.replace("-", "อ").replace("–", "อ").replace(":", "")] = ( + i.replace("-", "อ").replace(":", "").replace("–", "อ") + ) +dict_vowel = {} +for i in _r1 + _r2: + dict_vowel[i.replace("-", "อ").replace("–", "อ").replace(":", "")] = ( + i.replace("-", "อ").replace(":", "").replace("–", "อ") + ) for i in thai_lead_vowels: - dict_vowel[i]=i+"อ" + dict_vowel[i] = i + "อ" for i in thai_follow_vowels: - dict_vowel[i]="อ"+i + dict_vowel[i] = "อ" + i for i in thai_above_vowels: - dict_vowel[i]="อ"+i + dict_vowel[i] = "อ" + i for i in thai_below_vowels: - dict_vowel[i]="อ"+i + dict_vowel[i] = "อ" + i -_cut=Tokenizer(list(dict_vowel.keys())+list(thai_consonants), engine="mm") +_cut = Tokenizer(list(dict_vowel.keys()) + list(thai_consonants), engine="mm") def _clean(w): - if bool(re.match('|'.join(rule3), w)): + if bool(re.match("|".join(rule3), w)): for r in rule3: if bool(re.match(r, w)): - _w=re.sub(r, "\\1==\\2==", w) - _temp=_w.split("==") - w=_temp[0]+r.replace(f"([{thai_letters}])", "อ").replace(f"([{thai_tonemarks}])", "")+_temp[1] - elif bool(re.match('|'.join(rule2), w)): + w = re.sub(r, "\\1==\\2==", w) + temp = w.split("==") + w = ( + temp[0] + + r.replace(f"([{thai_letters}])", "อ").replace( + f"([{thai_tonemarks}])", "" + ) + + temp[1] + ) + elif bool(re.match("|".join(rule2), w)): for r in rule2: if bool(re.match(r, w)): - w=re.sub(r, "\\1", w)+r.replace(f"([{thai_letters}])", "อ") - elif bool(re.match('|'.join(rule1), w)): + w = re.sub(r, "\\1", w) + r.replace(f"([{thai_letters}])", "อ") + elif bool(re.match("|".join(rule1), w)): for r in rule1: if bool(re.match(r, w)): - w=re.sub(r, "\\1", w)+r.replace(f"([{thai_letters}](thai_tonemarks)?)", "อ") + w = re.sub(r, "\\1", w) + r.replace( + f"([{thai_letters}](thai_tonemarks)?)", "อ" + ) return w -def spell_syllable(s: str)-> List[str]: +def spell_syllable(text: str) -> List[str]: """ Spell out syllables in Thai word distribution form. @@ -75,17 +92,16 @@ def spell_syllable(s: str)-> List[str]: print(spell_syllable("แมว")) # output: ['มอ', 'วอ', 'แอ', 'แมว'] """ - _t=s - s=_cut.word_tokenize(_clean(s)) - _c_only = [i+"อ" for i in s if i in set(thai_consonants)] - _v_only = [dict_vowel[i] for i in s if i in set(dict_vowel)] - _t_only = [tonemarks[i] for i in s if i in set(tonemarks.keys())] - _out=_c_only+_v_only+_t_only - _out.append(_t) - return _out + tokens = _cut.word_tokenize(_clean(text)) + + c_only = [tok + "อ" for tok in tokens if tok in set(thai_consonants)] + v_only = [dict_vowel[tok] for tok in tokens if tok in set(dict_vowel)] + t_only = [tonemarks[tok] for tok in tokens if tok in set(tonemarks.keys())] + + return c_only + v_only + t_only + [text] -def spell_word(w: str)-> List[str]: +def spell_word(text: str) -> List[str]: """ Spell out words in Thai word distribution form. @@ -101,10 +117,13 @@ def spell_word(w: str)-> List[str]: print(spell_word("คนดี")) # output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'] """ - _r=[] - _temp=subword_tokenize(w, engine="ssg") - for i in _temp: - _r.extend(spell_syllable(i)) - if len(_temp)>1: - _r.append(w) - return _r + spellouts = [] + tokens = subword_tokenize(text, engine="ssg") + + for tok in tokens: + spellouts.extend(spell_syllable(tok)) + + if len(tokens) > 1: + spellouts.append(text) + + return spellouts diff --git a/pythainlp/util/time.py b/pythainlp/util/time.py index be89b9702..27eb03a10 100644 --- a/pythainlp/util/time.py +++ b/pythainlp/util/time.py @@ -128,11 +128,7 @@ def _format( raise NotImplementedError(f"Time format not supported: {fmt}") if precision in ("m", "s"): - if ( - m == 30 - and (s == 0 or precision == "m") - and (fmt in ("6h", "m6h")) - ): + if m == 30 and (s == 0 or precision == "m") and (fmt in ("6h", "m6h")): text += "ครึ่ง" else: text += num_to_thaiword(m) + "นาที" diff --git a/pythainlp/util/trie.py b/pythainlp/util/trie.py index 0d0cda7f2..2b24ab79a 100644 --- a/pythainlp/util/trie.py +++ b/pythainlp/util/trie.py @@ -6,11 +6,11 @@ Designed to be used for tokenizer's dictionary, but can be for other purposes. """ -from typing import Iterable, List, Union +from typing import Iterable, Iterator, List, Union -class Trie: - class Node(): +class Trie(Iterable[str]): + class Node: __slots__ = "end", "children" def __init__(self): @@ -90,7 +90,7 @@ def prefixes(self, text: str) -> List[str]: def __contains__(self, key: str) -> bool: return key in self.words - def __iter__(self) -> Iterable[str]: + def __iter__(self) -> Iterator[str]: yield from self.words def __len__(self) -> int: @@ -106,7 +106,7 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: :return: a trie object :rtype: pythainlp.util.Trie """ - trie = None + trie = Trie([]) if isinstance(dict_source, str) and len(dict_source) > 0: # dict_source is a path to dictionary text file diff --git a/pythainlp/wangchanberta/__init__.py b/pythainlp/wangchanberta/__init__.py index e3ae86b37..e68c3d3b0 100644 --- a/pythainlp/wangchanberta/__init__.py +++ b/pythainlp/wangchanberta/__init__.py @@ -2,9 +2,13 @@ # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 __all__ = [ + "NamedEntityRecognition", "ThaiNameTagger", "segment", - "NamedEntityRecognition", ] -from pythainlp.wangchanberta.core import ThaiNameTagger, segment, NamedEntityRecognition +from pythainlp.wangchanberta.core import ( + NamedEntityRecognition, + ThaiNameTagger, + segment, +) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index c38b97ff3..93755bc99 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -52,7 +52,7 @@ def _clear_tag(self, tag): return tag.replace("B-", "").replace("I-", "") def get_ner( - self, text: str, pos: bool= False,tag: bool = False + self, text: str, pos: bool = False, tag: bool = False ) -> Union[List[Tuple[str, str]], str]: """ This function tags named entities in text in IOB format. @@ -61,15 +61,17 @@ def get_ner( :param str text: text in Thai to be tagged :param bool tag: output HTML-like tags. - :return: a list of tuples associated with tokenized word groups, NER tags, \ - and output HTML-like tags (if the parameter `tag` is \ - specified as `True`). \ - Otherwise, return a list of tuples associated with tokenized \ - words and NER tags + :return: a list of tuples associated with tokenized word groups,\ + NER tags, and output HTML-like tags (if the parameter `tag` is \ + specified as `True`). \ + Otherwise, return a list of tuples associated with tokenized \ + words and NER tags :rtype: Union[list[tuple[str, str]]], str """ if pos: - warnings.warn("This model doesn't support output of POS tags and it doesn't output the POS tags.") + warnings.warn( + "This model doesn't support output of POS tags and it doesn't output the POS tags." + ) text = re.sub(" ", "<_>", text) self.json_ner = self.classify_tokens(text) self.output = "" @@ -128,7 +130,9 @@ def get_ner( class NamedEntityRecognition: - def __init__(self, model: str ="pythainlp/thainer-corpus-v2-base-model") -> None: + def __init__( + self, model: str = "pythainlp/thainer-corpus-v2-base-model" + ) -> None: """ This function tags named entities in text in IOB format. @@ -138,24 +142,27 @@ def __init__(self, model: str ="pythainlp/thainer-corpus-v2-base-model") -> None """ from transformers import AutoTokenizer from transformers import AutoModelForTokenClassification + self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForTokenClassification.from_pretrained(model) + def _fix_span_error(self, words, ner): _ner = [] - _ner=ner - _new_tag=[] - for i,j in zip(words,_ner): - i=self.tokenizer.decode(i) + _ner = ner + _new_tag = [] + for i, j in zip(words, _ner): + i = self.tokenizer.decode(i) if i.isspace() and j.startswith("B-"): - j="O" + j = "O" if i in ("", "", ""): continue - if i=="<_>": - i=" " - _new_tag.append((i,j)) + if i == "<_>": + i = " " + _new_tag.append((i, j)) return _new_tag + def get_ner( - self, text: str, pos: bool= False,tag: bool = False + self, text: str, pos: bool = False, tag: bool = False ) -> Union[List[Tuple[str, str]], str]: """ This function tags named entities in text in IOB format. @@ -172,18 +179,27 @@ def get_ner( :rtype: Union[list[tuple[str, str]]], str """ import torch + if pos: - warnings.warn("This model doesn't support output postag and It doesn't output the postag.") + warnings.warn( + "This model doesn't support output postag and It doesn't output the postag." + ) words_token = word_tokenize(text.replace(" ", "<_>")) - inputs=self.tokenizer(words_token,is_split_into_words=True,return_tensors="pt") + inputs = self.tokenizer( + words_token, is_split_into_words=True, return_tensors="pt" + ) ids = inputs["input_ids"] mask = inputs["attention_mask"] # forward pass outputs = self.model(ids, attention_mask=mask) logits = outputs[0] predictions = torch.argmax(logits, dim=2) - predicted_token_class = [self.model.config.id2label[t.item()] for t in predictions[0]] - ner_tag=self._fix_span_error(inputs['input_ids'][0],predicted_token_class) + predicted_token_class = [ + self.model.config.id2label[t.item()] for t in predictions[0] + ] + ner_tag = self._fix_span_error( + inputs["input_ids"][0], predicted_token_class + ) if tag: temp = "" sent = "" diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py index 7e97aae8a..62b26f6ae 100644 --- a/pythainlp/wsd/core.py +++ b/pythainlp/wsd/core.py @@ -9,39 +9,50 @@ _wsd_dict = thai_wsd_dict() _mean_all = {} -for i,j in zip(_wsd_dict["word"], _wsd_dict["meaning"]): - _mean_all[i]=j + +for i, j in zip(_wsd_dict["word"], _wsd_dict["meaning"]): + _mean_all[i] = j + _all_word = set(list(_mean_all.keys())) _TRIE = Trie(list(_all_word)) _word_cut = Tokenizer(custom_dict=_TRIE) +_MODEL = None + class _SentenceTransformersModel: - def __init__(self, model:str="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", device:str="cpu"): + def __init__( + self, + model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", + device: str = "cpu", + ): from sentence_transformers import SentenceTransformer + self.device = device self.model_name = model self.model = SentenceTransformer(self.model_name, device=self.device) + def change_device(self, device: str): from sentence_transformers import SentenceTransformer + self.device = device self.model = SentenceTransformer(self.model_name, device=self.device) - def get_score(self, sentences1: str,sentences2: str)->float: + + def get_score(self, sentences1: str, sentences2: str) -> float: from sentence_transformers import util - embedding_1= self.model.encode(sentences1, convert_to_tensor=True) - embedding_2 = self.model.encode(sentences2, convert_to_tensor=True) - return 1-util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item() -_MODEL = None + embedding_1 = self.model.encode(sentences1, convert_to_tensor=True) + embedding_2 = self.model.encode(sentences2, convert_to_tensor=True) + return 1 - util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item() def get_sense( sentence: str, word: str, - device: str="cpu", - custom_dict: Union[dict,None]=None, - custom_tokenizer: Tokenizer=_word_cut, -) -> Union[List[Tuple[str, float]], None]: + device: str = "cpu", + custom_dict: dict = dict(), + custom_tokenizer: Tokenizer = _word_cut, +) -> List[Tuple[str, float]]: """ Get word sense from the sentence. This function will get definition and distance from context in sentence. @@ -50,19 +61,23 @@ def get_sense( :param str word: Thai word :param str device: device for running model on. :param dict custom_dict: Thai dictionary {"word":["definition",..]} - :param Tokenizer custom_tokenizer: Tokenizer used to tokenize words in sentence. - :return: list of definitions and distances (1 - cos_sim) or None (If word is not in the dictionary) - :rtype: Union[List[Tuple[str, float]], None] + :param Tokenizer custom_tokenizer: Tokenizer used to tokenize words in \ + sentence. + :return: a list of definitions and distances (1 - cos_sim) or \ + an empty list (if word is not in the dictionary) + :rtype: List[Tuple[str, float]] - We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised \ - Word Sense Disambiguation `_ to build get_sense function. + We get the ideas from `Context-Aware Semantic Similarity Measurement for \ + Unsupervised Word Sense Disambiguation \ + `_ to build get_sense function. - For Thai dictionary, we use Thai dictionary from wiktionary. - See more `thai_dict `_. + Use Thai dictionary from wiktionary. + See `thai_dict `_. - For the model, we use sentence transformers model from \ - `sentence-transformers/paraphrase-multilingual-mpnet-base-v2 `_ for \ - unsupervised word sense disambiguation. + Use sentence transformers model from \ + `sentence-transformers/paraphrase-multilingual-mpnet-base-v2 \ + `_ \ + for unsupervised word sense disambiguation. :Example: :: @@ -83,22 +98,31 @@ def get_sense( # 0.12473666667938232)] """ global _MODEL - if custom_dict is None: + if not custom_dict: custom_dict = _mean_all - _w = custom_tokenizer.word_tokenize(sentence) + + w = custom_tokenizer.word_tokenize(sentence) if word not in set(custom_dict.keys()) or word not in sentence: - return None - if _MODEL is None: + return [] + + if not _MODEL: _MODEL = _SentenceTransformersModel(device=device) - if _MODEL.device!=device: + if _MODEL.device != device: _MODEL.change_device(device=device) - _temp_mean = custom_dict[word] - _temp =[] - for i in _temp_mean: + + temp_mean = custom_dict[word] + temp = [] + for i in temp_mean: _temp_2 = [] - for j in _w: + for j in w: if j == word: - j = word+f" ({word} ความหมาย '"+i.replace('(',"").replace(')',"")+"') " + j = ( + word + + f" ({word} ความหมาย '" + + i.replace("(", "").replace(")", "") + + "') " + ) _temp_2.append(j) - _temp.append((i,_MODEL.get_score(sentence,''.join(_temp_2)))) - return _temp + temp.append((i, _MODEL.get_score(sentence, "".join(_temp_2)))) + + return temp diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index 0abd403e9..c504731da 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -5,6 +5,7 @@ import pandas as pd import torch + # fastai import fastai from fastai.text import * @@ -213,15 +214,15 @@ def test_process_thai_dense(self): def test_document_vector(self): imdb = untar_data(URLs.IMDB_SAMPLE) - dummy_df = pd.read_csv(imdb/'texts.csv') + dummy_df = pd.read_csv(imdb / "texts.csv") thwiki = THWIKI_LSTM - thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb')) + thwiki_itos = pickle.load(open(thwiki["itos_fname"], "rb")) thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos) tt = Tokenizer( tok_func=ThaiTokenizer, - lang='th', + lang="th", pre_rules=pre_rules_th, - post_rules=post_rules_th + post_rules=post_rules_th, ) processor = [ TokenizeProcessor( @@ -229,14 +230,11 @@ def test_document_vector(self): ), NumericalizeProcessor( vocab=thwiki_vocab, max_vocab=60000, min_freq=3 - ) + ), ] data_lm = ( TextList.from_df( - dummy_df, - imdb, - cols=['text'], - processor=processor + dummy_df, imdb, cols=["text"], processor=processor ) .split_by_rand_pct(0.2) .label_for_lm() @@ -255,28 +253,22 @@ def test_document_vector(self): "hidden_p": 0.1, "input_p": 0.2, "embed_p": 0.02, - "weight_p": 0.15 + "weight_p": 0.15, } trn_args = {"drop_mult": 0.9, "clip": 0.12, "alpha": 2, "beta": 1} learn = language_model_learner( - data_lm, - AWD_LSTM, - config=config, - pretrained=False, - **trn_args + data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args ) learn.load_pretrained(**thwiki) + self.assertIsNotNone(document_vector("วันนี้วันดีปีใหม่", learn, data_lm)) self.assertIsNotNone( - document_vector('วันนี้วันดีปีใหม่', learn, data_lm) - ) - self.assertIsNotNone( - document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg="sum") + document_vector("วันนี้วันดีปีใหม่", learn, data_lm, agg="sum") ) with self.assertRaises(ValueError): - document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg='abc') + document_vector("วันนี้วันดีปีใหม่", learn, data_lm, agg="abc") def test_merge_wgts(self): - wgts = {'0.encoder.weight': torch.randn(5,3)} + wgts = {"0.encoder.weight": torch.randn(5, 3)} itos_pre = ["แมว", "คน", "หนู"] itos_new = ["ปลา", "เต่า", "นก"] em_sz = 3 diff --git a/tests/test_util.py b/tests/test_util.py index ee02a278c..85a03ddc3 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -61,13 +61,12 @@ ipa_to_rtgs, remove_tone_ipa, tis620_to_utf8, - remove_trailing_repeat_consonants + remove_trailing_repeat_consonants, ) from pythainlp.util.spell_words import spell_word class TestUtilPackage(unittest.TestCase): - # ### pythainlp.util.collate def test_collate(self): @@ -102,9 +101,7 @@ def test_number(self): ) self.assertEqual(thaiword_to_num("สองล้านสามแสนหกร้อยสิบสอง"), 2300612) self.assertEqual(thaiword_to_num("หนึ่งร้อยสิบล้าน"), 110000000) - self.assertEqual( - thaiword_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072 - ) + self.assertEqual(thaiword_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072) self.assertEqual(thaiword_to_num("หนึ่งล้านล้าน"), 1000000000000) self.assertEqual( thaiword_to_num("สองแสนสี่หมื่นสามสิบล้านสี่พันล้าน"), @@ -137,9 +134,7 @@ def test_number(self): ) self.assertEqual(words_to_num("สองล้านสามแสนหกร้อยสิบสอง"), 2300612) self.assertEqual(words_to_num("หนึ่งร้อยสิบล้าน"), 110000000) - self.assertEqual( - words_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072 - ) + self.assertEqual(words_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072) self.assertEqual(words_to_num("หนึ่งล้านล้าน"), 1000000000000) self.assertEqual( words_to_num("สองแสนสี่หมื่นสามสิบล้านสี่พันล้าน"), @@ -149,15 +144,9 @@ def test_number(self): self.assertEqual(words_to_num("ลบหนึ่ง"), -1) text = "ลบหนึ่งร้อยล้านสี่แสนห้าพันยี่สิบเอ็ด" self.assertEqual(num_to_thaiword(words_to_num(text)), text) - self.assertIsNotNone( - text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด") - ) - self.assertIsNotNone( - text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท") - ) - self.assertIsNotNone( - text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้า") - ) + self.assertIsNotNone(text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด")) + self.assertIsNotNone(text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท")) + self.assertIsNotNone(text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้า")) self.assertEqual( arabic_digit_to_thai_digit("ไทยแลนด์ 4.0"), "ไทยแลนด์ ๔.๐" @@ -293,16 +282,10 @@ def test_thai_strftime(self): def test_time_to_thaiword(self): self.assertEqual(time_to_thaiword("8:17"), time_to_thaiword("08:17")) self.assertEqual(time_to_thaiword("8:17"), "แปดนาฬิกาสิบเจ็ดนาที") - self.assertEqual( - time_to_thaiword("8:17", "6h"), "สองโมงเช้าสิบเจ็ดนาที" - ) + self.assertEqual(time_to_thaiword("8:17", "6h"), "สองโมงเช้าสิบเจ็ดนาที") self.assertEqual(time_to_thaiword("8:17", "m6h"), "แปดโมงสิบเจ็ดนาที") - self.assertEqual( - time_to_thaiword("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง" - ) - self.assertEqual( - time_to_thaiword(time(12, 3, 0)), "สิบสองนาฬิกาสามนาที" - ) + self.assertEqual(time_to_thaiword("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง") + self.assertEqual(time_to_thaiword(time(12, 3, 0)), "สิบสองนาฬิกาสามนาที") self.assertEqual( time_to_thaiword(time(12, 3, 1)), "สิบสองนาฬิกาสามนาทีหนึ่งวินาที", @@ -320,9 +303,7 @@ def test_time_to_thaiword(self): "เที่ยงครึ่ง", ) self.assertEqual(time_to_thaiword("18:30"), "สิบแปดนาฬิกาสามสิบนาที") - self.assertEqual( - time_to_thaiword("18:30:00"), "สิบแปดนาฬิกาสามสิบนาที" - ) + self.assertEqual(time_to_thaiword("18:30:00"), "สิบแปดนาฬิกาสามสิบนาที") self.assertEqual( time_to_thaiword("18:30:01"), "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที" ) @@ -389,9 +370,7 @@ def test_thaiword_to_time(self): self.assertEqual(thaiword_to_time("สิบโมงเช้าสิบสองนาที"), "10:12") self.assertEqual(thaiword_to_time("บ่ายโมงสิบสามนาที"), "13:13") self.assertEqual(thaiword_to_time("ศูนย์นาฬิกาสิบเอ็ดนาที"), "00:11") - self.assertEqual( - thaiword_to_time("บ่ายโมงเย็นสามสิบเอ็ดนาที"), "13:31" - ) + self.assertEqual(thaiword_to_time("บ่ายโมงเย็นสามสิบเอ็ดนาที"), "13:31") self.assertEqual(thaiword_to_time("เที่ยงคืนหนึ่งนาที"), "00:01") self.assertEqual(thaiword_to_time("เที่ยงครึ่ง"), "12:30") self.assertEqual(thaiword_to_time("ห้าโมงเย็นสามสิบสี่นาที"), "17:34") @@ -412,9 +391,7 @@ def test_thaiword_to_time(self): def test_thaiword_to_date(self): now = datetime.now() - self.assertEqual( - now + timedelta(days=0), thaiword_to_date("วันนี้", now) - ) + self.assertEqual(now + timedelta(days=0), thaiword_to_date("วันนี้", now)) self.assertEqual( now + timedelta(days=1), thaiword_to_date("พรุ่งนี้", now), @@ -548,52 +525,25 @@ def test_normalize(self): # maiyamok self.assertEqual( maiyamok("เด็กๆชอบไปโรงเรียน"), - ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน'] - ) - self.assertEqual( - maiyamok([ - "ทำไม", - "คน", - "ดี", - " ", - "ๆ", - "ๆ", - " ", - "ถึง", - "ทำ", - "ไม่ได้" - ]), - ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"] - ) - self.assertEqual( - maiyamok([ - "ทำไม", - "คน", - "ดี", - " ", - " ๆ", - "ๆ", - " ", - "ถึง", - "ทำ", - "ไม่ได้" - ]), - ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"] - ) - self.assertEqual( - maiyamok([ - "ทำไม", - "คน", - "ดีๆ", - " ", - "ๆ", - "ๆ", - " ", - "ถึง", - "ทำ", - "ไม่ได้" - ]), - ["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"] + ["เด็ก", "เด็ก", "ชอบ", "ไป", "โรงเรียน"], + ) + self.assertEqual( + maiyamok( + ["ทำไม", "คน", "ดี", " ", "ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"] + ), + ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"], + ) + self.assertEqual( + maiyamok( + ["ทำไม", "คน", "ดี", " ", " ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"] + ), + ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"], + ) + self.assertEqual( + maiyamok( + ["ทำไม", "คน", "ดีๆ", " ", "ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"] + ), + ["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"], ) # ### pythainlp.util.thai @@ -611,34 +561,34 @@ def test_count_thai_chars(self): self.assertEqual( count_thai_chars("ทดสอบภาษาไทย"), { - 'vowels': 3, - 'lead_vowels': 1, - 'follow_vowels': 2, - 'above_vowels': 0, - 'below_vowels': 0, - 'consonants': 9, - 'tonemarks': 0, - 'signs': 0, - 'thai_digits': 0, - 'punctuations': 0, - 'non_thai': 0, - } + "vowels": 3, + "lead_vowels": 1, + "follow_vowels": 2, + "above_vowels": 0, + "below_vowels": 0, + "consonants": 9, + "tonemarks": 0, + "signs": 0, + "thai_digits": 0, + "punctuations": 0, + "non_thai": 0, + }, ) self.assertEqual( count_thai_chars("มี ๕ บาทไหม๏ เกมส์หรือเกมกันแน่ที่กรุเทพฯ ใช้"), { - 'vowels': 12, - 'lead_vowels': 6, - 'follow_vowels': 1, - 'above_vowels': 4, - 'below_vowels': 1, - 'consonants': 22, - 'tonemarks': 3, - 'signs': 2, - 'thai_digits': 1, - 'punctuations': 1, - 'non_thai': 4, - } + "vowels": 12, + "lead_vowels": 6, + "follow_vowels": 1, + "above_vowels": 4, + "below_vowels": 1, + "consonants": 22, + "tonemarks": 3, + "signs": 2, + "thai_digits": 1, + "punctuations": 1, + "non_thai": 4, + }, ) def test_isthaichar(self): @@ -687,13 +637,8 @@ def test_display_thai_char(self): def test_emoji_to_thai(self): self.assertEqual( - emoji_to_thai( - "จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀" - ), - ( - "จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ " - "ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:" - ), + emoji_to_thai("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀"), + ("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ " "ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:"), ) self.assertEqual( emoji_to_thai("หิวข้าวอยากกินอาหารญี่ปุ่น 🍣"), @@ -787,16 +732,13 @@ def test_to_idna(self): def test_thai_word_tone_detector(self): self.assertIsNotNone(thai_word_tone_detector("คนดี")) self.assertEqual( - thai_word_tone_detector("ราคา"), - [('รา', 'm'), ('คา', 'm')] + thai_word_tone_detector("ราคา"), [("รา", "m"), ("คา", "m")] ) def test_thai_strptime(self): self.assertIsNotNone( thai_strptime( - "05-7-65 09:00:01.10600", - "%d-%B-%Y %H:%M:%S.%f", - year="be" + "05-7-65 09:00:01.10600", "%d-%B-%Y %H:%M:%S.%f", year="be" ) ) self.assertIsNotNone( @@ -804,14 +746,12 @@ def test_thai_strptime(self): "24-6-75 09:00:00", "%d-%B-%Y %H:%M:%S", year="be", - add_year="2400" + add_year="2400", ) ) self.assertIsNotNone( thai_strptime( - "05-7-22 09:00:01.10600", - "%d-%B-%Y %H:%M:%S.%f", - year="ad" + "05-7-22 09:00:01.10600", "%d-%B-%Y %H:%M:%S.%f", year="ad" ) ) self.assertIsNotNone( @@ -819,7 +759,7 @@ def test_thai_strptime(self): "05-7-99 09:00:01.10600", "%d-%B-%Y %H:%M:%S.%f", year="ad", - add_year="1900" + add_year="1900", ) ) @@ -837,11 +777,12 @@ def test_convert_years(self): self.assertEqual(convert_years("242", src="re", target="ad"), "2023") self.assertEqual(convert_years("242", src="re", target="ah"), "1444") with self.assertRaises(NotImplementedError): - self.assertIsNotNone(convert_years( - "2023", src="cat", target="dog")) + self.assertIsNotNone( + convert_years("2023", src="cat", target="dog") + ) def test_nectec_to_ipa(self): - self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), 'kl uua j ˥˩') + self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), "kl uua j ˥˩") def test_ipa_to_rtgs(self): self.assertEqual(ipa_to_rtgs("kluaj"), "kluai") @@ -852,15 +793,17 @@ def test_remove_tone_ipa(self): self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj") def test_tis620_to_utf8(self): - self.assertEqual(tis620_to_utf8( - "¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม") + self.assertEqual( + tis620_to_utf8("¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม" + ) def test_spell_word(self): - self.assertEqual(spell_word("เสือ"), ['สอ', 'เอือ', 'เสือ']) - self.assertEqual(spell_word("เสื้อ"), ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']) - self.assertEqual(spell_word("คน"), ['คอ', 'นอ', 'คน']) - self.assertEqual(spell_word("คนดี"), [ - 'คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']) + self.assertEqual(spell_word("เสือ"), ["สอ", "เอือ", "เสือ"]) + self.assertEqual(spell_word("เสื้อ"), ["สอ", "เอือ", "ไม้โท", "เสื้อ"]) + self.assertEqual(spell_word("คน"), ["คอ", "นอ", "คน"]) + self.assertEqual( + spell_word("คนดี"), ["คอ", "นอ", "คน", "ดอ", "อี", "ดี", "คนดี"] + ) def test_rhyme(self): self.assertIsInstance(rhyme("แมว"), list) @@ -869,26 +812,24 @@ def test_rhyme(self): def test_remove_repeat_consonants(self): # update of pythainlp.copus.thai_words() able to break this self.assertEqual( - remove_trailing_repeat_consonants('เริ่ดดดดดดดด'), - 'เริ่ด' + remove_trailing_repeat_consonants("เริ่ดดดดดดดด"), "เริ่ด" ) self.assertEqual( - remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม'), - 'อืมมม' + remove_trailing_repeat_consonants("อืมมมมมมมมมมมมมมม"), "อืมมม" ) - custom_dictionary = dict_trie(["อืมมมมม"]) + custom_dict = dict_trie(["อืมมมมม"]) self.assertEqual( - remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary), - 'อืมมมมม' + remove_trailing_repeat_consonants("อืมมมมมมมมมมมมมมม", custom_dict), + "อืมมมมม", ) self.assertEqual( remove_trailing_repeat_consonants( - 'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด ' - 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ' + "อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด " + "ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ" ), - 'อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ' + "อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ", ) # def test_abbreviation_to_full_text(self): diff --git a/tests/test_wsd.py b/tests/test_wsd.py index b58fe76fa..e6666a7dc 100644 --- a/tests/test_wsd.py +++ b/tests/test_wsd.py @@ -5,6 +5,6 @@ class TestWsdPackage(unittest.TestCase): def test_get_sense(self): - self.assertIsNotNone(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้")) - self.assertIsNotNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้")) - self.assertIsNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คน")) + self.assertTrue(get_sense("เขากำลังอบขนมคุกกี้", "คุกกี้")) + self.assertTrue(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน", "คุกกี้")) + self.assertFalse(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน", "คน"))