diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb
index 2364a87c6..07ffbf589 100644
--- a/notebooks/test_wsd.ipynb
+++ b/notebooks/test_wsd.ipynb
@@ -30,7 +30,7 @@
}
],
"source": [
- "print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))"
+ "print(get_sense(\"เขากำลังอบขนมคุกกี้\", \"คุกกี้\"))"
]
},
{
@@ -50,7 +50,7 @@
}
],
"source": [
- "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))"
+ "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\", \"คุกกี้\"))"
]
},
{
@@ -68,7 +68,7 @@
}
],
"source": [
- "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
+ "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\", \"คน\"))"
]
},
{
@@ -92,7 +92,7 @@
},
"outputs": [],
"source": [
- "_w=thai_wsd_dict()"
+ "w = thai_wsd_dict()"
]
},
{
@@ -115,7 +115,7 @@
}
],
"source": [
- "_w.keys()"
+ "w.keys()"
]
},
{
@@ -138,7 +138,7 @@
}
],
"source": [
- "_w[\"word\"][0],_w[\"meaning\"][0]"
+ "w[\"word\"][0], w[\"meaning\"][0]"
]
},
{
diff --git a/pythainlp/ancient/aksonhan.py b/pythainlp/ancient/aksonhan.py
index 5d22fba3f..0ec28b621 100644
--- a/pythainlp/ancient/aksonhan.py
+++ b/pythainlp/ancient/aksonhan.py
@@ -2,22 +2,22 @@
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from pythainlp.util import Trie
-from pythainlp import thai_consonants,thai_tonemarks
+from pythainlp import thai_consonants, thai_tonemarks
from pythainlp.tokenize import Tokenizer
from pythainlp.corpus import thai_orst_words
_dict_aksonhan = {}
for i in list(thai_consonants):
- if i=="ร":
+ if i == "ร":
continue
for j in list(thai_tonemarks):
- _dict_aksonhan[i+j+i] = "ั"+j+i
- _dict_aksonhan[i+i+j+i] = i+"ั"+j+i
- _dict_aksonhan[i+i] = "ั"+i
+ _dict_aksonhan[i + j + i] = "ั" + j + i
+ _dict_aksonhan[i + i + j + i] = i + "ั" + j + i
+ _dict_aksonhan[i + i] = "ั" + i
_set_aksonhan = set(_dict_aksonhan.keys())
-_trie = Trie(list(_dict_aksonhan.keys())+list(thai_consonants))
-_tokenizer = Tokenizer(custom_dict=_trie,engine="mm")
+_trie = Trie(list(_dict_aksonhan.keys()) + list(thai_consonants))
+_tokenizer = Tokenizer(custom_dict=_trie, engine="mm")
_dict_thai = set(thai_orst_words()) # call Thai words
@@ -52,8 +52,9 @@ def aksonhan_to_current(word: str) -> str:
return word
elif word in _set_aksonhan:
return _dict_aksonhan[word]
- elif word in _dict_thai: # word in Thai words
+ elif word in _dict_thai: # word in Thai words
return word
+
_seg = _tokenizer.word_tokenize(word)
_w = []
for i in _seg:
@@ -61,4 +62,4 @@ def aksonhan_to_current(word: str) -> str:
_w.append(_dict_aksonhan[i])
else:
_w.append(i)
- return ''.join(_w)
+ return "".join(_w)
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index fce5b0032..a901102fc 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -10,12 +10,12 @@
"Tokenizer",
"Trie",
"clause_tokenize",
+ "paragraph_tokenize",
"sent_tokenize",
"subword_tokenize",
"syllable_tokenize",
- "word_tokenize",
"word_detokenize",
- "paragraph_tokenize",
+ "word_tokenize",
]
from pythainlp.corpus import thai_syllables, thai_words
@@ -33,12 +33,12 @@
from pythainlp.tokenize.core import (
Tokenizer,
clause_tokenize,
+ paragraph_tokenize,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
- word_tokenize,
word_detokenize,
- paragraph_tokenize,
+ word_tokenize,
)
from pythainlp.corpus import get_corpus as _get_corpus
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index e59f18a42..f1854fd67 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -48,7 +48,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
def word_detokenize(
segments: Union[List[List[str]], List[str]], output: str = "str"
-) -> Union[str, List[str]]:
+) -> Union[List[str], str]:
"""
Word detokenizer.
@@ -65,16 +65,18 @@ def word_detokenize(
print(word_detokenize(["เรา", "เล่น"]))
# output: เราเล่น
"""
- _list_all = []
+ list_all = []
+
if isinstance(segments[0], str):
segments = [segments]
+
from pythainlp import thai_characters
for i, s in enumerate(segments):
- _list_sents = []
- _add_index = []
- _space_index = []
- _mark_index = []
+ list_sents = []
+ add_index = []
+ space_index = []
+ mark_index = []
for j, w in enumerate(s):
if j > 0:
# previous word
@@ -85,35 +87,36 @@ def word_detokenize(
and not w.isspace()
and not p_w.isspace()
):
- _list_sents.append(" ")
- _add_index.append(j)
+ list_sents.append(" ")
+ add_index.append(j)
# if previous word is number or other language and is not space
elif p_w[0] not in thai_characters and not p_w.isspace():
- _list_sents.append(" ")
- _add_index.append(j)
+ list_sents.append(" ")
+ add_index.append(j)
# if word is Thai iteration mark
elif w == "ๆ":
if not p_w.isspace():
- _list_sents.append(" ")
- _mark_index.append(j)
- elif w.isspace() and j - 1 not in _space_index:
- _space_index.append(j)
- elif j - 1 in _mark_index:
- _list_sents.append(" ")
- _list_sents.append(w)
- _list_all.append(_list_sents)
+ list_sents.append(" ")
+ mark_index.append(j)
+ elif w.isspace() and j - 1 not in space_index:
+ space_index.append(j)
+ elif j - 1 in mark_index:
+ list_sents.append(" ")
+ list_sents.append(w)
+ list_all.append(list_sents)
+
if output == "list":
- return _list_all
- else:
- _text = []
- for i in _list_all:
- _text.append("".join(i))
- return " ".join(_text)
+ return list_all
+
+ text = []
+ for i in list_all:
+ text.append("".join(i))
+ return " ".join(text)
def word_tokenize(
text: str,
- custom_dict: Trie = None,
+ custom_dict: Trie = Trie([]),
engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
keep_whitespace: bool = True,
join_broken_num: bool = True,
@@ -290,7 +293,7 @@ def word_tokenize(
if isinstance(custom_dict, str):
segments = segment(text, custom_dict=custom_dict)
- elif not isinstance(custom_dict, str) and custom_dict is not None:
+    elif not isinstance(custom_dict, str) and custom_dict:
raise ValueError(
f"""Tokenizer \"{engine}\":
custom_dict must be a str.
@@ -415,11 +418,12 @@ def sent_tokenize(
segments = segment.split_into_sentences(text)
elif engine.startswith("wtp"):
if "-" not in engine:
- _size="mini"
+ _size = "mini"
else:
_size = engine.split("-")[-1]
from pythainlp.tokenize.wtsplit import tokenize as segment
- segments = segment(text,size=_size,tokenize="sentence")
+
+ segments = segment(text, size=_size, tokenize="sentence")
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
@@ -435,8 +439,8 @@ def sent_tokenize(
def paragraph_tokenize(
text: str,
engine: str = "wtp-mini",
- paragraph_threshold:float=0.5,
- style:str='newline',
+ paragraph_threshold: float = 0.5,
+ style: str = "newline",
) -> List[List[str]]:
"""
Paragraph tokenizer.
@@ -479,23 +483,25 @@ def paragraph_tokenize(
"""
if engine.startswith("wtp"):
if "-" not in engine:
- _size="mini"
+ size = "mini"
else:
- _size = engine.split("-")[-1]
+ size = engine.split("-")[-1]
+
from pythainlp.tokenize.wtsplit import tokenize as segment
- segments = segment(
- text,
- size=_size,
- tokenize="paragraph",
- paragraph_threshold=paragraph_threshold,
- style=style,
- )
+ segments = segment(
+ text,
+ size=size,
+ tokenize="paragraph",
+ paragraph_threshold=paragraph_threshold,
+ style=style,
+ )
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
It might be a typo; if not, please consult our document."""
)
+
return segments
@@ -622,7 +628,7 @@ def subword_tokenize(
def syllable_tokenize(
text: str,
- engine: str=DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+ engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
keep_whitespace: bool = True,
) -> List[str]:
"""
@@ -652,9 +658,7 @@ def syllable_tokenize(
It might be a typo; if not, please consult our document."""
)
return subword_tokenize(
- text=text,
- engine=engine,
- keep_whitespace=keep_whitespace
+ text=text, engine=engine, keep_whitespace=keep_whitespace
)
@@ -727,7 +731,7 @@ class Tokenizer:
def __init__(
self,
- custom_dict: Union[Trie, Iterable[str], str] = None,
+ custom_dict: Union[Trie, Iterable[str], str] = [],
engine: str = "newmm",
keep_whitespace: bool = True,
join_broken_num: bool = True,
@@ -743,7 +747,7 @@ def __init__(
:param bool keep_whitespace: True to keep whitespace, a common mark
for end of phrase in Thai
"""
- self.__trie_dict = None
+ self.__trie_dict = Trie([])
if custom_dict:
self.__trie_dict = dict_trie(custom_dict)
else:
diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py
index a8992c5c9..91adab9a0 100644
--- a/pythainlp/tokenize/deepcut.py
+++ b/pythainlp/tokenize/deepcut.py
@@ -21,7 +21,7 @@
def segment(
- text: str, custom_dict: Union[Trie, List[str], str] = None
+ text: str, custom_dict: Union[Trie, List[str], str] = []
) -> List[str]:
if not text or not isinstance(text, str):
return []
diff --git a/pythainlp/tokenize/han_solo.py b/pythainlp/tokenize/han_solo.py
index 6bef737fc..3547b9bf4 100644
--- a/pythainlp/tokenize/han_solo.py
+++ b/pythainlp/tokenize/han_solo.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-FileCopyrightText: Copyright 2019 Ponrawee Prasertsom
# SPDX-License-Identifier: Apache-2.0
"""
🪿 Han-solo: Thai syllable segmenter
@@ -8,54 +9,45 @@
"""
from typing import List
from pythainlp.corpus import path_pythainlp_corpus
+
try:
import pycrfsuite
except ImportError:
- raise ImportError("ImportError; Install pycrfsuite by pip install python-crfsuite")
+ raise ImportError(
+ "ImportError; Install pycrfsuite by pip install python-crfsuite"
+ )
tagger = pycrfsuite.Tagger()
-tagger.open(path_pythainlp_corpus('han_solo.crfsuite'))
+tagger.open(path_pythainlp_corpus("han_solo.crfsuite"))
class Featurizer:
-# This class from ssg at https://github.com/ponrawee/ssg.
-# Copyright 2019 Ponrawee Prasertsom
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# {
-# "0 (current anchor)|+1 (the character on the right from anchor)|A (character)" : 1
-# }
+ # This class from ssg at https://github.com/ponrawee/ssg.
def __init__(self, N=2, sequence_size=1, delimiter=None):
self.N = N
self.delimiter = delimiter
self.radius = N + sequence_size
- def pad(self, sentence, padder='#'):
+ def pad(self, sentence, padder="#"):
return padder * (self.radius) + sentence + padder * (self.radius)
- def featurize(self, sentence, padding=True, indiv_char=True, return_type='list'):
+ def featurize(
+ self, sentence, padding=True, indiv_char=True, return_type="list"
+ ):
if padding:
sentence = self.pad(sentence)
all_features = []
all_labels = []
skip_next = False
- for current_position in range(self.radius, len(sentence) - self.radius + 1):
+ for current_position in range(
+ self.radius, len(sentence) - self.radius + 1
+ ):
if skip_next:
skip_next = False
continue
features = {}
- if return_type == 'list':
+ if return_type == "list":
features = []
cut = 0
char = sentence[current_position]
@@ -63,13 +55,15 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list')
cut = 1
skip_next = True
counter = 0
- chars_left = ''
- chars_right = ''
- chars = ''
- abs_index_left = current_position # left start at -1
- abs_index_right = current_position - 1 # right start at 0
+ chars_left = ""
+ chars_right = ""
+ chars = ""
+ abs_index_left = current_position # left start at -1
+ abs_index_right = current_position - 1 # right start at 0
while counter < self.radius:
- abs_index_left -= 1 # สมมุติตำแหน่งที่ 0 จะได้ -1, -2, -3, -4, -5 (radius = 5)
+ abs_index_left -= (
+                1  # suppose position 0: gives -1, -2, -3, -4, -5 (radius = 5)
+ )
char_left = sentence[abs_index_left]
while char_left == self.delimiter:
abs_index_left -= 1
@@ -79,13 +73,15 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list')
chars_left = char_left + chars_left
# ใส่ลง feature
if indiv_char:
- left_key = '|'.join([str(relative_index_left), char_left])
- if return_type == 'dict':
+ left_key = "|".join([str(relative_index_left), char_left])
+ if return_type == "dict":
features[left_key] = 1
else:
features.append(left_key)
- abs_index_right += 1 # สมมุติคือตำแหน่งที่ 0 จะได้ 0, 1, 2, 3, 4 (radius = 5)
+ abs_index_right += (
+                1  # suppose position 0: gives 0, 1, 2, 3, 4 (radius = 5)
+ )
char_right = sentence[abs_index_right]
while char_right == self.delimiter:
abs_index_right += 1
@@ -93,8 +89,10 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list')
relative_index_right = counter
chars_right += char_right
if indiv_char:
- right_key = '|'.join([str(relative_index_right), char_right])
- if return_type == 'dict':
+ right_key = "|".join(
+ [str(relative_index_right), char_right]
+ )
+ if return_type == "dict":
features[right_key] = 1
else:
features.append(right_key)
@@ -103,31 +101,30 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list')
chars = chars_left + chars_right
for i in range(0, len(chars) - self.N + 1):
- ngram = chars[i:i + self.N]
- ngram_key = '|'.join([str(i - self.radius), ngram])
- if return_type == 'dict':
+ ngram = chars[i : i + self.N]
+ ngram_key = "|".join([str(i - self.radius), ngram])
+ if return_type == "dict":
features[ngram_key] = 1
else:
features.append(ngram_key)
all_features.append(features)
- if return_type == 'list':
+ if return_type == "list":
cut = str(cut)
all_labels.append(cut)
- return {
- 'X': all_features,
- 'Y': all_labels
- }
+ return {"X": all_features, "Y": all_labels}
+
+
_to_feature = Featurizer()
def segment(text: str) -> List[str]:
- x=_to_feature.featurize(text)["X"]
+ x = _to_feature.featurize(text)["X"]
y_pred = tagger.tag(x)
list_cut = []
- for j,k in zip(list(text),y_pred):
- if k=="1":
+ for j, k in zip(list(text), y_pred):
+ if k == "1":
list_cut.append(j)
else:
- list_cut[-1]+=j
+ list_cut[-1] += j
return list_cut
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index 722c47d99..4acecd61f 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
"""
Dictionary-based longest-matching Thai word segmentation. Implementation is based
on the codes from Patorn Utenpattanun.
@@ -40,7 +42,7 @@
_UNKNOWN = False
-class LongestMatchTokenizer():
+class LongestMatchTokenizer:
def __init__(self, trie: Trie):
self.__trie = trie
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 7b318575f..63f958db0 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -28,7 +28,7 @@
# `|` is used as like "early return",
# which divides "abc123" to "abc", "123" for example.
_PAT_NONTHAI = re.compile(
-r"""(?x)
+ r"""(?x)
[-a-zA-Z]+| # Latin characters
\d+([,\.]\d+)*| # numbers
[ \t]+| # spaces
diff --git a/pythainlp/tokenize/nlpo3.py b/pythainlp/tokenize/nlpo3.py
index 3eaf75684..c5fa52b5c 100644
--- a/pythainlp/tokenize/nlpo3.py
+++ b/pythainlp/tokenize/nlpo3.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-#SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from sys import stderr
from typing import List
diff --git a/pythainlp/tokenize/thaisumcut.py b/pythainlp/tokenize/thaisumcut.py
index eb12144e7..16d4da9e5 100644
--- a/pythainlp/tokenize/thaisumcut.py
+++ b/pythainlp/tokenize/thaisumcut.py
@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-FileCopyrightText: Copyright 2020 Nakhun Chumpolsathien
+# SPDX-License-Identifier: Apache-2.0
"""
The implementation of sentence segmentator from Nakhun Chumpolsathien, 2020
original codes are from: https://github.com/nakhunchumpolsathien/ThaiSum
@@ -10,22 +13,6 @@
author={Chumpolsathien, Nakhun},
year={2020},
school={Beijing Institute of Technology}
-
-**ThaiSum License**
-
- Copyright [2020 [Nakhun Chumpolsathien]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
"""
import re
diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py
index 111531d65..dcfaf61d0 100644
--- a/pythainlp/tokenize/wtsplit.py
+++ b/pythainlp/tokenize/wtsplit.py
@@ -14,28 +14,30 @@
def _tokenize(
- text:str,
- lang_code:str="th",
- model:str="wtp-bert-mini",
- tokenize:str="sentence",
- paragraph_threshold:float=0.5,
- style:str='newline',
- )-> List[str]:
- global _MODEL_NAME,_MODEL
+ text: str,
+ lang_code: str = "th",
+ model: str = "wtp-bert-mini",
+ tokenize: str = "sentence",
+ paragraph_threshold: float = 0.5,
+ style: str = "newline",
+) -> List[str]:
+ global _MODEL_NAME, _MODEL
+
if _MODEL_NAME != model:
_MODEL = WtP(model_name_or_model=model)
_MODEL_NAME = model
- if tokenize=="sentence":
- return _MODEL.split(text,lang_code=lang_code)
- else: # Paragraph
- if style=='newline':
+
+ if tokenize == "sentence":
+ return _MODEL.split(text, lang_code=lang_code)
+ else: # Paragraph
+ if style == "newline":
return _MODEL.split(
text,
lang_code=lang_code,
do_paragraph_segmentation=True,
- paragraph_threshold=paragraph_threshold
+ paragraph_threshold=paragraph_threshold,
)
- elif style=='opus100':
+ elif style == "opus100":
return _MODEL.split(
text,
lang_code=lang_code,
@@ -45,26 +47,28 @@ def _tokenize(
)
else:
raise ValueError(
- f"""Segmentation style \"{style}\" not found.
+ f"""Segmentation style \"{style}\" not found.
It might be a typo; if not, please consult our document."""
- )
+ )
+
def tokenize(
- text:str,
- size:str="mini",
- tokenize:str="sentence",
- paragraph_threshold:float=0.5,
- style:str='newline',
- )-> List[str]:
- _model_load=""
- if size=="tiny":
- _model_load="wtp-bert-tiny"
- elif size=="base":
- _model_load="wtp-canine-s-1l"
- elif size=="large":
- _model_load="wtp-canine-s-12l"
+ text: str,
+ size: str = "mini",
+ tokenize: str = "sentence",
+ paragraph_threshold: float = 0.5,
+ style: str = "newline",
+) -> List[str]:
+ _model_load = ""
+ if size == "tiny":
+ _model_load = "wtp-bert-tiny"
+ elif size == "base":
+ _model_load = "wtp-canine-s-1l"
+ elif size == "large":
+ _model_load = "wtp-canine-s-12l"
else: # mini
- _model_load="wtp-bert-mini"
+ _model_load = "wtp-bert-mini"
+
return _tokenize(
text,
model=_model_load,
diff --git a/pythainlp/util/phoneme.py b/pythainlp/util/phoneme.py
index 40aed32b1..d709cbb1c 100644
--- a/pythainlp/util/phoneme.py
+++ b/pythainlp/util/phoneme.py
@@ -9,72 +9,80 @@
from pythainlp.tokenize import Tokenizer
consonants_ipa_nectec = [
- ("k","k","k^"),
- ("kʰ","kh"),
- ("ŋ","ng","ng^"),
- ("tɕ","c"),
- ("tɕʰ","ch"),
- ("s","s"),
- ("j","j","j^"),
- ("d","d"),
- ("t","y","t^"),
- ("tʰ","th"),
- ("n","n","n^"),
- ("b","b"),
- ("p","p","p^"),
- ("pʰ","ph"),
- ("f","f"),
- ("m","m","m^"),
- ("r","r"),
- ("l","l"),
- ("w","w","w^"),
- ("h","h"),
- ("?","z","z^")
+ ("k", "k", "k^"),
+ ("kʰ", "kh"),
+ ("ŋ", "ng", "ng^"),
+ ("tɕ", "c"),
+ ("tɕʰ", "ch"),
+ ("s", "s"),
+ ("j", "j", "j^"),
+ ("d", "d"),
+ ("t", "y", "t^"),
+ ("tʰ", "th"),
+ ("n", "n", "n^"),
+ ("b", "b"),
+ ("p", "p", "p^"),
+ ("pʰ", "ph"),
+ ("f", "f"),
+ ("m", "m", "m^"),
+ ("r", "r"),
+ ("l", "l"),
+ ("w", "w", "w^"),
+ ("h", "h"),
+ ("?", "z", "z^"),
]
# ipa, initial, final
monophthong_ipa_nectec = [
- ("i","i"),
- ("e","e"),
- ("ɛ","x"),
- ("ɤ","q"),
- ("a","a"),
- ("am","am^"),
- ("aj","aj^"),
- ("aw","aw^"),
- ("u","u"),
- ("o","o"),
- ("ɔ","@"),
- ("ii","ii"),
- ("ee","ee"),
- ("ɛɛ","xx"),
- ("ɯɯ","vv"),
- ("ɤɤ","qq"),
- ("aa","aa"),
- ("uu","uu"),
- ("oo","oo"),
- ("","@@"), #-อ long
+ ("i", "i"),
+ ("e", "e"),
+ ("ɛ", "x"),
+ ("ɤ", "q"),
+ ("a", "a"),
+ ("am", "am^"),
+ ("aj", "aj^"),
+ ("aw", "aw^"),
+ ("u", "u"),
+ ("o", "o"),
+ ("ɔ", "@"),
+ ("ii", "ii"),
+ ("ee", "ee"),
+ ("ɛɛ", "xx"),
+ ("ɯɯ", "vv"),
+ ("ɤɤ", "qq"),
+ ("aa", "aa"),
+ ("uu", "uu"),
+ ("oo", "oo"),
+ ("", "@@"), # -อ long
]
diphthong_ipa_nectec = [
- ("ia","ia"),
- ("ɯa","va"),
- ("ua","ua"),
- ("iia","iia"),
- ("ɯɯa","vva"),
- ("uua","uua"),
+ ("ia", "ia"),
+ ("ɯa", "va"),
+ ("ua", "ua"),
+ ("iia", "iia"),
+ ("ɯɯa", "vva"),
+ ("uua", "uua"),
]
tones_ipa_nectec = [
- ("˧","0"),
- ("˨˩","1"),
- ("˥˩","2"),
- ("˦˥","3"),
- ("˩˩˦","4"),
+ ("˧", "0"),
+ ("˨˩", "1"),
+ ("˥˩", "2"),
+ ("˦˥", "3"),
+ ("˩˩˦", "4"),
]
-dict_nectec_to_ipa = {i[1]:i[0] for i in consonants_ipa_nectec+monophthong_ipa_nectec+diphthong_ipa_nectec+tones_ipa_nectec}
-dict_nectec_to_ipa.update({i[2]:i[0] for i in consonants_ipa_nectec if len(i)>2})
+dict_nectec_to_ipa = {
+ i[1]: i[0]
+ for i in consonants_ipa_nectec
+ + monophthong_ipa_nectec
+ + diphthong_ipa_nectec
+ + tones_ipa_nectec
+}
+dict_nectec_to_ipa.update(
+ {i[2]: i[0] for i in consonants_ipa_nectec if len(i) > 2}
+)
def nectec_to_ipa(pronunciation: str) -> str:
@@ -89,7 +97,7 @@ def nectec_to_ipa(pronunciation: str) -> str:
::
from pythainlp.util import nectec_to_ipa
-
+
print(nectec_to_ipa("kl-uua-j^-2"))
# output : 'kl uua j ˥˩'
@@ -97,23 +105,25 @@ def nectec_to_ipa(pronunciation: str) -> str:
References
----------
- Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech processing and Speech corpus. In: Handbook of Thai Electronic Corpus. 1st ed. p. 122–56.
+ Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech processing \
+ and Speech corpus. In: Handbook of Thai Electronic Corpus. \
+ 1st ed. p. 122–56.
"""
- pronunciation = pronunciation.split("-")
- _temp = []
- for i in pronunciation:
- if i in dict_nectec_to_ipa.keys():
- _temp.append(dict_nectec_to_ipa[i])
+ parts = pronunciation.split("-")
+ ipa = []
+ for part in parts:
+ if part in dict_nectec_to_ipa.keys():
+ ipa.append(dict_nectec_to_ipa[part])
else:
- _temp.append(i)
- return ' '.join(_temp)
+ ipa.append(part)
+ return " ".join(ipa)
dict_ipa_rtgs = {
- "b":"b",
- "d":"d",
- "f":"f",
- "h":"h",
+ "b": "b",
+ "d": "d",
+ "f": "f",
+ "h": "h",
# The conversion of j depends on its position in the syllable.
# But, unfortunately, the current implementation cannot handle both cases.
# To remove confusions without changing the behavior and breaking existing codes,
@@ -121,68 +131,66 @@ def nectec_to_ipa(pronunciation: str) -> str:
# as it would be overridden by the second one and thus never take effect from the beginning.
# See #846 for a more detailed discussion: https://github.com/PyThaiNLP/pythainlp/issues/846
# "j":"y",
- "k":"k",
- "kʰ":"kh",
- "l":"l",
- "m":"m",
- "n":"n",
- "ŋ":"ng",
- "p":"p",
- "pʰ":"ph",
- "r":"r",
- "s":"s",
- "t":"t",
- "tʰ":"th",
- "tɕ":"ch",
- "tɕʰ":"ch",
- "w":"w",
- "ʔ":"",
- "j":"i",
- "a":"a",
- "e":"e",
- "ɛ":"ae",
- "i":"i",
- "o":"o",
- "ɔ":"o",
- "u":"u",
- "ɯ":"ue",
- "ɤ":"oe",
- "aː":"a",
- "eː":"e",
- "ɛː":"ae",
- "iː":"i",
- "oː":"o",
- "ɔː":"o",
- "uː":"u",
- "ɯː":"ue",
- "ɤː":"oe",
- "ia":"ia",
- "ua":"ua",
- "ɯa":"uea",
- "aj":"ai",
- "aw":"ao",
- "ew":"eo",
- "ɛw":"aeo",
- "iw":"io",
- "ɔj":"io",
- "uj":"ui",
- "aːj":"ai",
- "aːw":"ao",
- "eːw":"eo",
- "ɛːw":"aeo",
- "oːj":"oi",
- "ɔːj":"oi",
- "ɤːj":"oei",
- "iaw":"iao",
- "uaj":"uai",
- "ɯaj":"ueai",
- ".":".",
+ "k": "k",
+ "kʰ": "kh",
+ "l": "l",
+ "m": "m",
+ "n": "n",
+ "ŋ": "ng",
+ "p": "p",
+ "pʰ": "ph",
+ "r": "r",
+ "s": "s",
+ "t": "t",
+ "tʰ": "th",
+ "tɕ": "ch",
+ "tɕʰ": "ch",
+ "w": "w",
+ "ʔ": "",
+ "j": "i",
+ "a": "a",
+ "e": "e",
+ "ɛ": "ae",
+ "i": "i",
+ "o": "o",
+ "ɔ": "o",
+ "u": "u",
+ "ɯ": "ue",
+ "ɤ": "oe",
+ "aː": "a",
+ "eː": "e",
+ "ɛː": "ae",
+ "iː": "i",
+ "oː": "o",
+ "ɔː": "o",
+ "uː": "u",
+ "ɯː": "ue",
+ "ɤː": "oe",
+ "ia": "ia",
+ "ua": "ua",
+ "ɯa": "uea",
+ "aj": "ai",
+ "aw": "ao",
+ "ew": "eo",
+ "ɛw": "aeo",
+ "iw": "io",
+ "ɔj": "io",
+ "uj": "ui",
+ "aːj": "ai",
+ "aːw": "ao",
+ "eːw": "eo",
+ "ɛːw": "aeo",
+ "oːj": "oi",
+ "ɔːj": "oi",
+ "ɤːj": "oei",
+ "iaw": "iao",
+ "uaj": "uai",
+ "ɯaj": "ueai",
+ ".": ".",
}
-dict_ipa_rtgs_final = {
- "w":"o"
-}
-trie = Trie(list(dict_ipa_rtgs.keys())+list(dict_ipa_rtgs_final.keys()))
+dict_ipa_rtgs_final = {"w": "o"}
+trie = Trie(list(dict_ipa_rtgs.keys()) + list(dict_ipa_rtgs_final.keys()))
ipa_cut = Tokenizer(custom_dict=trie, engine="newmm")
@@ -200,23 +208,30 @@ def ipa_to_rtgs(ipa: str) -> str:
::
from pythainlp.util import ipa_to_rtgs
-
+
print(ipa_to_rtgs("kluaj"))
# output : 'kluai'
"""
- _temp = []
- _list_ipa = ipa_cut.word_tokenize(ipa)
- for i,p in enumerate(_list_ipa):
- if i == len(_list_ipa) -1 and p in list(dict_ipa_rtgs_final):
- _temp.append(dict_ipa_rtgs_final[p])
- elif p in list(dict_ipa_rtgs):
- _temp.append(dict_ipa_rtgs[p])
+ rtgs_parts = []
+
+ ipa_parts = ipa_cut.word_tokenize(ipa)
+ for i, ipa_part in enumerate(ipa_parts):
+ if i == len(ipa_parts) - 1 and ipa_part in list(dict_ipa_rtgs_final):
+ rtgs_parts.append(dict_ipa_rtgs_final[ipa_part])
+ elif ipa_part in list(dict_ipa_rtgs):
+ rtgs_parts.append(dict_ipa_rtgs[ipa_part])
else:
- _temp.append(p)
- _text = ''.join(_temp)
- _text = unicodedata.normalize('NFKD', _text).encode('ascii', 'ignore')
- return _text.decode("utf-8")
+ rtgs_parts.append(ipa_part)
+
+ rtgs = "".join(rtgs_parts)
+ rtgs = (
+ unicodedata.normalize("NFKD", rtgs)
+ .encode("ascii", "ignore")
+ .decode("utf-8")
+ )
+
+ return rtgs
def remove_tone_ipa(ipa: str) -> str:
@@ -231,7 +246,7 @@ def remove_tone_ipa(ipa: str) -> str:
::
from pythainlp.util import remove_tone_ipa
-
+
print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"))
# output : laː.sa.maj
diff --git a/pythainlp/util/remove_trailing_repeat_consonants.py b/pythainlp/util/remove_trailing_repeat_consonants.py
index 0dc29d77a..11a826c00 100644
--- a/pythainlp/util/remove_trailing_repeat_consonants.py
+++ b/pythainlp/util/remove_trailing_repeat_consonants.py
@@ -7,7 +7,7 @@
from pythainlp.corpus import thai_words
from pythainlp.util.trie import Trie
from pythainlp import thai_consonants as consonants
-from typing import Tuple, List
+from typing import Iterable, List, Tuple
# used by remove_trailing_repeat_consonants()
# contains all words that has repeating consonants at the end
@@ -19,7 +19,9 @@
def remove_trailing_repeat_consonants(
- text: str, dictionary: Trie = None, has_dictionary_updated: bool = True
+ text: str,
+ custom_dict: Iterable[str] = [],
+ has_dictionary_updated: bool = True,
) -> str:
"""
Remove repeating consonants at the last of the sentence.
@@ -58,8 +60,8 @@ def remove_trailing_repeat_consonants(
# "อืมมม" is in the default dictionary
# use custom dictionary
- custom_dictionary = dict_trie(["อืมมมมม"])
- remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary)
+ custom_dict = dict_trie(["อืมมมมม"])
+ remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dict)
# output: อืมมมมม
# long text
@@ -69,12 +71,12 @@ def remove_trailing_repeat_consonants(
# นี่เป็นความลับ
"""
# use default dictionary if not given
- if dictionary is None:
- dictionary = thai_words()
+ if not custom_dict:
+ custom_dict = thai_words()
# update repeaters dictionary if not updated
if has_dictionary_updated:
- _update_consonant_repeaters(dictionary)
+ _update_consonant_repeaters(custom_dict)
# seperate by newline
modified_lines = []
@@ -167,7 +169,7 @@ def _remove_all_last_consonants(text: str, dup: str) -> str:
return removed
-def _update_consonant_repeaters(dictionary: Trie) -> None:
+def _update_consonant_repeaters(custom_dict: Iterable[str]) -> None:
"""
Update dictionary of all words that has
repeating consonants at the end from the dictionary.
@@ -184,7 +186,7 @@ def _update_consonant_repeaters(dictionary: Trie) -> None:
last_consonants_repeaters[consonant] = []
# register
- for word in dictionary:
+ for word in custom_dict:
if _is_last_consonant_repeater(word):
last_consonants_repeaters[word[-1]].append(word)
diff --git a/pythainlp/util/spell_words.py b/pythainlp/util/spell_words.py
index 6305cd025..c2b83bbda 100644
--- a/pythainlp/util/spell_words.py
+++ b/pythainlp/util/spell_words.py
@@ -4,62 +4,79 @@
import re
from typing import List
from pythainlp import (
- thai_letters,
- thai_consonants,
- thai_lead_vowels,
- thai_follow_vowels,
thai_above_vowels,
thai_below_vowels,
- thai_tonemarks
+ thai_consonants,
+ thai_follow_vowels,
+ thai_lead_vowels,
+ thai_letters,
+ thai_tonemarks,
)
-from pythainlp.tokenize import Tokenizer
-from pythainlp.tokenize import subword_tokenize
-
-
-_r1=["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"]
-_r2=["–ั:วะ", "เ–ี:ยะ", "เ–ือะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"]
-tonemarks={i: "ไม้"+j for i, j in zip(list(thai_tonemarks), ["เอก", "โท", "ตรี", "จัตวา"])}
-
-rule1=[i.replace("-", f"([{thai_letters}](thai_tonemarks)?)") for i in _r1]
-rule2=[i.replace("–", f"([{thai_letters}])").replace(":", "") for i in _r2]
-rule3=[i.replace("–", f"([{thai_letters}])").replace(":", f"([{thai_tonemarks}])") for i in _r2]
-dict_vowel_ex={}
-for i in _r1+_r2:
- dict_vowel_ex[i.replace("-", "อ").replace("–", "อ").replace(":", "")]=i.replace("-", "อ").replace(":", "").replace("–", "อ")
-dict_vowel={}
-for i in _r1+_r2:
- dict_vowel[i.replace("-", "อ").replace("–", "อ").replace(":", "")]=i.replace("-", "อ").replace(":", "").replace("–", "อ")
+from pythainlp.tokenize import subword_tokenize, Tokenizer
+
+
+_r1 = ["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"]
+_r2 = ["–ั:วะ", "เ–ี:ยะ", "เ–ือะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"]
+tonemarks = {
+ i: "ไม้" + j
+ for i, j in zip(list(thai_tonemarks), ["เอก", "โท", "ตรี", "จัตวา"])
+}
+
+rule1 = [i.replace("-", f"([{thai_letters}](thai_tonemarks)?)") for i in _r1]
+rule2 = [i.replace("–", f"([{thai_letters}])").replace(":", "") for i in _r2]
+rule3 = [
+ i.replace("–", f"([{thai_letters}])").replace(":", f"([{thai_tonemarks}])")
+ for i in _r2
+]
+dict_vowel_ex = {}
+for i in _r1 + _r2:
+ dict_vowel_ex[i.replace("-", "อ").replace("–", "อ").replace(":", "")] = (
+ i.replace("-", "อ").replace(":", "").replace("–", "อ")
+ )
+dict_vowel = {}
+for i in _r1 + _r2:
+ dict_vowel[i.replace("-", "อ").replace("–", "อ").replace(":", "")] = (
+ i.replace("-", "อ").replace(":", "").replace("–", "อ")
+ )
for i in thai_lead_vowels:
- dict_vowel[i]=i+"อ"
+ dict_vowel[i] = i + "อ"
for i in thai_follow_vowels:
- dict_vowel[i]="อ"+i
+ dict_vowel[i] = "อ" + i
for i in thai_above_vowels:
- dict_vowel[i]="อ"+i
+ dict_vowel[i] = "อ" + i
for i in thai_below_vowels:
- dict_vowel[i]="อ"+i
+ dict_vowel[i] = "อ" + i
-_cut=Tokenizer(list(dict_vowel.keys())+list(thai_consonants), engine="mm")
+_cut = Tokenizer(list(dict_vowel.keys()) + list(thai_consonants), engine="mm")
def _clean(w):
- if bool(re.match('|'.join(rule3), w)):
+ if bool(re.match("|".join(rule3), w)):
for r in rule3:
if bool(re.match(r, w)):
- _w=re.sub(r, "\\1==\\2==", w)
- _temp=_w.split("==")
- w=_temp[0]+r.replace(f"([{thai_letters}])", "อ").replace(f"([{thai_tonemarks}])", "")+_temp[1]
- elif bool(re.match('|'.join(rule2), w)):
+ w = re.sub(r, "\\1==\\2==", w)
+ temp = w.split("==")
+ w = (
+ temp[0]
+ + r.replace(f"([{thai_letters}])", "อ").replace(
+ f"([{thai_tonemarks}])", ""
+ )
+ + temp[1]
+ )
+ elif bool(re.match("|".join(rule2), w)):
for r in rule2:
if bool(re.match(r, w)):
- w=re.sub(r, "\\1", w)+r.replace(f"([{thai_letters}])", "อ")
- elif bool(re.match('|'.join(rule1), w)):
+ w = re.sub(r, "\\1", w) + r.replace(f"([{thai_letters}])", "อ")
+ elif bool(re.match("|".join(rule1), w)):
for r in rule1:
if bool(re.match(r, w)):
- w=re.sub(r, "\\1", w)+r.replace(f"([{thai_letters}](thai_tonemarks)?)", "อ")
+ w = re.sub(r, "\\1", w) + r.replace(
+ f"([{thai_letters}](thai_tonemarks)?)", "อ"
+ )
return w
-def spell_syllable(s: str)-> List[str]:
+def spell_syllable(text: str) -> List[str]:
"""
Spell out syllables in Thai word distribution form.
@@ -75,17 +92,16 @@ def spell_syllable(s: str)-> List[str]:
print(spell_syllable("แมว"))
# output: ['มอ', 'วอ', 'แอ', 'แมว']
"""
- _t=s
- s=_cut.word_tokenize(_clean(s))
- _c_only = [i+"อ" for i in s if i in set(thai_consonants)]
- _v_only = [dict_vowel[i] for i in s if i in set(dict_vowel)]
- _t_only = [tonemarks[i] for i in s if i in set(tonemarks.keys())]
- _out=_c_only+_v_only+_t_only
- _out.append(_t)
- return _out
+ tokens = _cut.word_tokenize(_clean(text))
+
+ c_only = [tok + "อ" for tok in tokens if tok in set(thai_consonants)]
+ v_only = [dict_vowel[tok] for tok in tokens if tok in set(dict_vowel)]
+ t_only = [tonemarks[tok] for tok in tokens if tok in set(tonemarks.keys())]
+
+ return c_only + v_only + t_only + [text]
-def spell_word(w: str)-> List[str]:
+def spell_word(text: str) -> List[str]:
"""
Spell out words in Thai word distribution form.
@@ -101,10 +117,13 @@ def spell_word(w: str)-> List[str]:
print(spell_word("คนดี"))
# output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']
"""
- _r=[]
- _temp=subword_tokenize(w, engine="ssg")
- for i in _temp:
- _r.extend(spell_syllable(i))
- if len(_temp)>1:
- _r.append(w)
- return _r
+ spellouts = []
+ tokens = subword_tokenize(text, engine="ssg")
+
+ for tok in tokens:
+ spellouts.extend(spell_syllable(tok))
+
+ if len(tokens) > 1:
+ spellouts.append(text)
+
+ return spellouts
diff --git a/pythainlp/util/time.py b/pythainlp/util/time.py
index be89b9702..27eb03a10 100644
--- a/pythainlp/util/time.py
+++ b/pythainlp/util/time.py
@@ -128,11 +128,7 @@ def _format(
raise NotImplementedError(f"Time format not supported: {fmt}")
if precision in ("m", "s"):
- if (
- m == 30
- and (s == 0 or precision == "m")
- and (fmt in ("6h", "m6h"))
- ):
+ if m == 30 and (s == 0 or precision == "m") and (fmt in ("6h", "m6h")):
text += "ครึ่ง"
else:
text += num_to_thaiword(m) + "นาที"
diff --git a/pythainlp/util/trie.py b/pythainlp/util/trie.py
index 0d0cda7f2..2b24ab79a 100644
--- a/pythainlp/util/trie.py
+++ b/pythainlp/util/trie.py
@@ -6,11 +6,11 @@
Designed to be used for tokenizer's dictionary, but can be for other purposes.
"""
-from typing import Iterable, List, Union
+from typing import Iterable, Iterator, List, Union
-class Trie:
- class Node():
+class Trie(Iterable[str]):
+ class Node:
__slots__ = "end", "children"
def __init__(self):
@@ -90,7 +90,7 @@ def prefixes(self, text: str) -> List[str]:
def __contains__(self, key: str) -> bool:
return key in self.words
- def __iter__(self) -> Iterable[str]:
+ def __iter__(self) -> Iterator[str]:
yield from self.words
def __len__(self) -> int:
@@ -106,7 +106,7 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
:return: a trie object
:rtype: pythainlp.util.Trie
"""
- trie = None
+ trie = Trie([])
if isinstance(dict_source, str) and len(dict_source) > 0:
# dict_source is a path to dictionary text file
diff --git a/pythainlp/wangchanberta/__init__.py b/pythainlp/wangchanberta/__init__.py
index e3ae86b37..e68c3d3b0 100644
--- a/pythainlp/wangchanberta/__init__.py
+++ b/pythainlp/wangchanberta/__init__.py
@@ -2,9 +2,13 @@
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
__all__ = [
+ "NamedEntityRecognition",
"ThaiNameTagger",
"segment",
- "NamedEntityRecognition",
]
-from pythainlp.wangchanberta.core import ThaiNameTagger, segment, NamedEntityRecognition
+from pythainlp.wangchanberta.core import (
+ NamedEntityRecognition,
+ ThaiNameTagger,
+ segment,
+)
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index c38b97ff3..93755bc99 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -52,7 +52,7 @@ def _clear_tag(self, tag):
return tag.replace("B-", "").replace("I-", "")
def get_ner(
- self, text: str, pos: bool= False,tag: bool = False
+ self, text: str, pos: bool = False, tag: bool = False
) -> Union[List[Tuple[str, str]], str]:
"""
This function tags named entities in text in IOB format.
@@ -61,15 +61,17 @@ def get_ner(
:param str text: text in Thai to be tagged
:param bool tag: output HTML-like tags.
- :return: a list of tuples associated with tokenized word groups, NER tags, \
- and output HTML-like tags (if the parameter `tag` is \
- specified as `True`). \
- Otherwise, return a list of tuples associated with tokenized \
- words and NER tags
+ :return: a list of tuples associated with tokenized word groups,\
+ NER tags, and output HTML-like tags (if the parameter `tag` is \
+ specified as `True`). \
+ Otherwise, return a list of tuples associated with tokenized \
+ words and NER tags
:rtype: Union[list[tuple[str, str]]], str
"""
if pos:
- warnings.warn("This model doesn't support output of POS tags and it doesn't output the POS tags.")
+ warnings.warn(
+                "This model does not support POS tags and will not output them."
+ )
text = re.sub(" ", "<_>", text)
self.json_ner = self.classify_tokens(text)
self.output = ""
@@ -128,7 +130,9 @@ def get_ner(
class NamedEntityRecognition:
- def __init__(self, model: str ="pythainlp/thainer-corpus-v2-base-model") -> None:
+ def __init__(
+ self, model: str = "pythainlp/thainer-corpus-v2-base-model"
+ ) -> None:
"""
This function tags named entities in text in IOB format.
@@ -138,24 +142,27 @@ def __init__(self, model: str ="pythainlp/thainer-corpus-v2-base-model") -> None
"""
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
+
self.tokenizer = AutoTokenizer.from_pretrained(model)
self.model = AutoModelForTokenClassification.from_pretrained(model)
+
def _fix_span_error(self, words, ner):
_ner = []
- _ner=ner
- _new_tag=[]
- for i,j in zip(words,_ner):
- i=self.tokenizer.decode(i)
+ _ner = ner
+ _new_tag = []
+ for i, j in zip(words, _ner):
+ i = self.tokenizer.decode(i)
if i.isspace() and j.startswith("B-"):
- j="O"
+ j = "O"
if i in ("", "<s>", "</s>"):
continue
- if i=="<_>":
- i=" "
- _new_tag.append((i,j))
+ if i == "<_>":
+ i = " "
+ _new_tag.append((i, j))
return _new_tag
+
def get_ner(
- self, text: str, pos: bool= False,tag: bool = False
+ self, text: str, pos: bool = False, tag: bool = False
) -> Union[List[Tuple[str, str]], str]:
"""
This function tags named entities in text in IOB format.
@@ -172,18 +179,27 @@ def get_ner(
:rtype: Union[list[tuple[str, str]]], str
"""
import torch
+
if pos:
- warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
+ warnings.warn(
+                "This model does not support POS tags and will not output them."
+ )
words_token = word_tokenize(text.replace(" ", "<_>"))
- inputs=self.tokenizer(words_token,is_split_into_words=True,return_tensors="pt")
+ inputs = self.tokenizer(
+ words_token, is_split_into_words=True, return_tensors="pt"
+ )
ids = inputs["input_ids"]
mask = inputs["attention_mask"]
# forward pass
outputs = self.model(ids, attention_mask=mask)
logits = outputs[0]
predictions = torch.argmax(logits, dim=2)
- predicted_token_class = [self.model.config.id2label[t.item()] for t in predictions[0]]
- ner_tag=self._fix_span_error(inputs['input_ids'][0],predicted_token_class)
+ predicted_token_class = [
+ self.model.config.id2label[t.item()] for t in predictions[0]
+ ]
+ ner_tag = self._fix_span_error(
+ inputs["input_ids"][0], predicted_token_class
+ )
if tag:
temp = ""
sent = ""
diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py
index 7e97aae8a..62b26f6ae 100644
--- a/pythainlp/wsd/core.py
+++ b/pythainlp/wsd/core.py
@@ -9,39 +9,50 @@
_wsd_dict = thai_wsd_dict()
_mean_all = {}
-for i,j in zip(_wsd_dict["word"], _wsd_dict["meaning"]):
- _mean_all[i]=j
+
+for i, j in zip(_wsd_dict["word"], _wsd_dict["meaning"]):
+ _mean_all[i] = j
+
_all_word = set(list(_mean_all.keys()))
_TRIE = Trie(list(_all_word))
_word_cut = Tokenizer(custom_dict=_TRIE)
+_MODEL = None
+
class _SentenceTransformersModel:
- def __init__(self, model:str="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", device:str="cpu"):
+ def __init__(
+ self,
+ model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+ device: str = "cpu",
+ ):
from sentence_transformers import SentenceTransformer
+
self.device = device
self.model_name = model
self.model = SentenceTransformer(self.model_name, device=self.device)
+
def change_device(self, device: str):
from sentence_transformers import SentenceTransformer
+
self.device = device
self.model = SentenceTransformer(self.model_name, device=self.device)
- def get_score(self, sentences1: str,sentences2: str)->float:
+
+ def get_score(self, sentences1: str, sentences2: str) -> float:
from sentence_transformers import util
- embedding_1= self.model.encode(sentences1, convert_to_tensor=True)
- embedding_2 = self.model.encode(sentences2, convert_to_tensor=True)
- return 1-util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item()
-_MODEL = None
+ embedding_1 = self.model.encode(sentences1, convert_to_tensor=True)
+ embedding_2 = self.model.encode(sentences2, convert_to_tensor=True)
+ return 1 - util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item()
def get_sense(
sentence: str,
word: str,
- device: str="cpu",
- custom_dict: Union[dict,None]=None,
- custom_tokenizer: Tokenizer=_word_cut,
-) -> Union[List[Tuple[str, float]], None]:
+ device: str = "cpu",
+ custom_dict: dict = dict(),
+ custom_tokenizer: Tokenizer = _word_cut,
+) -> List[Tuple[str, float]]:
"""
Get word sense from the sentence.
This function will get definition and distance from context in sentence.
@@ -50,19 +61,23 @@ def get_sense(
:param str word: Thai word
:param str device: device for running model on.
:param dict custom_dict: Thai dictionary {"word":["definition",..]}
- :param Tokenizer custom_tokenizer: Tokenizer used to tokenize words in sentence.
- :return: list of definitions and distances (1 - cos_sim) or None (If word is not in the dictionary)
- :rtype: Union[List[Tuple[str, float]], None]
+ :param Tokenizer custom_tokenizer: Tokenizer used to tokenize words in \
+ sentence.
+ :return: a list of definitions and distances (1 - cos_sim) or \
+ an empty list (if word is not in the dictionary)
+ :rtype: List[Tuple[str, float]]
- We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised \
- Word Sense Disambiguation `_ to build get_sense function.
+ We get the ideas from `Context-Aware Semantic Similarity Measurement for \
+ Unsupervised Word Sense Disambiguation \
+    `_ to build the get_sense function.
- For Thai dictionary, we use Thai dictionary from wiktionary.
- See more `thai_dict `_.
+ Use Thai dictionary from wiktionary.
+ See `thai_dict `_.
- For the model, we use sentence transformers model from \
- `sentence-transformers/paraphrase-multilingual-mpnet-base-v2 `_ for \
- unsupervised word sense disambiguation.
+ Use sentence transformers model from \
+ `sentence-transformers/paraphrase-multilingual-mpnet-base-v2 \
+    <https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2>`_ \
+ for unsupervised word sense disambiguation.
:Example:
::
@@ -83,22 +98,31 @@ def get_sense(
# 0.12473666667938232)]
"""
global _MODEL
- if custom_dict is None:
+ if not custom_dict:
custom_dict = _mean_all
- _w = custom_tokenizer.word_tokenize(sentence)
+
+ w = custom_tokenizer.word_tokenize(sentence)
if word not in set(custom_dict.keys()) or word not in sentence:
- return None
- if _MODEL is None:
+ return []
+
+ if not _MODEL:
_MODEL = _SentenceTransformersModel(device=device)
- if _MODEL.device!=device:
+ if _MODEL.device != device:
_MODEL.change_device(device=device)
- _temp_mean = custom_dict[word]
- _temp =[]
- for i in _temp_mean:
+
+ temp_mean = custom_dict[word]
+ temp = []
+ for i in temp_mean:
_temp_2 = []
- for j in _w:
+ for j in w:
if j == word:
- j = word+f" ({word} ความหมาย '"+i.replace('(',"").replace(')',"")+"') "
+ j = (
+ word
+ + f" ({word} ความหมาย '"
+ + i.replace("(", "").replace(")", "")
+ + "') "
+ )
_temp_2.append(j)
- _temp.append((i,_MODEL.get_score(sentence,''.join(_temp_2))))
- return _temp
+ temp.append((i, _MODEL.get_score(sentence, "".join(_temp_2))))
+
+ return temp
diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py
index 0abd403e9..c504731da 100644
--- a/tests/test_ulmfit.py
+++ b/tests/test_ulmfit.py
@@ -5,6 +5,7 @@
import pandas as pd
import torch
+
# fastai
import fastai
from fastai.text import *
@@ -213,15 +214,15 @@ def test_process_thai_dense(self):
def test_document_vector(self):
imdb = untar_data(URLs.IMDB_SAMPLE)
- dummy_df = pd.read_csv(imdb/'texts.csv')
+ dummy_df = pd.read_csv(imdb / "texts.csv")
thwiki = THWIKI_LSTM
- thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb'))
+ thwiki_itos = pickle.load(open(thwiki["itos_fname"], "rb"))
thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
tt = Tokenizer(
tok_func=ThaiTokenizer,
- lang='th',
+ lang="th",
pre_rules=pre_rules_th,
- post_rules=post_rules_th
+ post_rules=post_rules_th,
)
processor = [
TokenizeProcessor(
@@ -229,14 +230,11 @@ def test_document_vector(self):
),
NumericalizeProcessor(
vocab=thwiki_vocab, max_vocab=60000, min_freq=3
- )
+ ),
]
data_lm = (
TextList.from_df(
- dummy_df,
- imdb,
- cols=['text'],
- processor=processor
+ dummy_df, imdb, cols=["text"], processor=processor
)
.split_by_rand_pct(0.2)
.label_for_lm()
@@ -255,28 +253,22 @@ def test_document_vector(self):
"hidden_p": 0.1,
"input_p": 0.2,
"embed_p": 0.02,
- "weight_p": 0.15
+ "weight_p": 0.15,
}
trn_args = {"drop_mult": 0.9, "clip": 0.12, "alpha": 2, "beta": 1}
learn = language_model_learner(
- data_lm,
- AWD_LSTM,
- config=config,
- pretrained=False,
- **trn_args
+ data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args
)
learn.load_pretrained(**thwiki)
+ self.assertIsNotNone(document_vector("วันนี้วันดีปีใหม่", learn, data_lm))
self.assertIsNotNone(
- document_vector('วันนี้วันดีปีใหม่', learn, data_lm)
- )
- self.assertIsNotNone(
- document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg="sum")
+ document_vector("วันนี้วันดีปีใหม่", learn, data_lm, agg="sum")
)
with self.assertRaises(ValueError):
- document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg='abc')
+ document_vector("วันนี้วันดีปีใหม่", learn, data_lm, agg="abc")
def test_merge_wgts(self):
- wgts = {'0.encoder.weight': torch.randn(5,3)}
+ wgts = {"0.encoder.weight": torch.randn(5, 3)}
itos_pre = ["แมว", "คน", "หนู"]
itos_new = ["ปลา", "เต่า", "นก"]
em_sz = 3
diff --git a/tests/test_util.py b/tests/test_util.py
index ee02a278c..85a03ddc3 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -61,13 +61,12 @@
ipa_to_rtgs,
remove_tone_ipa,
tis620_to_utf8,
- remove_trailing_repeat_consonants
+ remove_trailing_repeat_consonants,
)
from pythainlp.util.spell_words import spell_word
class TestUtilPackage(unittest.TestCase):
-
# ### pythainlp.util.collate
def test_collate(self):
@@ -102,9 +101,7 @@ def test_number(self):
)
self.assertEqual(thaiword_to_num("สองล้านสามแสนหกร้อยสิบสอง"), 2300612)
self.assertEqual(thaiword_to_num("หนึ่งร้อยสิบล้าน"), 110000000)
- self.assertEqual(
- thaiword_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072
- )
+ self.assertEqual(thaiword_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072)
self.assertEqual(thaiword_to_num("หนึ่งล้านล้าน"), 1000000000000)
self.assertEqual(
thaiword_to_num("สองแสนสี่หมื่นสามสิบล้านสี่พันล้าน"),
@@ -137,9 +134,7 @@ def test_number(self):
)
self.assertEqual(words_to_num("สองล้านสามแสนหกร้อยสิบสอง"), 2300612)
self.assertEqual(words_to_num("หนึ่งร้อยสิบล้าน"), 110000000)
- self.assertEqual(
- words_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072
- )
+ self.assertEqual(words_to_num("สิบห้าล้านล้านเจ็ดสิบสอง"), 15000000000072)
self.assertEqual(words_to_num("หนึ่งล้านล้าน"), 1000000000000)
self.assertEqual(
words_to_num("สองแสนสี่หมื่นสามสิบล้านสี่พันล้าน"),
@@ -149,15 +144,9 @@ def test_number(self):
self.assertEqual(words_to_num("ลบหนึ่ง"), -1)
text = "ลบหนึ่งร้อยล้านสี่แสนห้าพันยี่สิบเอ็ด"
self.assertEqual(num_to_thaiword(words_to_num(text)), text)
- self.assertIsNotNone(
- text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด")
- )
- self.assertIsNotNone(
- text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท")
- )
- self.assertIsNotNone(
- text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้า")
- )
+ self.assertIsNotNone(text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด"))
+ self.assertIsNotNone(text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท"))
+ self.assertIsNotNone(text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้า"))
self.assertEqual(
arabic_digit_to_thai_digit("ไทยแลนด์ 4.0"), "ไทยแลนด์ ๔.๐"
@@ -293,16 +282,10 @@ def test_thai_strftime(self):
def test_time_to_thaiword(self):
self.assertEqual(time_to_thaiword("8:17"), time_to_thaiword("08:17"))
self.assertEqual(time_to_thaiword("8:17"), "แปดนาฬิกาสิบเจ็ดนาที")
- self.assertEqual(
- time_to_thaiword("8:17", "6h"), "สองโมงเช้าสิบเจ็ดนาที"
- )
+ self.assertEqual(time_to_thaiword("8:17", "6h"), "สองโมงเช้าสิบเจ็ดนาที")
self.assertEqual(time_to_thaiword("8:17", "m6h"), "แปดโมงสิบเจ็ดนาที")
- self.assertEqual(
- time_to_thaiword("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง"
- )
- self.assertEqual(
- time_to_thaiword(time(12, 3, 0)), "สิบสองนาฬิกาสามนาที"
- )
+ self.assertEqual(time_to_thaiword("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง")
+ self.assertEqual(time_to_thaiword(time(12, 3, 0)), "สิบสองนาฬิกาสามนาที")
self.assertEqual(
time_to_thaiword(time(12, 3, 1)),
"สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
@@ -320,9 +303,7 @@ def test_time_to_thaiword(self):
"เที่ยงครึ่ง",
)
self.assertEqual(time_to_thaiword("18:30"), "สิบแปดนาฬิกาสามสิบนาที")
- self.assertEqual(
- time_to_thaiword("18:30:00"), "สิบแปดนาฬิกาสามสิบนาที"
- )
+ self.assertEqual(time_to_thaiword("18:30:00"), "สิบแปดนาฬิกาสามสิบนาที")
self.assertEqual(
time_to_thaiword("18:30:01"), "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที"
)
@@ -389,9 +370,7 @@ def test_thaiword_to_time(self):
self.assertEqual(thaiword_to_time("สิบโมงเช้าสิบสองนาที"), "10:12")
self.assertEqual(thaiword_to_time("บ่ายโมงสิบสามนาที"), "13:13")
self.assertEqual(thaiword_to_time("ศูนย์นาฬิกาสิบเอ็ดนาที"), "00:11")
- self.assertEqual(
- thaiword_to_time("บ่ายโมงเย็นสามสิบเอ็ดนาที"), "13:31"
- )
+ self.assertEqual(thaiword_to_time("บ่ายโมงเย็นสามสิบเอ็ดนาที"), "13:31")
self.assertEqual(thaiword_to_time("เที่ยงคืนหนึ่งนาที"), "00:01")
self.assertEqual(thaiword_to_time("เที่ยงครึ่ง"), "12:30")
self.assertEqual(thaiword_to_time("ห้าโมงเย็นสามสิบสี่นาที"), "17:34")
@@ -412,9 +391,7 @@ def test_thaiword_to_time(self):
def test_thaiword_to_date(self):
now = datetime.now()
- self.assertEqual(
- now + timedelta(days=0), thaiword_to_date("วันนี้", now)
- )
+ self.assertEqual(now + timedelta(days=0), thaiword_to_date("วันนี้", now))
self.assertEqual(
now + timedelta(days=1),
thaiword_to_date("พรุ่งนี้", now),
@@ -548,52 +525,25 @@ def test_normalize(self):
# maiyamok
self.assertEqual(
maiyamok("เด็กๆชอบไปโรงเรียน"),
- ['เด็ก', 'เด็ก', 'ชอบ', 'ไป', 'โรงเรียน']
- )
- self.assertEqual(
- maiyamok([
- "ทำไม",
- "คน",
- "ดี",
- " ",
- "ๆ",
- "ๆ",
- " ",
- "ถึง",
- "ทำ",
- "ไม่ได้"
- ]),
- ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
- )
- self.assertEqual(
- maiyamok([
- "ทำไม",
- "คน",
- "ดี",
- " ",
- " ๆ",
- "ๆ",
- " ",
- "ถึง",
- "ทำ",
- "ไม่ได้"
- ]),
- ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
- )
- self.assertEqual(
- maiyamok([
- "ทำไม",
- "คน",
- "ดีๆ",
- " ",
- "ๆ",
- "ๆ",
- " ",
- "ถึง",
- "ทำ",
- "ไม่ได้"
- ]),
- ["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"]
+ ["เด็ก", "เด็ก", "ชอบ", "ไป", "โรงเรียน"],
+ )
+ self.assertEqual(
+ maiyamok(
+ ["ทำไม", "คน", "ดี", " ", "ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"]
+ ),
+ ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"],
+ )
+ self.assertEqual(
+ maiyamok(
+ ["ทำไม", "คน", "ดี", " ", " ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"]
+ ),
+ ["ทำไม", "คน", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"],
+ )
+ self.assertEqual(
+ maiyamok(
+ ["ทำไม", "คน", "ดีๆ", " ", "ๆ", "ๆ", " ", "ถึง", "ทำ", "ไม่ได้"]
+ ),
+ ["ทำไม", "คน", "ดี", "ดี", "ดี", "ดี", " ", "ถึง", "ทำ", "ไม่ได้"],
)
# ### pythainlp.util.thai
@@ -611,34 +561,34 @@ def test_count_thai_chars(self):
self.assertEqual(
count_thai_chars("ทดสอบภาษาไทย"),
{
- 'vowels': 3,
- 'lead_vowels': 1,
- 'follow_vowels': 2,
- 'above_vowels': 0,
- 'below_vowels': 0,
- 'consonants': 9,
- 'tonemarks': 0,
- 'signs': 0,
- 'thai_digits': 0,
- 'punctuations': 0,
- 'non_thai': 0,
- }
+ "vowels": 3,
+ "lead_vowels": 1,
+ "follow_vowels": 2,
+ "above_vowels": 0,
+ "below_vowels": 0,
+ "consonants": 9,
+ "tonemarks": 0,
+ "signs": 0,
+ "thai_digits": 0,
+ "punctuations": 0,
+ "non_thai": 0,
+ },
)
self.assertEqual(
count_thai_chars("มี ๕ บาทไหม๏ เกมส์หรือเกมกันแน่ที่กรุเทพฯ ใช้"),
{
- 'vowels': 12,
- 'lead_vowels': 6,
- 'follow_vowels': 1,
- 'above_vowels': 4,
- 'below_vowels': 1,
- 'consonants': 22,
- 'tonemarks': 3,
- 'signs': 2,
- 'thai_digits': 1,
- 'punctuations': 1,
- 'non_thai': 4,
- }
+ "vowels": 12,
+ "lead_vowels": 6,
+ "follow_vowels": 1,
+ "above_vowels": 4,
+ "below_vowels": 1,
+ "consonants": 22,
+ "tonemarks": 3,
+ "signs": 2,
+ "thai_digits": 1,
+ "punctuations": 1,
+ "non_thai": 4,
+ },
)
def test_isthaichar(self):
@@ -687,13 +637,8 @@ def test_display_thai_char(self):
def test_emoji_to_thai(self):
self.assertEqual(
- emoji_to_thai(
- "จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀"
- ),
- (
- "จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ "
- "ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:"
- ),
+ emoji_to_thai("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀"),
+ ("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ " "ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:"),
)
self.assertEqual(
emoji_to_thai("หิวข้าวอยากกินอาหารญี่ปุ่น 🍣"),
@@ -787,16 +732,13 @@ def test_to_idna(self):
def test_thai_word_tone_detector(self):
self.assertIsNotNone(thai_word_tone_detector("คนดี"))
self.assertEqual(
- thai_word_tone_detector("ราคา"),
- [('รา', 'm'), ('คา', 'm')]
+ thai_word_tone_detector("ราคา"), [("รา", "m"), ("คา", "m")]
)
def test_thai_strptime(self):
self.assertIsNotNone(
thai_strptime(
- "05-7-65 09:00:01.10600",
- "%d-%B-%Y %H:%M:%S.%f",
- year="be"
+ "05-7-65 09:00:01.10600", "%d-%B-%Y %H:%M:%S.%f", year="be"
)
)
self.assertIsNotNone(
@@ -804,14 +746,12 @@ def test_thai_strptime(self):
"24-6-75 09:00:00",
"%d-%B-%Y %H:%M:%S",
year="be",
- add_year="2400"
+ add_year="2400",
)
)
self.assertIsNotNone(
thai_strptime(
- "05-7-22 09:00:01.10600",
- "%d-%B-%Y %H:%M:%S.%f",
- year="ad"
+ "05-7-22 09:00:01.10600", "%d-%B-%Y %H:%M:%S.%f", year="ad"
)
)
self.assertIsNotNone(
@@ -819,7 +759,7 @@ def test_thai_strptime(self):
"05-7-99 09:00:01.10600",
"%d-%B-%Y %H:%M:%S.%f",
year="ad",
- add_year="1900"
+ add_year="1900",
)
)
@@ -837,11 +777,12 @@ def test_convert_years(self):
self.assertEqual(convert_years("242", src="re", target="ad"), "2023")
self.assertEqual(convert_years("242", src="re", target="ah"), "1444")
with self.assertRaises(NotImplementedError):
- self.assertIsNotNone(convert_years(
- "2023", src="cat", target="dog"))
+ self.assertIsNotNone(
+ convert_years("2023", src="cat", target="dog")
+ )
def test_nectec_to_ipa(self):
- self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), 'kl uua j ˥˩')
+ self.assertEqual(nectec_to_ipa("kl-uua-j^-2"), "kl uua j ˥˩")
def test_ipa_to_rtgs(self):
self.assertEqual(ipa_to_rtgs("kluaj"), "kluai")
@@ -852,15 +793,17 @@ def test_remove_tone_ipa(self):
self.assertEqual(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"), "laː.sa.maj")
def test_tis620_to_utf8(self):
- self.assertEqual(tis620_to_utf8(
- "¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")
+ self.assertEqual(
+ tis620_to_utf8("¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม"
+ )
def test_spell_word(self):
- self.assertEqual(spell_word("เสือ"), ['สอ', 'เอือ', 'เสือ'])
- self.assertEqual(spell_word("เสื้อ"), ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
- self.assertEqual(spell_word("คน"), ['คอ', 'นอ', 'คน'])
- self.assertEqual(spell_word("คนดี"), [
- 'คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])
+ self.assertEqual(spell_word("เสือ"), ["สอ", "เอือ", "เสือ"])
+ self.assertEqual(spell_word("เสื้อ"), ["สอ", "เอือ", "ไม้โท", "เสื้อ"])
+ self.assertEqual(spell_word("คน"), ["คอ", "นอ", "คน"])
+ self.assertEqual(
+ spell_word("คนดี"), ["คอ", "นอ", "คน", "ดอ", "อี", "ดี", "คนดี"]
+ )
def test_rhyme(self):
self.assertIsInstance(rhyme("แมว"), list)
@@ -869,26 +812,24 @@ def test_rhyme(self):
def test_remove_repeat_consonants(self):
# update of pythainlp.copus.thai_words() able to break this
self.assertEqual(
- remove_trailing_repeat_consonants('เริ่ดดดดดดดด'),
- 'เริ่ด'
+ remove_trailing_repeat_consonants("เริ่ดดดดดดดด"), "เริ่ด"
)
self.assertEqual(
- remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม'),
- 'อืมมม'
+ remove_trailing_repeat_consonants("อืมมมมมมมมมมมมมมม"), "อืมมม"
)
- custom_dictionary = dict_trie(["อืมมมมม"])
+ custom_dict = dict_trie(["อืมมมมม"])
self.assertEqual(
- remove_trailing_repeat_consonants('อืมมมมมมมมมมมมมมม', custom_dictionary),
- 'อืมมมมม'
+ remove_trailing_repeat_consonants("อืมมมมมมมมมมมมมมม", custom_dict),
+ "อืมมมมม",
)
self.assertEqual(
remove_trailing_repeat_consonants(
- 'อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด '
- 'ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ'
+ "อืมมมมมมมมมมมมม คุณมีบุคลิกที่เริ่ดดดดด "
+ "ฉันจะให้เกรดดีกับคุณณณ\nนี่เป็นความลับบบบบ"
),
- 'อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ'
+ "อืมมม คุณมีบุคลิกที่เริ่ด ฉันจะให้เกรดดีกับคุณ\nนี่เป็นความลับ",
)
# def test_abbreviation_to_full_text(self):
diff --git a/tests/test_wsd.py b/tests/test_wsd.py
index b58fe76fa..e6666a7dc 100644
--- a/tests/test_wsd.py
+++ b/tests/test_wsd.py
@@ -5,6 +5,6 @@
class TestWsdPackage(unittest.TestCase):
def test_get_sense(self):
- self.assertIsNotNone(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้"))
- self.assertIsNotNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้"))
- self.assertIsNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คน"))
+ self.assertTrue(get_sense("เขากำลังอบขนมคุกกี้", "คุกกี้"))
+ self.assertTrue(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน", "คุกกี้"))
+ self.assertFalse(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน", "คน"))