diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index bff98f67e..8b8b08c14 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -19,34 +19,54 @@ Modules Tokenization Engines -------------------- -newmm -+++++ -.. automodule:: pythainlp.tokenize.newmm -.. autofunction:: pythainlp.tokenize.newmm.segment +Word level +---------- +attacut ++++++++ +.. automodule:: pythainlp.tokenize.attacut -longest +.. autoclass:: pythainlp.tokenize.attacut.AttacutTokenizer + :members: + +deepcut +++++++ -.. automodule:: pythainlp.tokenize.longest +.. automodule:: pythainlp.tokenize.deepcut multi_cut +++++++++ .. automodule:: pythainlp.tokenize.multi_cut +.. autofunction:: pythainlp.tokenize.multi_cut.segment +.. autofunction:: pythainlp.tokenize.multi_cut.find_all_segment + +longest ++++++++ +.. automodule:: pythainlp.tokenize.longest + +.. autofunction:: pythainlp.tokenize.longest.segment + pyicu +++++ .. automodule:: pythainlp.tokenize.pyicu -deepcut -+++++++ -.. automodule:: pythainlp.tokenize.deepcut +nercut +++++++ +.. automodule:: pythainlp.tokenize.nercut -attacut -+++++++ -.. automodule:: pythainlp.tokenize.attacut +.. autofunction:: pythainlp.tokenize.nercut.segment -.. autoclass:: pythainlp.tokenize.attacut.AttacutTokenizer - :members: +newmm ++++++ + +The default word tokenization engine. + +.. automodule:: pythainlp.tokenize.newmm + +.. autofunction:: pythainlp.tokenize.newmm.segment + +Subword level +------------- tcc +++ diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 50f0e0bfb..502798fad 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -36,6 +36,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]: ['และ', 'คุณ', 'เล่น', 'มือถือ'], ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']] """ + if not doc or not isinstance(doc, str): + return [] + from .crfcls import segment return segment(doc) @@ -74,6 +77,9 @@ def word_tokenize( * *deepcut* - wrapper for `DeepCut `_, learning-based approach + * *nercut* - Dictionary-based maximal matching word segmentation, + constrained with Thai Character Cluster (TCC) boundaries, + and combining tokens that are parts of the same named-entity. :Note: - The parameter **custom_dict** can be provided as an argument \ @@ -162,6 +168,10 @@ def word_tokenize( elif engine == "icu": from .pyicu import segment + segments = segment(text) + elif engine == "nercut": + from .nercut import segment + segments = segment(text) else: raise ValueError( diff --git a/pythainlp/tokenize/nercut.py b/pythainlp/tokenize/nercut.py new file mode 100644 index 000000000..2b3d5bff2 --- /dev/null +++ b/pythainlp/tokenize/nercut.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +""" +nercut 0.1 + +Dictionary-based maximal matching word segmentation, constrained with +Thai Character Cluster (TCC) boundaries, and combining tokens that are +parts of the same named-entity. + +Code by Wannaphong Phatthiyaphaibun +""" +from typing import Iterable, List + +from pythainlp.tag.named_entity import ThaiNameTagger + +_thainer = ThaiNameTagger() + + +def segment( + text: str, + taglist: Iterable[str] = [ + "ORGANIZATION", + "PERSON", + "PHONE", + "EMAIL", + "DATE", + "TIME", + ], +) -> List[str]: + """ + Dictionary-based maximal matching word segmentation, constrained with + Thai Character Cluster (TCC) boundaries, and combining tokens that are + parts of the same named-entity. 
+ + :param str text: text to be tokenized to words + :parm list taglist: a list of named-entity tags to be used + :return: list of words, tokenized from the text + """ + if not text or not isinstance(text, str): + return [] + + global _thainer + tagged_words = _thainer.get_ner(text, pos=False) + + words = [] + combining_word = "" + combining_word = "" + for curr_word, curr_tag in tagged_words: + if curr_tag != "O": + tag = curr_tag[2:] + else: + tag = "O" + + if curr_tag.startswith("B-") and tag in taglist: + if combining_word != "": + words.append(combining_word) + combining_word = curr_word + elif ( + curr_tag.startswith("I-") + and combining_word != "" + and tag in taglist + ): + combining_word += curr_word + elif ( + curr_tag == "O" + and combining_word != "" + ): + words.append(combining_word) + combining_word = "" + words.append(curr_word) + else: + combining_word = "" + words.append(curr_word) + + if combining_word != "": + words.append(combining_word) + + return words diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index ee66a75e3..a66494754 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -7,24 +7,26 @@ Tokenizer, attacut, clause_tokenize, -) -from pythainlp.tokenize import deepcut as tokenize_deepcut -from pythainlp.tokenize import etcc, longest, multi_cut, newmm -from pythainlp.tokenize import pyicu as tokenize_pyicu -from pythainlp.tokenize import ( + deepcut, + etcc, + longest, + multi_cut, + nercut, + newmm, + pyicu, sent_tokenize, + ssg, subword_tokenize, syllable_tokenize, tcc, word_tokenize, ) -from pythainlp.tokenize.ssg import segment as ssg_segment from pythainlp.util import dict_trie class TestTokenizePackage(unittest.TestCase): def setUp(self): - self.text_1 = "หมอนทองตากลมหูว์MBK39" + self.text_1 = "หมอนทองตากลมหูว์MBK39 :.ฉฺ๐๐๓-#™±" self.text_2 = "ทดสอบ" self.long_text = ( @@ -189,34 +191,127 @@ def setUp(self): "กกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก" ) + def test_Tokenizer(self): + _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE) + self.assertEqual(_tokenizer.word_tokenize(""), []) + _tokenizer.set_tokenize_engine("longest") + self.assertEqual(_tokenizer.word_tokenize(None), []) + + _tokenizer = Tokenizer() + self.assertEqual(_tokenizer.word_tokenize("ก"), ["ก"]) + def test_clause_tokenize(self): + self.assertEqual(clause_tokenize(None), []) + self.assertEqual(clause_tokenize(""), []) self.assertIsNotNone(clause_tokenize(["ฉัน", "ทดสอบ"])) self.assertIsInstance(clause_tokenize(["ฉัน", "ทดสอบ"]), list) - def test_Tokenizer(self): - t_test = Tokenizer(DEFAULT_WORD_DICT_TRIE) - self.assertEqual(t_test.word_tokenize(""), []) - t_test.set_tokenize_engine("longest") - self.assertEqual(t_test.word_tokenize(None), []) + def test_sent_tokenize(self): + self.assertEqual(sent_tokenize(None), []) + self.assertEqual(sent_tokenize(""), []) + self.assertEqual( + sent_tokenize("รักน้ำ รักปลา ", engine="whitespace"), + ["รักน้ำ", "รักปลา", ""], + ) + self.assertEqual( + sent_tokenize("รักน้ำ รักปลา ", engine="whitespace+newline"), + ["รักน้ำ", "รักปลา"], + ) - t_test = Tokenizer() - self.assertEqual(t_test.word_tokenize("ก"), ["ก"]) + sent_1 = "ฉันไปโรงเรียน เธอไปโรงพยาบาล" + sent_1_toks = ["ฉันไปโรงเรียน ", "เธอไปโรงพยาบาล"] + sent_2 = "วันนี้ฉันกินข้าว และโดดเรียน" + sent_2_toks = ["วันนี้ฉันกินข้าว และโดดเรียน"] + sent_3 = ( + "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" + + "จากผลงานวิจัยที่เคยทำมาในอดีต" + + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" + + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" + ) + sent_3_toks = [ + "(1) 
บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" + + "จากผลงานวิจัยที่เคยทำมาในอดีต ", + "มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ", + "จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้", + ] - def test_etcc(self): - self.assertEqual(etcc.segment(None), []) - self.assertEqual(etcc.segment(""), []) - self.assertIsInstance(etcc.segment("คืนความสุข"), list) self.assertEqual( - etcc.segment("หาเงินเพื่อเรียน"), - ["หา", "เงิน", "เพื่", "อ", "เรีย", "น"], + sent_tokenize(sent_1, engine="crfcut"), sent_1_toks, + ) + self.assertEqual( + sent_tokenize(sent_2, engine="crfcut"), sent_2_toks, + ) + self.assertEqual( + sent_tokenize(sent_3, engine="crfcut"), sent_3_toks, + ) + self.assertEqual( + sent_tokenize(sent_1), sent_1_toks, + ) + self.assertEqual( + sent_tokenize(sent_2), sent_2_toks, + ) + self.assertEqual( + sent_tokenize(sent_3), sent_3_toks, ) - self.assertEqual(etcc.segment("หนังสือ"), ["ห", "นัง", "สือ"]) self.assertIsNotNone( - etcc.segment( - "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" - + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" + sent_tokenize(sent_1, keep_whitespace=False, engine="whitespace",), + ) + self.assertFalse( + " " + in sent_tokenize( + sent_1, engine="whitespace", keep_whitespace=False, ) ) + with self.assertRaises(ValueError): + sent_tokenize("ฉันไป กิน", engine="XX") # engine does not exist + + def test_subword_tokenize(self): + self.assertEqual(subword_tokenize(None), []) + self.assertEqual(subword_tokenize(""), []) + self.assertIsInstance( + subword_tokenize("สวัสดีดาวอังคาร", engine="tcc"), list + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tcc") + ) + self.assertEqual(subword_tokenize(None, engine="etcc"), []) + self.assertEqual(subword_tokenize("", engine="etcc"), []) + self.assertIsInstance( + subword_tokenize("สวัสดิีดาวอังคาร", engine="etcc"), list + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc") + ) + self.assertIsInstance(subword_tokenize("โควิด19", engine="etcc"), list) + self.assertFalse( + " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) + ) + with self.assertRaises(ValueError): + subword_tokenize("นกแก้ว", engine="XX") # engine does not exist + + def test_syllable_tokenize(self): + self.assertEqual(syllable_tokenize(None), []) + self.assertEqual(syllable_tokenize(""), []) + self.assertEqual( + syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] + ) + self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก")) + self.assertEqual(syllable_tokenize(None, engine="ssg"), []) + self.assertEqual(syllable_tokenize("", engine="ssg"), []) + self.assertEqual( + syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] + ) + self.assertTrue( + "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + "า" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + " " in syllable_tokenize("พันธมิตร ชา นม", keep_whitespace=False) + ) + with self.assertRaises(ValueError): + syllable_tokenize("กรอเทป", engine="XX") # engine does not exist def test_word_tokenize(self): self.assertEqual(word_tokenize(""), []) @@ -224,12 +319,14 @@ def test_word_tokenize(self): word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) - self.assertIsNotNone(word_tokenize(self.text_1, engine="newmm")) - self.assertIsNotNone(word_tokenize(self.text_1, engine="mm")) - self.assertIsNotNone(word_tokenize(self.text_1, engine="longest")) - self.assertIsNotNone(word_tokenize(self.text_1, engine="icu")) - 
self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut")) self.assertIsNotNone(word_tokenize(self.text_1, engine="attacut")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="icu")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="longest")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="mm")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="nercut")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="newmm")) + with self.assertRaises(ValueError): word_tokenize("หมอนทอง", engine="XX") # engine does not exist @@ -237,13 +334,30 @@ def test_word_tokenize(self): "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])) ) - def test_word_tokenize_deepcut(self): - self.assertEqual(tokenize_deepcut.segment(None), []) - self.assertEqual(tokenize_deepcut.segment(""), []) + def test_attacut(self): + self.assertEqual(attacut.segment(None), []) + self.assertEqual(attacut.segment(""), []) + self.assertEqual( + word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="attacut"), + ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], + ) + self.assertEqual( + attacut.segment( + "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-sc" + ), + ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], + ) self.assertIsNotNone( - tokenize_deepcut.segment("ทดสอบ", DEFAULT_WORD_DICT_TRIE) + attacut.segment( + "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-c" + ) ) - self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", ["ทด", "สอบ"])) + + def test_deepcut(self): + self.assertEqual(deepcut.segment(None), []) + self.assertEqual(deepcut.segment(""), []) + self.assertIsNotNone(deepcut.segment("ทดสอบ", DEFAULT_WORD_DICT_TRIE)) + self.assertIsNotNone(deepcut.segment("ทดสอบ", ["ทด", "สอบ"])) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut")) self.assertIsNotNone( word_tokenize( @@ -251,15 +365,31 @@ def test_word_tokenize_deepcut(self): ) ) - def test_word_tokenize_icu(self): - self.assertEqual(tokenize_pyicu.segment(None), []) - self.assertEqual(tokenize_pyicu.segment(""), []) + def test_etcc(self): + self.assertEqual(etcc.segment(None), []) + self.assertEqual(etcc.segment(""), []) + self.assertIsInstance(etcc.segment("คืนความสุข"), list) + self.assertEqual( + etcc.segment("หาเงินเพื่อเรียน"), + ["หา", "เงิน", "เพื่", "อ", "เรีย", "น"], + ) + self.assertEqual(etcc.segment("หนังสือ"), ["ห", "นัง", "สือ"]) + self.assertIsNotNone( + etcc.segment( + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" + ) + ) + + def test_icu(self): + self.assertEqual(pyicu.segment(None), []) + self.assertEqual(pyicu.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"), ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) - def test_word_tokenize_longest(self): + def test_longest(self): self.assertEqual(longest.segment(None), []) self.assertEqual(longest.segment(""), []) self.assertIsInstance( @@ -278,7 +408,7 @@ def test_word_tokenize_longest(self): longest_tokenizer.word_tokenize("เฉียบพลัน"), ["เฉียบพลัน"], ) - def test_word_tokenize_mm(self): + def test_mm(self): self.assertEqual(multi_cut.segment(None), []) self.assertEqual(multi_cut.segment(""), []) self.assertIsNotNone(multi_cut.segment("ตัด", dict_trie([""]))) @@ -296,7 +426,7 @@ def test_word_tokenize_mm(self): ) self.assertEqual(multi_cut.find_all_segment(None), []) - def test_word_tokenize_newmm(self): + def 
test_newmm(self): self.assertEqual(newmm.segment(None), []) self.assertEqual(newmm.segment(""), []) self.assertEqual( @@ -328,7 +458,7 @@ def test_word_tokenize_newmm(self): " " in word_tokenize("จุ๋มง่วง", keep_whitespace=False,) ) - def test_word_tokenize_newmm_longtext(self): + def test_newmm_longtext(self): self.assertIsInstance( word_tokenize(self.long_text, engine="newmm"), list ) @@ -336,7 +466,7 @@ def test_word_tokenize_newmm_longtext(self): word_tokenize(self.long_text, engine="newmm-safe"), list ) - def test_word_tokenize_newmm_dangertext(self): + def test_newmm_dangertext(self): self.assertIsInstance( word_tokenize(self.danger_text1, engine="newmm"), list ) @@ -356,134 +486,19 @@ def test_word_tokenize_newmm_dangertext(self): word_tokenize(self.danger_text3, engine="newmm-safe"), list ) - def test_word_tokenize_attacut(self): - self.assertEqual(attacut.segment(None), []) - self.assertEqual(attacut.segment(""), []) - self.assertEqual( - word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="attacut"), - ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], - ) - self.assertEqual( - attacut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-sc"), - ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], - ) - self.assertIsNotNone( - attacut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-c") - ) - - def test_sent_tokenize(self): - self.assertEqual(sent_tokenize(None), []) - self.assertEqual(sent_tokenize(""), []) - self.assertEqual( - sent_tokenize("รักน้ำ รักปลา ", engine="whitespace"), - ["รักน้ำ", "รักปลา", ""], - ) - self.assertEqual( - sent_tokenize("รักน้ำ รักปลา ", engine="whitespace+newline"), - ["รักน้ำ", "รักปลา"], - ) - - sent_1 = "ฉันไปโรงเรียน เธอไปโรงพยาบาล" - sent_1_toks = ["ฉันไปโรงเรียน ", "เธอไปโรงพยาบาล"] - sent_2 = "วันนี้ฉันกินข้าว และโดดเรียน" - sent_2_toks = ["วันนี้ฉันกินข้าว และโดดเรียน"] - sent_3 = ( - "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" - + "จากผลงานวิจัยที่เคยทำมาในอดีต" - + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" - + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" - ) - sent_3_toks = [ - "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" - + "จากผลงานวิจัยที่เคยทำมาในอดีต ", - "มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ", - "จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้", - ] - - self.assertEqual( - sent_tokenize(sent_1, engine="crfcut"), sent_1_toks, - ) - self.assertEqual( - sent_tokenize(sent_2, engine="crfcut"), sent_2_toks, - ) - self.assertEqual( - sent_tokenize(sent_3, engine="crfcut"), sent_3_toks, - ) - self.assertEqual( - sent_tokenize(sent_1), sent_1_toks, - ) - self.assertEqual( - sent_tokenize(sent_2), sent_2_toks, - ) - self.assertEqual( - sent_tokenize(sent_3), sent_3_toks, - ) - self.assertIsNotNone( - sent_tokenize(sent_1, keep_whitespace=False, engine="whitespace",), - ) - self.assertFalse( - " " - in sent_tokenize( - sent_1, engine="whitespace", keep_whitespace=False, - ) - ) - with self.assertRaises(ValueError): - sent_tokenize("ฉันไป กิน", engine="XX") # engine does not exist - - def test_ssg_tokenize(self): - self.assertEqual(ssg_segment(None), []) - self.assertEqual(ssg_segment(""), []) - self.assertTrue( - "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") - ) - - def test_subword_tokenize(self): - self.assertEqual(subword_tokenize(None), []) - self.assertEqual(subword_tokenize(""), []) - self.assertIsInstance( - subword_tokenize("สวัสดีดาวอังคาร", engine="tcc"), list - ) - self.assertFalse( - "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tcc") - ) - 
self.assertEqual(subword_tokenize(None, engine="etcc"), []) - self.assertEqual(subword_tokenize("", engine="etcc"), []) - self.assertIsInstance( - subword_tokenize("สวัสดิีดาวอังคาร", engine="etcc"), list - ) - self.assertFalse( - "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc") - ) - self.assertIsInstance(subword_tokenize("โควิด19", engine="etcc"), list) - self.assertFalse( - " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) - ) - with self.assertRaises(ValueError): - subword_tokenize("นกแก้ว", engine="XX") # engine does not exist + def test_nercut(self): + self.assertEqual(nercut.segment(None), []) + self.assertEqual(nercut.segment(""), []) + self.assertIsNotNone(nercut.segment("ทดสอบ")) + self.assertIsNotNone(nercut.segment("ทดสอบ")) + self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut")) - def test_syllable_tokenize(self): - self.assertEqual(syllable_tokenize(None), []) - self.assertEqual(syllable_tokenize(""), []) - self.assertEqual( - syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] - ) - self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก")) - self.assertEqual(syllable_tokenize(None, engine="ssg"), []) - self.assertEqual(syllable_tokenize("", engine="ssg"), []) - self.assertEqual( - syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] - ) + def test_ssg(self): + self.assertEqual(ssg.segment(None), []) + self.assertEqual(ssg.segment(""), []) self.assertTrue( "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") ) - self.assertFalse( - "า" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") - ) - self.assertFalse( - " " in syllable_tokenize("พันธมิตร ชา นม", keep_whitespace=False) - ) - with self.assertRaises(ValueError): - syllable_tokenize("กรอเทป", engine="XX") # engine does not exist def test_tcc(self): self.assertEqual(tcc.segment(None), [])