From eee7a655da6823a3059216a65233d9543cd1c49a Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 16 Dec 2020 19:08:47 +0700 Subject: [PATCH 1/8] Add NERCut --- pythainlp/tokenize/core.py | 4 +++ pythainlp/tokenize/nercut.py | 66 ++++++++++++++++++++++++++++++++++++ tests/test_tokenize.py | 11 ++++++ 3 files changed, 81 insertions(+) create mode 100644 pythainlp/tokenize/nercut.py diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 50f0e0bfb..862b86e52 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -162,6 +162,10 @@ def word_tokenize( elif engine == "icu": from .pyicu import segment + segments = segment(text) + elif engine == "nercut": + from .nercut import segment + segments = segment(text) else: raise ValueError( diff --git a/pythainlp/tokenize/nercut.py b/pythainlp/tokenize/nercut.py new file mode 100644 index 000000000..3735e8ec1 --- /dev/null +++ b/pythainlp/tokenize/nercut.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +""" +nercut 0.1 + +Code by Wannaphong Phatthiyaphaibun +""" +from typing import List +from pythainlp.tag.named_entity import ThaiNameTagger + +_thainer = ThaiNameTagger() + +def segment( + text: str, + tag:List[str] = [ + "ORGANIZATION", + "PERSON", + "PHONE", + "EMAIL", + "DATE", + "TIME" + ] +) -> List[str]: + """ + nercut 0.1 + + Code by Wannaphong Phatthiyaphaibun + + neww+thainer word segmentation. + + :param str text: text to be tokenized to words + :parm list tag: ThaiNER tag + :return: list of words, tokenized from the text + """ + global _thainer + if not text or not isinstance(text, str): + return [] + + _ws = _thainer.get_ner(text, pos = False) + _list_w = [] + _bi = "" + _tag = "" + for i,t in _ws: + if t != "O": + _tag_temp = t.split('-')[1] + else: + _tag_temp = "O" + if t.startswith('B-') and _tag_temp in tag: + if _bi!="" and _tag in tag: + _list_w.append(_bi) + _bi="" + _bi += i + _tag = t.replace('B-','') + elif t.startswith('I-') and t.replace('I-','') == _tag and _tag_temp in tag: + _bi += i + elif t == "O" and _tag != "" and _tag in tag: + _list_w.append(_bi) + _bi="" + _tag = "" + _list_w.append(i) + else: + _bi="" + _tag = "" + _list_w.append(i) + if _bi!="": + _list_w.append(_bi) + return _list_w \ No newline at end of file diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 35256e088..374e05676 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -20,6 +20,7 @@ ) from pythainlp.tokenize.ssg import segment as ssg_segment from pythainlp.util import dict_trie +from pythainlp.tokenize import nercut as tokenize_nercut class TestTokenizePackage(unittest.TestCase): @@ -230,6 +231,7 @@ def test_word_tokenize(self): self.assertIsNotNone(word_tokenize(self.text_1, engine="icu")) self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut")) self.assertIsNotNone(word_tokenize(self.text_1, engine="attacut")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="nercut")) with self.assertRaises(ValueError): word_tokenize("หมอนทอง", engine="XX") # engine does not exist @@ -364,6 +366,15 @@ def test_word_tokenize_attacut(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) + def test_word_tokenize_nercut(self): + self.assertEqual(tokenize_nercut.segment(None), []) + self.assertEqual(tokenize_nercut.segment(""), []) + self.assertIsNotNone( + tokenize_nercut.segment("ทดสอบ") + ) + self.assertIsNotNone(tokenize_nercut.segment("ทดสอบ")) + self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut")) + def test_sent_tokenize(self): 
self.assertEqual(sent_tokenize(None), []) self.assertEqual(sent_tokenize(""), []) From 08994e433aebef91d1bab56d7d48262ed7c359e1 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Wed, 16 Dec 2020 21:21:10 +0700 Subject: [PATCH 2/8] More readable variable names (#504) --- pythainlp/tokenize/nercut.py | 86 ++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 33 deletions(-) diff --git a/pythainlp/tokenize/nercut.py b/pythainlp/tokenize/nercut.py index 3735e8ec1..b1f90d4bb 100644 --- a/pythainlp/tokenize/nercut.py +++ b/pythainlp/tokenize/nercut.py @@ -2,65 +2,85 @@ """ nercut 0.1 +Dictionary-based maximal matching word segmentation, constrained with +Thai Character Cluster (TCC) boundaries, and combining tokens that are +parts of the same named-entity. + Code by Wannaphong Phatthiyaphaibun """ from typing import List + from pythainlp.tag.named_entity import ThaiNameTagger _thainer = ThaiNameTagger() + def segment( text: str, - tag:List[str] = [ + taglist: List[str] = [ "ORGANIZATION", "PERSON", "PHONE", "EMAIL", "DATE", - "TIME" - ] + "TIME", + ], ) -> List[str]: """ nercut 0.1 Code by Wannaphong Phatthiyaphaibun - neww+thainer word segmentation. + Dictionary-based maximal matching word segmentation, constrained with + Thai Character Cluster (TCC) boundaries, and combining tokens that are + parts of the same named-entity. :param str text: text to be tokenized to words - :parm list tag: ThaiNER tag + :parm list taglist: a list of named-entity tags to be used :return: list of words, tokenized from the text """ - global _thainer if not text or not isinstance(text, str): return [] - _ws = _thainer.get_ner(text, pos = False) - _list_w = [] - _bi = "" - _tag = "" - for i,t in _ws: - if t != "O": - _tag_temp = t.split('-')[1] + global _thainer + tagged_words = _thainer.get_ner(text, pos=False) + + words = [] + combining_word = "" + combining_word = "" + for curr_word, curr_tag in tagged_words: + if curr_tag != "O": + tag = curr_tag[2:] else: - _tag_temp = "O" - if t.startswith('B-') and _tag_temp in tag: - if _bi!="" and _tag in tag: - _list_w.append(_bi) - _bi="" - _bi += i - _tag = t.replace('B-','') - elif t.startswith('I-') and t.replace('I-','') == _tag and _tag_temp in tag: - _bi += i - elif t == "O" and _tag != "" and _tag in tag: - _list_w.append(_bi) - _bi="" - _tag = "" - _list_w.append(i) + tag = "O" + + if curr_tag.startswith("B-") and tag in taglist: + if combining_word != "" and combining_word in taglist: + words.append(combining_word) + combining_word = "" + combining_word += curr_word + combining_word = curr_tag[2:] + elif ( + curr_tag.startswith("I-") + and curr_tag[2:] == combining_word + and tag in taglist + ): + combining_word += curr_word + elif ( + curr_tag == "O" + and combining_word != "" + and combining_word in taglist + ): + words.append(combining_word) + combining_word = "" + combining_word = "" + words.append(curr_word) else: - _bi="" - _tag = "" - _list_w.append(i) - if _bi!="": - _list_w.append(_bi) - return _list_w \ No newline at end of file + combining_word = "" + combining_word = "" + words.append(curr_word) + + if combining_word != "": + words.append(combining_word) + + return words From 23101998044b263099dab1e2b12ef9a0650c8da3 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 16 Dec 2020 22:09:35 +0700 Subject: [PATCH 3/8] Update nercut docs and update nercut code --- pythainlp/tokenize/core.py | 3 +++ pythainlp/tokenize/nercut.py | 11 +++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git 
a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 862b86e52..0c043414d 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -74,6 +74,9 @@ def word_tokenize( * *deepcut* - wrapper for `DeepCut `_, learning-based approach + * *nercut* - Dictionary-based maximal matching word segmentation, + constrained with Thai Character Cluster (TCC) boundaries, + and combining tokens that are parts of the same named-entity. :Note: - The parameter **custom_dict** can be provided as an argument \ diff --git a/pythainlp/tokenize/nercut.py b/pythainlp/tokenize/nercut.py index b1f90d4bb..6bef9d7bd 100644 --- a/pythainlp/tokenize/nercut.py +++ b/pythainlp/tokenize/nercut.py @@ -55,28 +55,23 @@ def segment( tag = "O" if curr_tag.startswith("B-") and tag in taglist: - if combining_word != "" and combining_word in taglist: + if combining_word != "": words.append(combining_word) - combining_word = "" - combining_word += curr_word - combining_word = curr_tag[2:] + combining_word = curr_word elif ( curr_tag.startswith("I-") - and curr_tag[2:] == combining_word + and combining_word != "" and tag in taglist ): combining_word += curr_word elif ( curr_tag == "O" and combining_word != "" - and combining_word in taglist ): words.append(combining_word) combining_word = "" - combining_word = "" words.append(curr_word) else: - combining_word = "" combining_word = "" words.append(curr_word) From bc0ad857738c785e2a15d4ec348526c66757e7d1 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 16 Dec 2020 22:45:32 +0700 Subject: [PATCH 4/8] Update tokenize.rst --- docs/api/tokenize.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index b66642cc3..c4656581e 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -22,6 +22,7 @@ Tokenization Engines newmm +++++ .. automodule:: pythainlp.tokenize.newmm + .. autofunction:: pythainlp.tokenize.newmm.segment @@ -29,10 +30,15 @@ longest +++++++ .. automodule:: pythainlp.tokenize.longest +.. autofunction:: pythainlp.tokenize.longest.segment + multi_cut +++++++++ .. automodule:: pythainlp.tokenize.multi_cut +.. autofunction:: pythainlp.tokenize.multi_cut.segment +.. autofunction:: pythainlp.tokenize.multi_cut.find_all_segment + pyicu +++++ .. automodule:: pythainlp.tokenize.pyicu @@ -45,6 +51,12 @@ attacut +++++++ .. automodule:: pythainlp.tokenize.attacut +nercut +++++++ +.. automodule:: pythainlp.tokenize.nercut + +.. autofunction:: pythainlp.tokenize.nercut.segment + tcc +++ .. 
automodule:: pythainlp.tokenize.tcc From 5040c10c2a8003b930f9538af8b503744f350ffd Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 18 Dec 2020 10:41:35 +0700 Subject: [PATCH 5/8] Update nercut.py --- pythainlp/tokenize/nercut.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pythainlp/tokenize/nercut.py b/pythainlp/tokenize/nercut.py index 6bef9d7bd..2b3d5bff2 100644 --- a/pythainlp/tokenize/nercut.py +++ b/pythainlp/tokenize/nercut.py @@ -8,7 +8,7 @@ Code by Wannaphong Phatthiyaphaibun """ -from typing import List +from typing import Iterable, List from pythainlp.tag.named_entity import ThaiNameTagger @@ -17,7 +17,7 @@ def segment( text: str, - taglist: List[str] = [ + taglist: Iterable[str] = [ "ORGANIZATION", "PERSON", "PHONE", @@ -27,10 +27,6 @@ def segment( ], ) -> List[str]: """ - nercut 0.1 - - Code by Wannaphong Phatthiyaphaibun - Dictionary-based maximal matching word segmentation, constrained with Thai Character Cluster (TCC) boundaries, and combining tokens that are parts of the same named-entity. From b0c7199636304acad3af623d8e11ef4372830a9a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 18 Dec 2020 10:45:20 +0700 Subject: [PATCH 6/8] Update test_tokenize.py --- tests/test_tokenize.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 5ff13efae..71cea57b5 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -9,7 +9,9 @@ clause_tokenize, ) from pythainlp.tokenize import deepcut as tokenize_deepcut -from pythainlp.tokenize import etcc, longest, multi_cut, newmm +from pythainlp.tokenize import etcc, longest, multi_cut +from pythainlp.tokenize import nercut as tokenize_nercut +from pythainlp.tokenize import newmm from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.tokenize import ( sent_tokenize, @@ -20,7 +22,6 @@ ) from pythainlp.tokenize.ssg import segment as ssg_segment from pythainlp.util import dict_trie -from pythainlp.tokenize import nercut as tokenize_nercut class TestTokenizePackage(unittest.TestCase): @@ -366,19 +367,21 @@ def test_word_tokenize_attacut(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) self.assertEqual( - attacut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-sc"), + attacut.segment( + "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-sc" + ), ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) self.assertIsNotNone( - attacut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-c") + attacut.segment( + "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-c" + ) ) def test_word_tokenize_nercut(self): self.assertEqual(tokenize_nercut.segment(None), []) self.assertEqual(tokenize_nercut.segment(""), []) - self.assertIsNotNone( - tokenize_nercut.segment("ทดสอบ") - ) + self.assertIsNotNone(tokenize_nercut.segment("ทดสอบ")) self.assertIsNotNone(tokenize_nercut.segment("ทดสอบ")) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut")) From cca63296e7482ba24bcae5a3d4ae774a9adb9b95 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 18 Dec 2020 11:05:37 +0700 Subject: [PATCH 7/8] Update test_tokenize.py --- tests/test_tokenize.py | 357 +++++++++++++++++++++-------------------- 1 file changed, 179 insertions(+), 178 deletions(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 71cea57b5..a66494754 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -7,26 +7,26 @@ Tokenizer, attacut, 
clause_tokenize, -) -from pythainlp.tokenize import deepcut as tokenize_deepcut -from pythainlp.tokenize import etcc, longest, multi_cut -from pythainlp.tokenize import nercut as tokenize_nercut -from pythainlp.tokenize import newmm -from pythainlp.tokenize import pyicu as tokenize_pyicu -from pythainlp.tokenize import ( + deepcut, + etcc, + longest, + multi_cut, + nercut, + newmm, + pyicu, sent_tokenize, + ssg, subword_tokenize, syllable_tokenize, tcc, word_tokenize, ) -from pythainlp.tokenize.ssg import segment as ssg_segment from pythainlp.util import dict_trie class TestTokenizePackage(unittest.TestCase): def setUp(self): - self.text_1 = "หมอนทองตากลมหูว์MBK39" + self.text_1 = "หมอนทองตากลมหูว์MBK39 :.ฉฺ๐๐๓-#™±" self.text_2 = "ทดสอบ" self.long_text = ( @@ -191,34 +191,127 @@ def setUp(self): "กกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก" ) + def test_Tokenizer(self): + _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE) + self.assertEqual(_tokenizer.word_tokenize(""), []) + _tokenizer.set_tokenize_engine("longest") + self.assertEqual(_tokenizer.word_tokenize(None), []) + + _tokenizer = Tokenizer() + self.assertEqual(_tokenizer.word_tokenize("ก"), ["ก"]) + def test_clause_tokenize(self): + self.assertEqual(clause_tokenize(None), []) + self.assertEqual(clause_tokenize(""), []) self.assertIsNotNone(clause_tokenize(["ฉัน", "ทดสอบ"])) self.assertIsInstance(clause_tokenize(["ฉัน", "ทดสอบ"]), list) - def test_Tokenizer(self): - t_test = Tokenizer(DEFAULT_WORD_DICT_TRIE) - self.assertEqual(t_test.word_tokenize(""), []) - t_test.set_tokenize_engine("longest") - self.assertEqual(t_test.word_tokenize(None), []) + def test_sent_tokenize(self): + self.assertEqual(sent_tokenize(None), []) + self.assertEqual(sent_tokenize(""), []) + self.assertEqual( + sent_tokenize("รักน้ำ รักปลา ", engine="whitespace"), + ["รักน้ำ", "รักปลา", ""], + ) + self.assertEqual( + sent_tokenize("รักน้ำ รักปลา ", engine="whitespace+newline"), + ["รักน้ำ", "รักปลา"], + ) - t_test = Tokenizer() - self.assertEqual(t_test.word_tokenize("ก"), ["ก"]) + sent_1 = "ฉันไปโรงเรียน เธอไปโรงพยาบาล" + sent_1_toks = ["ฉันไปโรงเรียน ", "เธอไปโรงพยาบาล"] + sent_2 = "วันนี้ฉันกินข้าว และโดดเรียน" + sent_2_toks = ["วันนี้ฉันกินข้าว และโดดเรียน"] + sent_3 = ( + "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" + + "จากผลงานวิจัยที่เคยทำมาในอดีต" + + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" + + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" + ) + sent_3_toks = [ + "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" + + "จากผลงานวิจัยที่เคยทำมาในอดีต ", + "มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ", + "จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้", + ] - def test_etcc(self): - self.assertEqual(etcc.segment(None), []) - self.assertEqual(etcc.segment(""), []) - self.assertIsInstance(etcc.segment("คืนความสุข"), list) self.assertEqual( - etcc.segment("หาเงินเพื่อเรียน"), - ["หา", "เงิน", "เพื่", "อ", "เรีย", "น"], + sent_tokenize(sent_1, engine="crfcut"), sent_1_toks, + ) + self.assertEqual( + sent_tokenize(sent_2, engine="crfcut"), sent_2_toks, + ) + self.assertEqual( + sent_tokenize(sent_3, engine="crfcut"), sent_3_toks, + ) + self.assertEqual( + sent_tokenize(sent_1), sent_1_toks, + ) + self.assertEqual( + sent_tokenize(sent_2), sent_2_toks, + ) + self.assertEqual( + sent_tokenize(sent_3), sent_3_toks, ) - self.assertEqual(etcc.segment("หนังสือ"), ["ห", "นัง", "สือ"]) self.assertIsNotNone( - etcc.segment( - "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" - + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" + sent_tokenize(sent_1, keep_whitespace=False, 
engine="whitespace",), + ) + self.assertFalse( + " " + in sent_tokenize( + sent_1, engine="whitespace", keep_whitespace=False, ) ) + with self.assertRaises(ValueError): + sent_tokenize("ฉันไป กิน", engine="XX") # engine does not exist + + def test_subword_tokenize(self): + self.assertEqual(subword_tokenize(None), []) + self.assertEqual(subword_tokenize(""), []) + self.assertIsInstance( + subword_tokenize("สวัสดีดาวอังคาร", engine="tcc"), list + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tcc") + ) + self.assertEqual(subword_tokenize(None, engine="etcc"), []) + self.assertEqual(subword_tokenize("", engine="etcc"), []) + self.assertIsInstance( + subword_tokenize("สวัสดิีดาวอังคาร", engine="etcc"), list + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc") + ) + self.assertIsInstance(subword_tokenize("โควิด19", engine="etcc"), list) + self.assertFalse( + " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) + ) + with self.assertRaises(ValueError): + subword_tokenize("นกแก้ว", engine="XX") # engine does not exist + + def test_syllable_tokenize(self): + self.assertEqual(syllable_tokenize(None), []) + self.assertEqual(syllable_tokenize(""), []) + self.assertEqual( + syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] + ) + self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก")) + self.assertEqual(syllable_tokenize(None, engine="ssg"), []) + self.assertEqual(syllable_tokenize("", engine="ssg"), []) + self.assertEqual( + syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] + ) + self.assertTrue( + "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + "า" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + " " in syllable_tokenize("พันธมิตร ชา นม", keep_whitespace=False) + ) + with self.assertRaises(ValueError): + syllable_tokenize("กรอเทป", engine="XX") # engine does not exist def test_word_tokenize(self): self.assertEqual(word_tokenize(""), []) @@ -226,13 +319,14 @@ def test_word_tokenize(self): word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) - self.assertIsNotNone(word_tokenize(self.text_1, engine="newmm")) - self.assertIsNotNone(word_tokenize(self.text_1, engine="mm")) - self.assertIsNotNone(word_tokenize(self.text_1, engine="longest")) - self.assertIsNotNone(word_tokenize(self.text_1, engine="icu")) - self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut")) self.assertIsNotNone(word_tokenize(self.text_1, engine="attacut")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="icu")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="longest")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="mm")) self.assertIsNotNone(word_tokenize(self.text_1, engine="nercut")) + self.assertIsNotNone(word_tokenize(self.text_1, engine="newmm")) + with self.assertRaises(ValueError): word_tokenize("หมอนทอง", engine="XX") # engine does not exist @@ -240,13 +334,30 @@ def test_word_tokenize(self): "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])) ) - def test_word_tokenize_deepcut(self): - self.assertEqual(tokenize_deepcut.segment(None), []) - self.assertEqual(tokenize_deepcut.segment(""), []) + def test_attacut(self): + self.assertEqual(attacut.segment(None), []) + self.assertEqual(attacut.segment(""), []) + self.assertEqual( + word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", 
engine="attacut"), + ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], + ) + self.assertEqual( + attacut.segment( + "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-sc" + ), + ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], + ) self.assertIsNotNone( - tokenize_deepcut.segment("ทดสอบ", DEFAULT_WORD_DICT_TRIE) + attacut.segment( + "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-c" + ) ) - self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", ["ทด", "สอบ"])) + + def test_deepcut(self): + self.assertEqual(deepcut.segment(None), []) + self.assertEqual(deepcut.segment(""), []) + self.assertIsNotNone(deepcut.segment("ทดสอบ", DEFAULT_WORD_DICT_TRIE)) + self.assertIsNotNone(deepcut.segment("ทดสอบ", ["ทด", "สอบ"])) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut")) self.assertIsNotNone( word_tokenize( @@ -254,15 +365,31 @@ def test_word_tokenize_deepcut(self): ) ) - def test_word_tokenize_icu(self): - self.assertEqual(tokenize_pyicu.segment(None), []) - self.assertEqual(tokenize_pyicu.segment(""), []) + def test_etcc(self): + self.assertEqual(etcc.segment(None), []) + self.assertEqual(etcc.segment(""), []) + self.assertIsInstance(etcc.segment("คืนความสุข"), list) + self.assertEqual( + etcc.segment("หาเงินเพื่อเรียน"), + ["หา", "เงิน", "เพื่", "อ", "เรีย", "น"], + ) + self.assertEqual(etcc.segment("หนังสือ"), ["ห", "นัง", "สือ"]) + self.assertIsNotNone( + etcc.segment( + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" + ) + ) + + def test_icu(self): + self.assertEqual(pyicu.segment(None), []) + self.assertEqual(pyicu.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"), ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) - def test_word_tokenize_longest(self): + def test_longest(self): self.assertEqual(longest.segment(None), []) self.assertEqual(longest.segment(""), []) self.assertIsInstance( @@ -281,7 +408,7 @@ def test_word_tokenize_longest(self): longest_tokenizer.word_tokenize("เฉียบพลัน"), ["เฉียบพลัน"], ) - def test_word_tokenize_mm(self): + def test_mm(self): self.assertEqual(multi_cut.segment(None), []) self.assertEqual(multi_cut.segment(""), []) self.assertIsNotNone(multi_cut.segment("ตัด", dict_trie([""]))) @@ -299,7 +426,7 @@ def test_word_tokenize_mm(self): ) self.assertEqual(multi_cut.find_all_segment(None), []) - def test_word_tokenize_newmm(self): + def test_newmm(self): self.assertEqual(newmm.segment(None), []) self.assertEqual(newmm.segment(""), []) self.assertEqual( @@ -331,7 +458,7 @@ def test_word_tokenize_newmm(self): " " in word_tokenize("จุ๋มง่วง", keep_whitespace=False,) ) - def test_word_tokenize_newmm_longtext(self): + def test_newmm_longtext(self): self.assertIsInstance( word_tokenize(self.long_text, engine="newmm"), list ) @@ -339,7 +466,7 @@ def test_word_tokenize_newmm_longtext(self): word_tokenize(self.long_text, engine="newmm-safe"), list ) - def test_word_tokenize_newmm_dangertext(self): + def test_newmm_dangertext(self): self.assertIsInstance( word_tokenize(self.danger_text1, engine="newmm"), list ) @@ -359,145 +486,19 @@ def test_word_tokenize_newmm_dangertext(self): word_tokenize(self.danger_text3, engine="newmm-safe"), list ) - def test_word_tokenize_attacut(self): - self.assertEqual(attacut.segment(None), []) - self.assertEqual(attacut.segment(""), []) - self.assertEqual( - word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="attacut"), - ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], 
- ) - self.assertEqual( - attacut.segment( - "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-sc" - ), - ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], - ) - self.assertIsNotNone( - attacut.segment( - "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-c" - ) - ) - - def test_word_tokenize_nercut(self): - self.assertEqual(tokenize_nercut.segment(None), []) - self.assertEqual(tokenize_nercut.segment(""), []) - self.assertIsNotNone(tokenize_nercut.segment("ทดสอบ")) - self.assertIsNotNone(tokenize_nercut.segment("ทดสอบ")) + def test_nercut(self): + self.assertEqual(nercut.segment(None), []) + self.assertEqual(nercut.segment(""), []) + self.assertIsNotNone(nercut.segment("ทดสอบ")) + self.assertIsNotNone(nercut.segment("ทดสอบ")) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut")) - def test_sent_tokenize(self): - self.assertEqual(sent_tokenize(None), []) - self.assertEqual(sent_tokenize(""), []) - self.assertEqual( - sent_tokenize("รักน้ำ รักปลา ", engine="whitespace"), - ["รักน้ำ", "รักปลา", ""], - ) - self.assertEqual( - sent_tokenize("รักน้ำ รักปลา ", engine="whitespace+newline"), - ["รักน้ำ", "รักปลา"], - ) - - sent_1 = "ฉันไปโรงเรียน เธอไปโรงพยาบาล" - sent_1_toks = ["ฉันไปโรงเรียน ", "เธอไปโรงพยาบาล"] - sent_2 = "วันนี้ฉันกินข้าว และโดดเรียน" - sent_2_toks = ["วันนี้ฉันกินข้าว และโดดเรียน"] - sent_3 = ( - "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" - + "จากผลงานวิจัยที่เคยทำมาในอดีต" - + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" - + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" - ) - sent_3_toks = [ - "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" - + "จากผลงานวิจัยที่เคยทำมาในอดีต ", - "มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ", - "จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้", - ] - - self.assertEqual( - sent_tokenize(sent_1, engine="crfcut"), sent_1_toks, - ) - self.assertEqual( - sent_tokenize(sent_2, engine="crfcut"), sent_2_toks, - ) - self.assertEqual( - sent_tokenize(sent_3, engine="crfcut"), sent_3_toks, - ) - self.assertEqual( - sent_tokenize(sent_1), sent_1_toks, - ) - self.assertEqual( - sent_tokenize(sent_2), sent_2_toks, - ) - self.assertEqual( - sent_tokenize(sent_3), sent_3_toks, - ) - self.assertIsNotNone( - sent_tokenize(sent_1, keep_whitespace=False, engine="whitespace",), - ) - self.assertFalse( - " " - in sent_tokenize( - sent_1, engine="whitespace", keep_whitespace=False, - ) - ) - with self.assertRaises(ValueError): - sent_tokenize("ฉันไป กิน", engine="XX") # engine does not exist - - def test_ssg_tokenize(self): - self.assertEqual(ssg_segment(None), []) - self.assertEqual(ssg_segment(""), []) - self.assertTrue( - "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") - ) - - def test_subword_tokenize(self): - self.assertEqual(subword_tokenize(None), []) - self.assertEqual(subword_tokenize(""), []) - self.assertIsInstance( - subword_tokenize("สวัสดีดาวอังคาร", engine="tcc"), list - ) - self.assertFalse( - "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tcc") - ) - self.assertEqual(subword_tokenize(None, engine="etcc"), []) - self.assertEqual(subword_tokenize("", engine="etcc"), []) - self.assertIsInstance( - subword_tokenize("สวัสดิีดาวอังคาร", engine="etcc"), list - ) - self.assertFalse( - "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc") - ) - self.assertIsInstance(subword_tokenize("โควิด19", engine="etcc"), list) - self.assertFalse( - " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) - ) - with self.assertRaises(ValueError): - subword_tokenize("นกแก้ว", engine="XX") # engine does not exist - 
- def test_syllable_tokenize(self): - self.assertEqual(syllable_tokenize(None), []) - self.assertEqual(syllable_tokenize(""), []) - self.assertEqual( - syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] - ) - self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก")) - self.assertEqual(syllable_tokenize(None, engine="ssg"), []) - self.assertEqual(syllable_tokenize("", engine="ssg"), []) - self.assertEqual( - syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] - ) + def test_ssg(self): + self.assertEqual(ssg.segment(None), []) + self.assertEqual(ssg.segment(""), []) self.assertTrue( "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") ) - self.assertFalse( - "า" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg") - ) - self.assertFalse( - " " in syllable_tokenize("พันธมิตร ชา นม", keep_whitespace=False) - ) - with self.assertRaises(ValueError): - syllable_tokenize("กรอเทป", engine="XX") # engine does not exist def test_tcc(self): self.assertEqual(tcc.segment(None), []) From 1a7cbc9ec5dcd982644abc9ed18d4ca4e3e6daa9 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 18 Dec 2020 11:21:53 +0700 Subject: [PATCH 8/8] Update core.py --- pythainlp/tokenize/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 0c043414d..502798fad 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -36,6 +36,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]: ['และ', 'คุณ', 'เล่น', 'มือถือ'], ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']] """ + if not doc or not isinstance(doc, str): + return [] + from .crfcls import segment return segment(doc)
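
A minimal usage sketch of the nercut engine added by this series, for trying it locally. It assumes a PyThaiNLP checkout with all eight patches applied; the exact tokens returned depend on the ThaiNER model, so the calls below only print whatever that model produces rather than asserting a fixed output.

    from pythainlp.tokenize import nercut, word_tokenize

    text = "ทดสอบ"

    # Dispatch through the generic tokenizer, as wired up in core.py:
    print(word_tokenize(text, engine="nercut"))

    # Or call the engine module directly, restricting which named-entity
    # tags are merged into single tokens (taglist defaults to ORGANIZATION,
    # PERSON, PHONE, EMAIL, DATE and TIME):
    print(nercut.segment(text, taglist=["PERSON", "ORGANIZATION"]))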