From c742ded9a314377ffc30ca7e8c61075477829ea3 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 14 Apr 2021 16:52:13 +0700 Subject: [PATCH 1/5] Deprecated syllable_tokenize #322 syllable_tokenize is deprecated, use subword_tokenize instead --- pythainlp/tokenize/core.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 3fdd66e52..408e61ae7 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -4,6 +4,7 @@ """ import re from typing import Iterable, List, Union +import warnings from pythainlp.tokenize import ( DEFAULT_SENT_TOKENIZE_ENGINE, @@ -302,6 +303,8 @@ def subword_tokenize( * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) * *wangchanberta* - SentencePiece from wangchanberta model. + * *dict* (default) - newmm word tokenizer with a syllable dictionary + * *ssg* - CRF syllable segmenter for Thai :Example: @@ -346,19 +349,32 @@ def subword_tokenize( if not text or not isinstance(text, str): return [] + segments = [] + if engine == "tcc": from pythainlp.tokenize.tcc import segment elif engine == "etcc": from pythainlp.tokenize.etcc import segment elif engine == "wangchanberta": from pythainlp.wangchanberta import segment + elif engine == "dict": # use syllable dictionary + words = word_tokenize(text) + for word in words: + segments.extend( + word_tokenize( + text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE + ) + ) + elif engine == "ssg": + from pythainlp.tokenize.ssg import segment else: raise ValueError( f"""Tokenizer \"{engine}\" not found. It might be a typo; if not, please consult our document.""" ) - segments = segment(text) + if segments == []: + segments = segment(text) if not keep_whitespace: segments = [token.strip(" ") for token in segments if token.strip(" ")] @@ -374,6 +390,8 @@ def syllable_tokenize( """ Syllable tokenizer. + **syllable_tokenize is deprecated, use subword_tokenize instead** + Tokenizes text into syllable (Thai: พยางค์), a unit of pronunciation having one vowel sound. For example, the word 'รถไฟ' contains two syallbles including 'รถ', and 'ไฟ'. @@ -403,6 +421,10 @@ def syllable_tokenize( ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว', 'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า'] """ + warnings.warn( + "syllable_tokenize is deprecated, use subword_tokenize instead", + DeprecationWarning + ) if not text or not isinstance(text, str): return [] From 9d0453d7f85973e17cd92f492881d090195d4341 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 14 Apr 2021 16:55:56 +0700 Subject: [PATCH 2/5] Update core.py --- pythainlp/tokenize/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 408e61ae7..04c37410f 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -422,8 +422,8 @@ def syllable_tokenize( 'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า'] """ warnings.warn( - "syllable_tokenize is deprecated, use subword_tokenize instead", - DeprecationWarning + "syllable_tokenize will be deprecated in PyThaiNLP version 2.4, use subword_tokenize instead", + PendingDeprecationWarning ) if not text or not isinstance(text, str): From 92cefd3511ab922e62199e82cc594feafd7285d7 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 14 Apr 2021 16:56:39 +0700 Subject: [PATCH 3/5] Update core.py --- pythainlp/tokenize/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 04c37410f..f0a1fe2f5 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -422,7 +422,8 @@ def syllable_tokenize( 'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า'] """ warnings.warn( - "syllable_tokenize will be deprecated in PyThaiNLP version 2.4, use subword_tokenize instead", + """syllable_tokenize will be deprecated in PyThaiNLP version 2.4, + use subword_tokenize instead""", PendingDeprecationWarning ) From 2f396030cac2f767d1dbf0ffa087605c63012a26 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 23 Apr 2021 00:29:51 +0700 Subject: [PATCH 4/5] Update core.py --- pythainlp/tokenize/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index f0a1fe2f5..b43e7915e 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -303,7 +303,7 @@ def subword_tokenize( * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) * *wangchanberta* - SentencePiece from wangchanberta model. - * *dict* (default) - newmm word tokenizer with a syllable dictionary + * *dict* - newmm word tokenizer with a syllable dictionary * *ssg* - CRF syllable segmenter for Thai :Example: From 9bf184288cebd656b346d9b4d52f607406761a37 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 23 Apr 2021 00:31:59 +0700 Subject: [PATCH 5/5] Update test_tokenize.py --- tests/test_tokenize.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index d163238ce..398a3f322 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -300,6 +300,24 @@ def test_subword_tokenize(self): self.assertFalse( " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) ) + self.assertEqual( + subword_tokenize("สวัสดีชาวโลก", engine="dict"), ["สวัส", "ดี", "ชาว", "โลก"] + ) + self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict")) + self.assertEqual(subword_tokenize(None, engine="ssg"), []) + self.assertEqual(syllable_tokenize("", engine="ssg"), []) + self.assertEqual( + subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] + ) + self.assertTrue( + "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg") + ) + self.assertFalse( + " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) + ) with self.assertRaises(ValueError): subword_tokenize("นกแก้ว", engine="XX") # engine does not exist