diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index 2fd08ea85..67c11b3d6 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -12,6 +12,7 @@ Modules
 .. autofunction:: sent_tokenize
 .. autofunction:: paragraph_tokenize
 .. autofunction:: subword_tokenize
+.. autofunction:: syllable_tokenize
 .. autofunction:: word_tokenize
 .. autofunction:: word_detokenize
 .. autoclass:: Tokenizer
@@ -92,3 +93,7 @@ tcc+
 etcc
 ++++
 .. automodule:: pythainlp.tokenize.etcc
+
+han_solo
+++++++++
+.. automodule:: pythainlp.tokenize.han_solo
\ No newline at end of file
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 674153cc7..348b48957 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -23,6 +23,7 @@
     "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
+    "syllable_tokenize",
     "word_tokenize",
     "word_detokenize",
     "paragraph_tokenize",
@@ -34,7 +35,7 @@
 DEFAULT_WORD_TOKENIZE_ENGINE = "newmm"
 DEFAULT_SENT_TOKENIZE_ENGINE = "crfcut"
 DEFAULT_SUBWORD_TOKENIZE_ENGINE = "tcc"
-DEFAULT_SYLLABLE_TOKENIZE_ENGINE = "dict"
+DEFAULT_SYLLABLE_TOKENIZE_ENGINE = "han_solo"
 
 DEFAULT_WORD_DICT_TRIE = Trie(thai_words())
 DEFAULT_SYLLABLE_DICT_TRIE = Trie(thai_syllables())
@@ -45,6 +46,7 @@
     clause_tokenize,
     sent_tokenize,
     subword_tokenize,
+    syllable_tokenize,
     word_tokenize,
     word_detokenize,
     paragraph_tokenize,
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 39c59c706..23c08a472 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -525,8 +525,9 @@ def subword_tokenize(
 
     :param str text: text to be tokenized
    :param str engine: the name subword tokenizer
+    :param bool keep_whitespace: keep whitespace in the output
     :return: list of subwords
-    :rtype: list[str]
+    :rtype: List[str]
     **Options for engine**
         * *dict* - newmm word tokenizer with a syllable dictionary
         * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
@@ -622,6 +623,44 @@ def subword_tokenize(
     return segments
 
 
+def syllable_tokenize(
+    text: str,
+    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+    keep_whitespace: bool = True,
+) -> List[str]:
+    """
+    Syllable tokenizer
+
+    Tokenizes Thai text into syllables, which are
+    treated as inseparable units.
+
+    :param str text: text to be tokenized
+    :param str engine: the name of the syllable tokenizer
+    :param bool keep_whitespace: keep whitespace in the output
+    :return: list of syllables
+    :rtype: List[str]
+    **Options for engine**
+        * *dict* - newmm word tokenizer with a syllable dictionary
+        * *han_solo* - (default) CRF syllable segmenter for Thai that can \
+          handle the Thai social media domain. See `PyThaiNLP/Han-solo \
+          <https://github.com/PyThaiNLP/Han-solo>`_.
+        * *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
+          <https://github.com/ponrawee/ssg>`_.
+        * *tltk* - syllable tokenizer from tltk. See `tltk \
+          <https://pypi.org/project/tltk/>`_.
+    """
+    if engine not in ["dict", "han_solo", "ssg", "tltk"]:
+        raise ValueError(
+            f"""Tokenizer \"{engine}\" not found.
+            It might be a typo; if not, please consult our documentation."""
+        )
+    return subword_tokenize(
+        text=text,
+        engine=engine,
+        keep_whitespace=keep_whitespace
+    )
+
+
 class Tokenizer:
     """
     Tokenizer class, for a custom tokenizer.
diff --git a/pythainlp/tokenize/han_solo.py b/pythainlp/tokenize/han_solo.py
index 63053eca0..c17da83a1 100644
--- a/pythainlp/tokenize/han_solo.py
+++ b/pythainlp/tokenize/han_solo.py
@@ -14,6 +14,7 @@
 # limitations under the License.
""" 🪿 Han-solo: Thai syllable segmenter + GitHub: https://github.com/PyThaiNLP/Han-solo """ from typing import List diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 80a0daa28..750186328 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -16,6 +16,7 @@ sent_tokenize, ssg, subword_tokenize, + syllable_tokenize, tcc, tcc_p, word_tokenize, @@ -433,6 +434,11 @@ def test_subword_tokenize(self): with self.assertRaises(ValueError): subword_tokenize("นกแก้ว", engine="XX") # engine does not exist + def test_syllable_tokenize(self): + self.assertIsInstance(syllable_tokenize("โควิด19", engine="dict"), list) + with self.assertRaises(ValueError): + syllable_tokenize("นกแก้ว", engine="XX") # engine does not exist + def test_word_tokenize(self): self.assertEqual(word_tokenize(""), []) self.assertEqual(