PyThaiNLP · wannaphong · Aug 16, 2023 · Aug 14, 2023 · Aug 14, 2023
diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
@@ -12,6 +12,7 @@ Modules
 .. autofunction:: sent_tokenize
 .. autofunction:: paragraph_tokenize
 .. autofunction:: subword_tokenize
+.. autofunction:: syllable_tokenize
 .. autofunction:: word_tokenize
 .. autofunction:: word_detokenize
 .. autoclass:: Tokenizer
@@ -92,3 +93,7 @@ tcc+
 etcc
 ++++
 .. automodule:: pythainlp.tokenize.etcc
+
+han_solo
+++++++++
+.. automodule:: pythainlp.tokenize.han_solo
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
@@ -23,6 +23,7 @@
     "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
+    "syllable_tokenize",
     "word_tokenize",
     "word_detokenize",
     "paragraph_tokenize",
@@ -34,7 +35,7 @@
 DEFAULT_WORD_TOKENIZE_ENGINE = "newmm"
 DEFAULT_SENT_TOKENIZE_ENGINE = "crfcut"
 DEFAULT_SUBWORD_TOKENIZE_ENGINE = "tcc"
-DEFAULT_SYLLABLE_TOKENIZE_ENGINE = "dict"
+DEFAULT_SYLLABLE_TOKENIZE_ENGINE = "han_solo"
 
 DEFAULT_WORD_DICT_TRIE = Trie(thai_words())
 DEFAULT_SYLLABLE_DICT_TRIE = Trie(thai_syllables())
@@ -45,6 +46,7 @@
     clause_tokenize,
     sent_tokenize,
     subword_tokenize,
+    syllable_tokenize,
     word_tokenize,
     word_detokenize,
     paragraph_tokenize,

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
@@ -525,8 +525,9 @@ def subword_tokenize(
 
     :param str text: text to be tokenized
     :param str engine: the name subword tokenizer
+    :param bool keep_whitespace: keep whitespace
     :return: list of subwords
-    :rtype: list[str]
+    :rtype: List[str]
     **Options for engine**
         * *dict* - newmm word tokenizer with a syllable dictionary
         * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
@@ -622,6 +623,44 @@ def subword_tokenize(
     return segments
 
 
+def syllable_tokenize(
+    text: str,
+    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+    keep_whitespace: bool = True,
+) -> List[str]:
+    """
+    Syllable tokenizer
+
+    Tokenizes text into inseparable units of Thai syllables.
+
+    :param str text: text to be tokenized
+    :param str engine: the name of the syllable tokenizer
+    :param bool keep_whitespace: keep whitespace
+    :return: list of syllables
+    :rtype: List[str]
+    :raises ValueError: if ``engine`` is not one of the options below
+
+    **Options for engine**
+        * *dict* - newmm word tokenizer with a syllable dictionary
+        * *han_solo* - CRF syllable segmenter for Thai that can work in the \
+            Thai social media domain. See `PyThaiNLP/Han-solo \
+        <https://github.com/PyThaiNLP/Han-solo>`_.
+        * *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
+        <https://github.com/ponrawee/ssg>`_.
+        * *tltk* - syllable tokenizer from tltk. See `tltk \
+        <https://pypi.org/project/tltk/>`_.
+    """
+    # Reject subword_tokenize engines that are not syllable-level (e.g. etcc).
+    if engine not in ("dict", "han_solo", "ssg", "tltk"):
+        raise ValueError(
+            f"""Tokenizer \"{engine}\" not found.
+            It might be a typo; if not, please consult our document."""
+        )
+    return subword_tokenize(
+        text=text, engine=engine, keep_whitespace=keep_whitespace
+    )
+
+
 class Tokenizer:
     """
     Tokenizer class, for a custom tokenizer.

diff --git a/pythainlp/tokenize/han_solo.py b/pythainlp/tokenize/han_solo.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 🪿 Han-solo: Thai syllable segmenter
+
 GitHub: https://github.com/PyThaiNLP/Han-solo
 """
 from typing import List

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
@@ -16,6 +16,7 @@
     sent_tokenize,
     ssg,
     subword_tokenize,
+    syllable_tokenize,
     tcc,
     tcc_p,
     word_tokenize,
@@ -433,6 +434,11 @@ def test_subword_tokenize(self):
         with self.assertRaises(ValueError):
             subword_tokenize("นกแก้ว", engine="XX")  # engine does not exist
 
+    def test_syllable_tokenize(self):  # return type and engine validation
+        self.assertIsInstance(syllable_tokenize("โควิด19", engine="dict"), list)
+        with self.assertRaises(ValueError):
+            syllable_tokenize("นกแก้ว", engine="XX")  # "XX" is not a supported engine
+
     def test_word_tokenize(self):
         self.assertEqual(word_tokenize(""), [])
         self.assertEqual(