From c742ded9a314377ffc30ca7e8c61075477829ea3 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 14 Apr 2021 16:52:13 +0700
Subject: [PATCH 1/5] Deprecate syllable_tokenize (#322)

syllable_tokenize is deprecated; use subword_tokenize instead, which now
also accepts the "dict" and "ssg" engines.
---
 pythainlp/tokenize/core.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 3fdd66e52..408e61ae7 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -4,6 +4,7 @@
 """
 import re
 from typing import Iterable, List, Union
+import warnings
 
 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -302,6 +303,8 @@ def subword_tokenize(
         * *tcc* (default) -  Thai Character Cluster (Theeramunkong et al. 2000)
         * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
         * *wangchanberta* - SentencePiece from wangchanberta model.
+        * *dict* (default) - newmm word tokenizer with a syllable dictionary
+        * *ssg* - CRF syllable segmenter for Thai
 
     :Example:
 
@@ -346,19 +349,32 @@ def subword_tokenize(
     if not text or not isinstance(text, str):
         return []
 
+    segments = []
+
     if engine == "tcc":
         from pythainlp.tokenize.tcc import segment
     elif engine == "etcc":
         from pythainlp.tokenize.etcc import segment
     elif engine == "wangchanberta":
         from pythainlp.wangchanberta import segment
+    elif engine == "dict":  # use syllable dictionary
+        words = word_tokenize(text)
+        for word in words:
+            segments.extend(
+                word_tokenize(
+                    text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
+                )
+            )
+    elif engine == "ssg":
+        from pythainlp.tokenize.ssg import segment
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
             It might be a typo; if not, please consult our document."""
         )
 
-    segments = segment(text)
+    if segments == []:
+        segments = segment(text)
 
     if not keep_whitespace:
         segments = [token.strip(" ") for token in segments if token.strip(" ")]
@@ -374,6 +390,8 @@ def syllable_tokenize(
     """
     Syllable tokenizer.
 
+    **syllable_tokenize is deprecated, use subword_tokenize instead**
+
     Tokenizes text into syllable (Thai: พยางค์), a unit of
     pronunciation having one vowel sound.  For example, the word 'รถไฟ'
     contains two syallbles including 'รถ', and 'ไฟ'.
@@ -403,6 +421,10 @@ def syllable_tokenize(
         ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
         'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
     """
+    warnings.warn(
+        "syllable_tokenize is deprecated, use subword_tokenize instead",
+        DeprecationWarning
+    )
 
     if not text or not isinstance(text, str):
         return []

From 9d0453d7f85973e17cd92f492881d090195d4341 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 14 Apr 2021 16:55:56 +0700
Subject: [PATCH 2/5] Use PendingDeprecationWarning for syllable_tokenize

---
 pythainlp/tokenize/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 408e61ae7..04c37410f 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -422,8 +422,8 @@ def syllable_tokenize(
         'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
     """
     warnings.warn(
-        "syllable_tokenize is deprecated, use subword_tokenize instead",
-        DeprecationWarning
+        "syllable_tokenize will be deprecated in PyThaiNLP version 2.4, use subword_tokenize instead",
+        PendingDeprecationWarning
     )
 
     if not text or not isinstance(text, str):

From 92cefd3511ab922e62199e82cc594feafd7285d7 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 14 Apr 2021 16:56:39 +0700
Subject: [PATCH 3/5] Wrap syllable_tokenize deprecation message across two lines

---
 pythainlp/tokenize/core.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 04c37410f..f0a1fe2f5 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -422,7 +422,8 @@ def syllable_tokenize(
         'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
     """
     warnings.warn(
-        "syllable_tokenize will be deprecated in PyThaiNLP version 2.4, use subword_tokenize instead",
+        """syllable_tokenize will be deprecated in PyThaiNLP version 2.4,
+        use subword_tokenize instead""",
         PendingDeprecationWarning
     )
 

From 2f396030cac2f767d1dbf0ffa087605c63012a26 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 23 Apr 2021 00:29:51 +0700
Subject: [PATCH 4/5] Remove wrong "(default)" label from dict engine docs

---
 pythainlp/tokenize/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index f0a1fe2f5..b43e7915e 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -303,7 +303,7 @@ def subword_tokenize(
         * *tcc* (default) -  Thai Character Cluster (Theeramunkong et al. 2000)
         * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
         * *wangchanberta* - SentencePiece from wangchanberta model.
-        * *dict* (default) - newmm word tokenizer with a syllable dictionary
+        * *dict* - newmm word tokenizer with a syllable dictionary
         * *ssg* - CRF syllable segmenter for Thai
 
     :Example:

From 9bf184288cebd656b346d9b4d52f607406761a37 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 23 Apr 2021 00:31:59 +0700
Subject: [PATCH 5/5] Add dict and ssg engine tests for subword_tokenize

---
 tests/test_tokenize.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index d163238ce..398a3f322 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -300,6 +300,24 @@ def test_subword_tokenize(self):
         self.assertFalse(
             " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
         )
+        self.assertEqual(
+            subword_tokenize("สวัสดีชาวโลก", engine="dict"), ["สวัส", "ดี", "ชาว", "โลก"]
+        )
+        self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict"))
+        self.assertEqual(subword_tokenize(None, engine="ssg"), [])
+        self.assertEqual(syllable_tokenize("", engine="ssg"), [])
+        self.assertEqual(
+            subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
+        )
+        self.assertTrue(
+            "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
+        )
+        self.assertFalse(
+            "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
+        )
+        self.assertFalse(
+            " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
+        )
         with self.assertRaises(ValueError):
             subword_tokenize("นกแก้ว", engine="XX")  # engine does not exist