diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 708c5efdd..e19272989 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -36,12 +36,15 @@ from pythainlp.tokenize.tcc_p import tcc_pos
 
 # match non-Thai tokens
+# `|` acts like an "early return": the first alternative that matches wins,
+# so "abc123" is divided into "abc" and "123", for example.
 _PAT_NONTHAI = re.compile(
-    r"""(?x)
+r"""(?x)
 [-a-zA-Z]+|               # Latin characters
 \d+([,\.]\d+)*|           # numbers
 [ \t]+|                   # spaces
-\r?\n                     # newlines
+\r?\n|                    # newlines
+[^\u0E00-\u0E7F \t\r\n]+  # other non-Thai characters; matching stops at a space/newline
 """
 )
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index c224a675c..1537b62c9 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -653,6 +653,15 @@ def test_newmm(self):
                 keep_whitespace=False,
             )
         )
+        self.assertEqual(
+            word_tokenize("(คนไม่เอา)", engine="newmm"), ['(', 'คน', 'ไม่', 'เอา', ')']
+        )
+        self.assertEqual(
+            word_tokenize("กม/ชม", engine="newmm"), ['กม', '/', 'ชม']
+        )
+        self.assertEqual(
+            word_tokenize("สีหน้า(รถ)", engine="newmm"), ['สีหน้า', '(', 'รถ', ')']
+        )
 
     def test_newmm_longtext(self):
         self.assertIsInstance(
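
For reference, a minimal sketch (not part of the patch) of how the updated _PAT_NONTHAI behaves on its own: the pattern is copied from the "+" lines above, and the sample strings come from the new tests. It only uses the standard-library re module, so it can be run standalone.

import re

_PAT_NONTHAI = re.compile(
    r"""(?x)
[-a-zA-Z]+|               # Latin characters
\d+([,\.]\d+)*|           # numbers
[ \t]+|                   # spaces
\r?\n|                    # newlines
[^\u0E00-\u0E7F \t\r\n]+  # other non-Thai characters; matching stops at a space/newline
"""
)

# Show which spans the pattern picks out of mixed Thai / non-Thai input.
for text in ("abc123", "(คนไม่เอา)", "กม/ชม"):
    print(text, "->", [m.group() for m in _PAT_NONTHAI.finditer(text)])
# abc123     -> ['abc', '123']   first alternative wins, then the number rule
# (คนไม่เอา)  -> ['(', ')']       parentheses now match; Thai spans are left for newmm
# กม/ชม      -> ['/']            the slash becomes its own token

Note that without the new last alternative, characters such as "(" and "/" would fall through all branches and never match, which is why inputs like "กม/ชม" previously tokenized differently; the new tests pin down the corrected behavior.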