Skip to content
Merged
7 changes: 5 additions & 2 deletions pythainlp/tokenize/newmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,15 @@
from pythainlp.tokenize.tcc_p import tcc_pos

# Match non-Thai tokens.
# Alternatives are tried left to right and the first one that matches wins,
# so `|` works like an "early return": "abc123" is split into "abc" and "123"
# rather than being consumed whole by the catch-all alternative at the end.
_PAT_NONTHAI = re.compile(
    r"""(?x)
[-a-zA-Z]+|        # Latin characters
\d+([,\.]\d+)*|    # numbers
[ \t]+|            # spaces
\r?\n|             # newlines
[^\u0E00-\u0E7F \t\r\n]+  # other non-Thai characters, and stops matching until space/newline
"""
)

Expand Down
9 changes: 9 additions & 0 deletions tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,15 @@ def test_newmm(self):
keep_whitespace=False,
)
)
self.assertEqual(
word_tokenize("(คนไม่เอา)", engine="newmm"), ['(', 'คน', 'ไม่', 'เอา', ')']
)
self.assertEqual(
word_tokenize("กม/ชม", engine="newmm"), ['กม', '/', 'ชม']
)
self.assertEqual(
word_tokenize("สีหน้า(รถ)", engine="newmm"), ['สีหน้า', '(', 'รถ', ')']
)

def test_newmm_longtext(self):
self.assertIsInstance(
Expand Down