From abd9769935ebe2f2c50ae1d46b58a00fda8d5975 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Mon, 17 May 2021 15:52:15 +0700
Subject: [PATCH 1/9] fixed #461

---
 pythainlp/tokenize/multi_cut.py | 5 +++--
 pythainlp/tokenize/newmm.py     | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 724df3c2e..ff25621e0 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -38,8 +38,9 @@ def __init__(self, value, multi=None, in_dict=True):
 
 
 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+| # Latin
-\d[\d,\.]*| # number
+[-a-zA-Z]+| # Latin characters
+\d+\.{1,1}\d+| # float number
+\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 03c783cf8..b8b444b8c 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -26,7 +26,8 @@
 _PAT_NONTHAI = re.compile(
     r"""(?x)
 [-a-zA-Z]+| # Latin characters
-\d[\d,\.]*| # number
+\d+\.{1,1}\d+| # float number
+\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """

From 5b5952a482d4798bb7f086c78f3302a307b5145f Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Mon, 17 May 2021 16:12:38 +0700
Subject: [PATCH 2/9] Update rule

---
 pythainlp/tokenize/multi_cut.py | 4 ++--
 pythainlp/tokenize/newmm.py     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index ff25621e0..2bf27c37b 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -39,8 +39,8 @@ def __init__(self, value, multi=None, in_dict=True):
 
 _RE_NONTHAI = r"""(?x)
 [-a-zA-Z]+| # Latin characters
-\d+\.{1,1}\d+| # float number
-\d+| # number
+\d+([\,\.]\d+)*| # float number
+\d*| # number
 [ \t]+| # space
 \r?\n # newline
 """
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index b8b444b8c..4525a4d71 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -26,8 +26,8 @@
 _PAT_NONTHAI = re.compile(
     r"""(?x)
 [-a-zA-Z]+| # Latin characters
-\d+\.{1,1}\d+| # float number
-\d+| # number
+\d+([\,\.]\d+)*| # float number
+\d*| # number
 [ \t]+| # space
 \r?\n # newline
 """

From b0f1d5c2d4e6cf068700d63e5f0b07b19d0e7de0 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Mon, 17 May 2021 16:25:15 +0700
Subject: [PATCH 3/9] Update rule

---
 pythainlp/tokenize/multi_cut.py | 2 +-
 pythainlp/tokenize/newmm.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 2bf27c37b..4ce5ca1b5 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -40,7 +40,7 @@ def __init__(self, value, multi=None, in_dict=True):
 _RE_NONTHAI = r"""(?x)
 [-a-zA-Z]+| # Latin characters
 \d+([\,\.]\d+)*| # float number
-\d*| # number
+\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 4525a4d71..7fb385aba 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -27,7 +27,7 @@
     r"""(?x)
 [-a-zA-Z]+| # Latin characters
 \d+([\,\.]\d+)*| # float number
-\d*| # number
+\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """

From 39e594371dde0e5dac6f81b9a28562da7d85d810 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Mon, 17 May 2021 19:06:46 +0700
Subject: [PATCH 4/9] Update rule

---
 pythainlp/tokenize/multi_cut.py | 3 +--
 pythainlp/tokenize/newmm.py     | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 4ce5ca1b5..11148de83 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -39,8 +39,7 @@ def __init__(self, value, multi=None, in_dict=True):
 
 _RE_NONTHAI = r"""(?x)
 [-a-zA-Z]+| # Latin characters
-\d+([\,\.]\d+)*| # float number
-\d+| # number
+\d+([\,\.]\d+)*| # number
 [ \t]+| # space
 \r?\n # newline
 """
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 7fb385aba..a0f921b98 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -27,7 +27,6 @@
     r"""(?x)
 [-a-zA-Z]+| # Latin characters
 \d+([\,\.]\d+)*| # float number
-\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """

From 15e112d8072e9ad8891eb6e47788c9425be27f92 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 17 May 2021 22:48:15 +0700
Subject: [PATCH 5/9] Add type hintings

---
 pythainlp/tokenize/multi_cut.py | 59 +++++++++++++++++----------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 11148de83..5d69b5366 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Multi cut -- Thai word segmentation with maximum matching. The original source
-code is from Korakot Chaovavanich.
+Multi cut -- Thai word segmentation with maximum matching.
+Original code from Korakot Chaovavanich.
 
 :See Also:
     * `Facebook post \
@@ -12,16 +12,14 @@
 
 import re
 from collections import defaultdict
-from typing import List
+from typing import Iterator, List
 
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
 from pythainlp.util import Trie
 
 
 class LatticeString(str):
-    """
-    String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
-    """
+    """String that keeps possible tokenizations"""
 
     def __new__(cls, value, multi=None, in_dict=True):
         return str.__new__(cls, value)
@@ -34,22 +32,22 @@ def __init__(self, value, multi=None, in_dict=True):
             self.unique = False
         else:
             self.multi = [value]
-        self.in_dict = in_dict  # บอกว่าเป็นคำมีในดิกหรือเปล่า
+        self.in_dict = in_dict  # if in dictionary
 
 
 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+| # Latin characters
-\d+([\,\.]\d+)*| # number
-[ \t]+| # space
-\r?\n # newline
+[-a-zA-Z]+|      # Latin characters
+\d+([,\.]\d+)*|  # number
+[ \t]+|          # space
+\r?\n            # newline
 """
 _PAT_NONTHAI = re.compile(_RE_NONTHAI)
 
 
-def _multicut(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE):
-    """
-    ส่งคืน LatticeString คืนมาเป็นก้อนๆ
-    """
+def _multicut(
+    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
+) -> Iterator[LatticeString]:
+    """Return LatticeString"""
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
 
@@ -100,7 +98,7 @@ def serialize(p, p2):  # helper function
             q.add(i)
 
 
-def mmcut(text: str):
+def mmcut(text: str) -> List[str]:
     res = []
     for w in _multicut(text):
         mm = min(w.multi, key=lambda x: x.count("/"))
@@ -108,7 +106,7 @@ def mmcut(text: str):
     return res
 
 
-def _combine(ww: str):
+def _combine(ww: List[LatticeString]) -> Iterator[str]:
     if ww == []:
         yield ""
     else:
@@ -124,12 +122,14 @@ def _combine(ww: str):
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Dictionary-based maximum matching word segmentation.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :return: list of words, tokenized from the text
+    """Dictionary-based maximum matching word segmentation.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segmented tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
@@ -140,11 +140,14 @@ def segment(
 def find_all_segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Get all possible segment variations
-
-    :param str text: input string to be tokenized
-    :return: returns list of segment variations
+    """Get all possible segment variations.
+
+    :param text: input string to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segment variations
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []

From 5a7977fee989182de9c5f3a032a78b31a8f76134 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 17 May 2021 23:06:13 +0700
Subject: [PATCH 6/9] \, -> ,

---
 pythainlp/tokenize/newmm.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index a0f921b98..92cceb842 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -25,10 +25,10 @@
 
 # match non-Thai tokens
 _PAT_NONTHAI = re.compile(
     r"""(?x)
-[-a-zA-Z]+| # Latin characters
-\d+([\,\.]\d+)*| # float number
-[ \t]+| # space
-\r?\n # newline
+[-a-zA-Z]+|      # Latin characters
+\d+([,\.]\d+)*|  # number
+[ \t]+|          # space
+\r?\n            # newline
 """
 )
@@ -138,16 +138,23 @@ def segment(
     text: str,
     custom_dict: Trie = DEFAULT_WORD_DICT_TRIE,
     safe_mode: bool = False,
 ) -> List[str]:
-    """
-    Dictionary-based maximal matching word segmentation, constrained with
-    Thai Character Cluster boundaries.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :param bool safe_mode: True to help avoid long wait for text with long\
-        and continuous ambiguous breaking points. Long wait may still able\
-        to occur. Default is False.
-    :return: list of words, tokenized from the text
+    """Maximal-matching word segmentation, Thai Character Cluster constrained.
+
+    A dictionary-based word segmentation using maximal matching algorithm,
+    constrainted to Thai Character Cluster boundaries.
+
+    A custom dictionary can be supplied.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :param safe_mode: reduce chance for long processing time in long text\
+        with many ambiguous breaking points, defaults to False
+    :type safe_mode: bool, optional
+    :return: list of tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []

From e2fe494872c07581e82503d1c1b201b59b0acef3 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 17 May 2021 23:07:19 +0700
Subject: [PATCH 7/9] Update newmm.py

---
 pythainlp/tokenize/newmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 92cceb842..2d81032bb 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -141,7 +141,7 @@ def segment(
     """Maximal-matching word segmentation, Thai Character Cluster constrained.
 
     A dictionary-based word segmentation using maximal matching algorithm,
-    constrainted to Thai Character Cluster boundaries.
+    constrained to Thai Character Cluster boundaries.
 
     A custom dictionary can be supplied.
 

From ee776da1f6d870b07d9728974e1694a1f27518a7 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 17 May 2021 23:08:15 +0700
Subject: [PATCH 8/9] Update multi_cut.py

---
 pythainlp/tokenize/multi_cut.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 5d69b5366..8f2df023a 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -126,7 +126,8 @@ def segment(
 
     :param text: text to be tokenized
     :type text: str
-    :param custom_dict: tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
     :type custom_dict: Trie, optional
     :return: list of segmented tokens
     :rtype: List[str]
@@ -144,7 +145,8 @@ def find_all_segment(
 
     :param text: input string to be tokenized
     :type text: str
-    :param custom_dict: tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
     :type custom_dict: Trie, optional
     :return: list of segment variations
     :rtype: List[str]

From 86eae1c650bdba4849f250d291989f6224ff6115 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 20 May 2021 16:14:30 +0700
Subject: [PATCH 9/9] Add test cases for word_tokenize (newmm, mm)

---
 tests/test_tokenize.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 398a3f322..ddd766487 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -450,6 +450,26 @@ def test_mm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="mm"),
+            ['19', '...'],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="mm"),
+            ['19', '.'],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="mm"),
+            ['19.84'],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="mm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="mm"),
+            ['USD', '1,984.42'],
+        )
 
         self.assertIsNotNone(multi_cut.mmcut("ทดสอบ"))
 
@@ -465,6 +485,26 @@ def test_newmm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="newmm"),
+            ['19', '...'],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="newmm"),
+            ['19', '.'],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="newmm"),
+            ['19.84'],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="newmm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="newmm"),
+            ['USD', '1,984.42'],
+        )
         self.assertEqual(
             word_tokenize(
                 "สวัสดีครับ สบายดีไหมครับ",
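A note for readers following the series: the number rule these patches converge on, \d+([,\.]\d+)*, keeps decimal points and thousands separators inside a single number token, which is exactly what the new assertions in PATCH 9 exercise. Below is a minimal standalone sketch of that behavior. The compiled pattern is taken from newmm.py as of PATCH 6; the driver loop and its output format are illustrative only and not part of the patches (the input strings come from the PATCH 9 test cases):

import re

# Non-Thai token pattern as of PATCH 6. A comma or dot is kept inside a
# number token only when more digits follow, so a trailing "." or "..."
# is not consumed by this branch.
_PAT_NONTHAI = re.compile(
    r"""(?x)
[-a-zA-Z]+|      # Latin characters
\d+([,\.]\d+)*|  # number
[ \t]+|          # space
\r?\n            # newline
"""
)

for s in ["19...", "19.", "19.84", "127.0.0.1", "USD1,984.42"]:
    # finditer() simply skips characters that no branch matches (such as
    # the trailing dots); in the tokenizers those spans fall through to
    # the dictionary/TCC stages, which emit them as separate tokens.
    print(repr(s), [m.group() for m in _PAT_NONTHAI.finditer(s)])

Run directly, this prints "19.84", "127.0.0.1", and "1,984.42" as single tokens, while "19." and "19..." yield only "19" from this pattern, consistent with the word_tokenize assertions added above.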