From abd9769935ebe2f2c50ae1d46b58a00fda8d5975 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Mon, 17 May 2021 15:52:15 +0700
Subject: [PATCH 1/9] fixed #461

---
 pythainlp/tokenize/multi_cut.py | 5 +++--
 pythainlp/tokenize/newmm.py     | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 724df3c2e..ff25621e0 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -38,8 +38,9 @@ def __init__(self, value, multi=None, in_dict=True):
 
 
 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+| # Latin
-\d[\d,\.]*| # number
+[-a-zA-Z]+| # Latin characters
+\d+\.{1,1}\d+| # float number
+\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 03c783cf8..b8b444b8c 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -26,7 +26,8 @@
 _PAT_NONTHAI = re.compile(
     r"""(?x)
 [-a-zA-Z]+| # Latin characters
-\d[\d,\.]*| # number
+\d+\.{1,1}\d+| # float number
+\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """

From 5b5952a482d4798bb7f086c78f3302a307b5145f Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Mon, 17 May 2021 16:12:38 +0700
Subject: [PATCH 2/9] Update rule

---
 pythainlp/tokenize/multi_cut.py | 4 ++--
 pythainlp/tokenize/newmm.py     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index ff25621e0..2bf27c37b 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -39,8 +39,8 @@ def __init__(self, value, multi=None, in_dict=True):
 
 _RE_NONTHAI = r"""(?x)
 [-a-zA-Z]+| # Latin characters
-\d+\.{1,1}\d+| # float number
-\d+| # number
+\d+([\,\.]\d+)*| # float number
+\d*| # number
 [ \t]+| # space
 \r?\n # newline
 """
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index b8b444b8c..4525a4d71 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -26,8 +26,8 @@
 _PAT_NONTHAI = re.compile(
     r"""(?x)
 [-a-zA-Z]+| # Latin characters
-\d+\.{1,1}\d+| # float number
-\d+| # number
+\d+([\,\.]\d+)*| # float number
+\d*| # number
 [ \t]+| # space
 \r?\n # newline
 """

From b0f1d5c2d4e6cf068700d63e5f0b07b19d0e7de0 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Mon, 17 May 2021 16:25:15 +0700
Subject: [PATCH 3/9] Update rule

---
 pythainlp/tokenize/multi_cut.py | 2 +-
 pythainlp/tokenize/newmm.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 2bf27c37b..4ce5ca1b5 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -40,7 +40,7 @@ def __init__(self, value, multi=None, in_dict=True):
 _RE_NONTHAI = r"""(?x)
 [-a-zA-Z]+| # Latin characters
 \d+([\,\.]\d+)*| # float number
-\d*| # number
+\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 4525a4d71..7fb385aba 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -27,7 +27,7 @@
     r"""(?x)
 [-a-zA-Z]+| # Latin characters
 \d+([\,\.]\d+)*| # float number
-\d*| # number
+\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """

From 39e594371dde0e5dac6f81b9a28562da7d85d810 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Mon, 17 May 2021 19:06:46 +0700
Subject: [PATCH 4/9] Update rule

---
 pythainlp/tokenize/multi_cut.py | 3 +--
 pythainlp/tokenize/newmm.py     | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 4ce5ca1b5..11148de83 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -39,8 +39,7 @@ def __init__(self, value, multi=None, in_dict=True):
 
 _RE_NONTHAI = r"""(?x)
 [-a-zA-Z]+| # Latin characters
-\d+([\,\.]\d+)*| # float number
-\d+| # number
+\d+([\,\.]\d+)*| # number
 [ \t]+| # space
 \r?\n # newline
 """
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 7fb385aba..a0f921b98 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -27,7 +27,6 @@
     r"""(?x)
 [-a-zA-Z]+| # Latin characters
 \d+([\,\.]\d+)*| # float number
-\d+| # number
 [ \t]+| # space
 \r?\n # newline
 """

From 15e112d8072e9ad8891eb6e47788c9425be27f92 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 17 May 2021 22:48:15 +0700
Subject: [PATCH 5/9] Add type hintings

---
 pythainlp/tokenize/multi_cut.py | 59 +++++++++++++++++----------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 11148de83..5d69b5366 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Multi cut -- Thai word segmentation with maximum matching. The original source
-code is from Korakot Chaovavanich.
+Multi cut -- Thai word segmentation with maximum matching.
+Original code from Korakot Chaovavanich.
 
 :See Also:
     * `Facebook post \
@@ -12,16 +12,14 @@
 
 import re
 from collections import defaultdict
-from typing import List
+from typing import Iterator, List
 
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
 from pythainlp.util import Trie
 
 
 class LatticeString(str):
-    """
-    String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
-    """
+    """String that keeps possible tokenizations"""
 
     def __new__(cls, value, multi=None, in_dict=True):
         return str.__new__(cls, value)
@@ -34,22 +32,22 @@ def __init__(self, value, multi=None, in_dict=True):
             self.unique = False
         else:
             self.multi = [value]
-        self.in_dict = in_dict  # บอกว่าเป็นคำมีในดิกหรือเปล่า
+        self.in_dict = in_dict  # if in dictionary
 
 
 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+| # Latin characters
-\d+([\,\.]\d+)*| # number
-[ \t]+| # space
-\r?\n # newline
+[-a-zA-Z]+|      # Latin characters
+\d+([,\.]\d+)*|  # number
+[ \t]+|          # space
+\r?\n            # newline
 """
 _PAT_NONTHAI = re.compile(_RE_NONTHAI)
 
 
-def _multicut(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE):
-    """
-    ส่งคืน LatticeString คืนมาเป็นก้อนๆ
-    """
+def _multicut(
+    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
+) -> Iterator[LatticeString]:
+    """Return LatticeString"""
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
 
@@ -100,7 +98,7 @@ def serialize(p, p2):  # helper function
             q.add(i)
 
 
-def mmcut(text: str):
+def mmcut(text: str) -> List[str]:
     res = []
     for w in _multicut(text):
         mm = min(w.multi, key=lambda x: x.count("/"))
@@ -108,7 +106,7 @@ def mmcut(text: str):
     return res
 
 
-def _combine(ww: str):
+def _combine(ww: List[LatticeString]) -> Iterator[str]:
     if ww == []:
         yield ""
     else:
@@ -124,12 +122,14 @@ def _combine(ww: str):
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Dictionary-based maximum matching word segmentation.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :return: list of words, tokenized from the text
+    """Dictionary-based maximum matching word segmentation.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segmented tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
@@ -140,11 +140,14 @@ def segment(
 def find_all_segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Get all possible segment variations
-
-    :param str text: input string to be tokenized
-    :return: returns list of segment variations
+    """Get all possible segment variations.
+
+    :param text: input string to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segment variations
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []

From 5a7977fee989182de9c5f3a032a78b31a8f76134 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 17 May 2021 23:06:13 +0700
Subject: [PATCH 6/9] \, -> ,

---
 pythainlp/tokenize/newmm.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index a0f921b98..92cceb842 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -25,10 +25,10 @@
 
 # match non-Thai tokens
 _PAT_NONTHAI = re.compile(
     r"""(?x)
-[-a-zA-Z]+| # Latin characters
-\d+([\,\.]\d+)*| # float number
-[ \t]+| # space
-\r?\n # newline
+[-a-zA-Z]+|      # Latin characters
+\d+([,\.]\d+)*|  # number
+[ \t]+|          # space
+\r?\n            # newline
 """
 )
@@ -138,16 +138,23 @@ def segment(
     text: str,
     custom_dict: Trie = DEFAULT_WORD_DICT_TRIE,
     safe_mode: bool = False,
 ) -> List[str]:
-    """
-    Dictionary-based maximal matching word segmentation, constrained with
-    Thai Character Cluster boundaries.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :param bool safe_mode: True to help avoid long wait for text with long\
-        and continuous ambiguous breaking points. Long wait may still able\
-        to occur. Default is False.
-    :return: list of words, tokenized from the text
+    """Maximal-matching word segmentation, Thai Character Cluster constrained.
+
+    A dictionary-based word segmentation using maximal matching algorithm,
+    constrainted to Thai Character Cluster boundaries.
+
+    A custom dictionary can be supplied.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :param safe_mode: reduce chance for long processing time in long text\
+        with many ambiguous breaking points, defaults to False
+    :type safe_mode: bool, optional
+    :return: list of tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []

From e2fe494872c07581e82503d1c1b201b59b0acef3 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 17 May 2021 23:07:19 +0700
Subject: [PATCH 7/9] Update newmm.py

---
 pythainlp/tokenize/newmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 92cceb842..2d81032bb 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -141,7 +141,7 @@ def segment(
     """Maximal-matching word segmentation, Thai Character Cluster constrained.
 
     A dictionary-based word segmentation using maximal matching algorithm,
-    constrainted to Thai Character Cluster boundaries.
+    constrained to Thai Character Cluster boundaries.
 
     A custom dictionary can be supplied.
 

From ee776da1f6d870b07d9728974e1694a1f27518a7 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 17 May 2021 23:08:15 +0700
Subject: [PATCH 8/9] Update multi_cut.py

---
 pythainlp/tokenize/multi_cut.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 5d69b5366..8f2df023a 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -126,7 +126,8 @@ def segment(
 
     :param text: text to be tokenized
     :type text: str
-    :param custom_dict: tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
     :type custom_dict: Trie, optional
     :return: list of segmented tokens
     :rtype: List[str]
@@ -144,7 +145,8 @@ def find_all_segment(
 
     :param text: input string to be tokenized
     :type text: str
-    :param custom_dict: tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
     :type custom_dict: Trie, optional
     :return: list of segment variations
     :rtype: List[str]

From 86eae1c650bdba4849f250d291989f6224ff6115 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Thu, 20 May 2021 16:14:30 +0700
Subject: [PATCH 9/9] Add test cases for word_tokenize (newmm, mm)

---
 tests/test_tokenize.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 398a3f322..ddd766487 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -450,6 +450,26 @@ def test_mm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="mm"),
+            ['19', '...'],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="mm"),
+            ['19', '.'],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="mm"),
+            ['19.84'],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="mm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="mm"),
+            ['USD', '1,984.42'],
+        )
 
         self.assertIsNotNone(multi_cut.mmcut("ทดสอบ"))
 
@@ -465,6 +485,26 @@ def test_newmm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="newmm"),
+            ['19', '...'],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="newmm"),
+            ['19', '.'],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="newmm"),
+            ['19.84'],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="newmm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="newmm"),
+            ['USD', '1,984.42'],
+        )
         self.assertEqual(
             word_tokenize(
                 "สวัสดีครับ สบายดีไหมครับ",
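A note for readers following the series: the number rule these patches converge on, \d+([,\.]\d+)*, keeps decimal points and thousands separators inside a single number token, which is exactly what the new assertions in PATCH 9 exercise. Below is a minimal standalone sketch of that behavior. The compiled pattern is taken from newmm.py as of PATCH 6; the driver loop and its output format are illustrative only and not part of the patches (the input strings come from the PATCH 9 test cases):

import re

# Non-Thai token pattern as of PATCH 6. A comma or dot is kept inside a
# number token only when more digits follow, so a trailing "." or "..."
# is not consumed by this branch.
_PAT_NONTHAI = re.compile(
    r"""(?x)
[-a-zA-Z]+|      # Latin characters
\d+([,\.]\d+)*|  # number
[ \t]+|          # space
\r?\n            # newline
"""
)

for s in ["19...", "19.", "19.84", "127.0.0.1", "USD1,984.42"]:
    # finditer() simply skips characters that no branch matches (such as
    # the trailing dots); in the tokenizers those spans fall through to
    # the dictionary/TCC stages, which emit them as separate tokens.
    print(repr(s), [m.group() for m in _PAT_NONTHAI.finditer(s)])

Run directly, this prints "19.84", "127.0.0.1", and "1,984.42" as single tokens, while "19." and "19..." yield only "19" from this pattern, consistent with the word_tokenize assertions added above.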