diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 724df3c2e..8f2df023a 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Multi cut -- Thai word segmentation with maximum matching. The original source
-code is from Korakot Chaovavanich.
+Multi cut -- Thai word segmentation with maximum matching.
+Original code from Korakot Chaovavanich.
 
 :See Also:
     * `Facebook post \
@@ -12,16 +12,14 @@
 
 import re
 from collections import defaultdict
-from typing import List
+from typing import Iterator, List
 
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
 from pythainlp.util import Trie
 
 
 class LatticeString(str):
-    """
-    String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
-    """
+    """String that keeps possible tokenizations"""
 
     def __new__(cls, value, multi=None, in_dict=True):
         return str.__new__(cls, value)
@@ -34,22 +32,22 @@ def __init__(self, value, multi=None, in_dict=True):
                 self.unique = False
         else:
             self.multi = [value]
-        self.in_dict = in_dict  # บอกว่าเป็นคำมีในดิกหรือเปล่า
+        self.in_dict = in_dict  # if in dictionary
 
 
 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+|        # Latin
-\d[\d,\.]*|        # number
-[ \t]+|            # space
-\r?\n              # newline
+[-a-zA-Z]+|        # Latin characters
+\d+([,\.]\d+)*|    # number
+[ \t]+|            # space
+\r?\n              # newline
 """
 _PAT_NONTHAI = re.compile(_RE_NONTHAI)
 
 
-def _multicut(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE):
-    """
-    ส่งคืน LatticeString คืนมาเป็นก้อนๆ
-    """
+def _multicut(
+    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
+) -> Iterator[LatticeString]:
+    """Return LatticeString"""
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
 
@@ -100,7 +98,7 @@ def serialize(p, p2):  # helper function
             q.add(i)
 
 
-def mmcut(text: str):
+def mmcut(text: str) -> List[str]:
     res = []
     for w in _multicut(text):
         mm = min(w.multi, key=lambda x: x.count("/"))
@@ -108,7 +106,7 @@ def mmcut(text: str):
     return res
 
 
-def _combine(ww: str):
+def _combine(ww: List[LatticeString]) -> Iterator[str]:
     if ww == []:
         yield ""
     else:
@@ -124,12 +122,15 @@ def _combine(ww: str):
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Dictionary-based maximum matching word segmentation.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :return: list of words, tokenized from the text
+    """Dictionary-based maximum matching word segmentation.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segmented tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
@@ -140,11 +141,15 @@ def segment(
 def find_all_segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Get all possible segment variations
-
-    :param str text: input string to be tokenized
-    :return: returns list of segment variations
+    """Get all possible segment variations.
+
+    :param text: input string to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segment variations
+    :rtype: List[str]
     """
    if not text or not isinstance(text, str):
        return []
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 03c783cf8..2d81032bb 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -25,10 +25,10 @@
 # match non-Thai tokens
 _PAT_NONTHAI = re.compile(
     r"""(?x)
-[-a-zA-Z]+|        # Latin characters
-\d[\d,\.]*|        # number
-[ \t]+|            # space
-\r?\n              # newline
+[-a-zA-Z]+|        # Latin characters
+\d+([,\.]\d+)*|    # number
+[ \t]+|            # space
+\r?\n              # newline
 """
 )
 
@@ -138,16 +138,23 @@ def segment(
     custom_dict: Trie = DEFAULT_WORD_DICT_TRIE,
     safe_mode: bool = False,
 ) -> List[str]:
-    """
-    Dictionary-based maximal matching word segmentation, constrained with
-    Thai Character Cluster boundaries.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :param bool safe_mode: True to help avoid long wait for text with long\
-        and continuous ambiguous breaking points. Long wait may still able\
-        to occur. Default is False.
-    :return: list of words, tokenized from the text
+    """Maximal-matching word segmentation, Thai Character Cluster constrained.
+
+    A dictionary-based word segmentation using maximal matching algorithm,
+    constrained to Thai Character Cluster boundaries.
+
+    A custom dictionary can be supplied.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :param safe_mode: reduce chance for long processing time in long text\
+        with many ambiguous breaking points, defaults to False
+    :type safe_mode: bool, optional
+    :return: list of tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 398a3f322..ddd766487 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -450,6 +450,26 @@ def test_mm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="mm"),
+            ["19", "..."],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="mm"),
+            ["19", "."],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="mm"),
+            ["19.84"],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="mm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="mm"),
+            ["USD", "1,984.42"],
+        )
 
         self.assertIsNotNone(multi_cut.mmcut("ทดสอบ"))
 
@@ -465,6 +485,26 @@ def test_newmm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="newmm"),
+            ["19", "..."],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="newmm"),
+            ["19", "."],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="newmm"),
+            ["19.84"],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="newmm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="newmm"),
+            ["USD", "1,984.42"],
+        )
         self.assertEqual(
             word_tokenize(
                 "สวัสดีครับ สบายดีไหมครับ",
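Reviewer note (not part of the patch): a minimal sketch of what the revised number
alternative in _RE_NONTHAI / _PAT_NONTHAI changes, assuming only the two regex
fragments shown in the hunks above; the _OLD_NUM and _NEW_NUM names are
illustrative only, not identifiers from the codebase.

    import re

    # Old alternative: a digit followed by any mix of digits, commas, and periods,
    # so a trailing "." or "..." gets absorbed into the number token.
    _OLD_NUM = re.compile(r"\d[\d,\.]*")
    # New alternative: each separator must be followed by digits, so a trailing
    # "." or "," is left for the next token -- consistent with the added tests.
    _NEW_NUM = re.compile(r"\d+([,\.]\d+)*")

    for s in ["19...", "19.", "19.84", "127.0.0.1", "1,984.42"]:
        print(s, "->", _OLD_NUM.match(s).group(), "|", _NEW_NUM.match(s).group())
    # 19...     -> 19...      | 19
    # 19.       -> 19.        | 19
    # 19.84     -> 19.84      | 19.84
    # 127.0.0.1 -> 127.0.0.1  | 127.0.0.1
    # 1,984.42  -> 1,984.42   | 1,984.42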