Merged
61 changes: 33 additions & 28 deletions pythainlp/tokenize/multi_cut.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""
Multi cut -- Thai word segmentation with maximum matching. The original source
code is from Korakot Chaovavanich.
Multi cut -- Thai word segmentation with maximum matching.
Original code from Korakot Chaovavanich.

:See Also:
* `Facebook post \
@@ -12,16 +12,14 @@

import re
from collections import defaultdict
from typing import List
from typing import Iterator, List

from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie


class LatticeString(str):
"""
String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
"""
"""String that keeps possible tokenizations"""

def __new__(cls, value, multi=None, in_dict=True):
return str.__new__(cls, value)
@@ -34,22 +32,22 @@ def __init__(self, value, multi=None, in_dict=True):
self.unique = False
else:
self.multi = [value]
self.in_dict = in_dict # บอกว่าเป็นคำมีในดิกหรือเปล่า
self.in_dict = in_dict # if in dictionary


_RE_NONTHAI = r"""(?x)
[-a-zA-Z]+| # Latin
\d[\d,\.]*| # number
[ \t]+| # space
\r?\n # newline
[-a-zA-Z]+| # Latin characters
\d+([,\.]\d+)*| # number
[ \t]+| # space
\r?\n # newline
"""
_PAT_NONTHAI = re.compile(_RE_NONTHAI)


def _multicut(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE):
"""
ส่งคืน LatticeString คืนมาเป็นก้อนๆ
"""
def _multicut(
text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> Iterator[LatticeString]:
"""Return LatticeString"""
if not custom_dict:
custom_dict = DEFAULT_WORD_DICT_TRIE

@@ -100,15 +98,15 @@ def serialize(p, p2):  # helper function
q.add(i)


def mmcut(text: str):
def mmcut(text: str) -> List[str]:
res = []
for w in _multicut(text):
mm = min(w.multi, key=lambda x: x.count("/"))
res.extend(mm.split("/"))
return res


def _combine(ww: str):
def _combine(ww: List[LatticeString]) -> Iterator[str]:
if ww == []:
yield ""
else:
@@ -124,12 +122,15 @@ def _combine(ww: str):
def segment(
text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> List[str]:
"""
Dictionary-based maximum matching word segmentation.

:param str text: text to be tokenized to words
:param pythainlp.util.Trie custom_dict: dictionary for tokenization
:return: list of words, tokenized from the text
"""Dictionary-based maximum matching word segmentation.

:param text: text to be tokenized
:type text: str
:param custom_dict: tokenization dictionary,\
defaults to DEFAULT_WORD_DICT_TRIE
:type custom_dict: Trie, optional
:return: list of segmented tokens
:rtype: List[str]
"""
if not text or not isinstance(text, str):
return []
@@ -140,11 +141,15 @@ def segment(
def find_all_segment(
text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> List[str]:
"""
Get all possible segment variations

:param str text: input string to be tokenized
:return: returns list of segment variations
"""Get all possible segment variations.

:param text: input string to be tokenized
:type text: str
:param custom_dict: tokenization dictionary,\
defaults to DEFAULT_WORD_DICT_TRIE
:type custom_dict: Trie, optional
:return: list of segment variations
:rtype: List[str]
"""
if not text or not isinstance(text, str):
return []
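The substantive change in this file is the tightened non-Thai number pattern: the old `\d[\d,\.]*` also swallowed trailing dots and commas, while the new `\d+([,\.]\d+)*` only accepts a separator when more digits follow. A minimal sketch of the difference, using only the standard library (the `OLD`/`NEW` names are illustrative, not part of the PR):

```python
import re

OLD = re.compile(r"\d[\d,\.]*")      # digits followed by any mix of digits, commas, dots
NEW = re.compile(r"\d+([,\.]\d+)*")  # digits, with , or . only when more digits follow

for text in ["19.", "19...", "127.0.0.1", "1,984.42"]:
    print(text, OLD.match(text).group(), NEW.match(text).group())
# 19.        -> old: "19."        new: "19"
# 19...      -> old: "19..."      new: "19"
# 127.0.0.1  -> old: "127.0.0.1"  new: "127.0.0.1"
# 1,984.42   -> old: "1,984.42"   new: "1,984.42"
```

This is what lets "19..." split into "19" and "..." while "127.0.0.1" and "1,984.42" stay whole, as the new tests below assert.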
35 changes: 21 additions & 14 deletions pythainlp/tokenize/newmm.py
@@ -25,10 +25,10 @@
# match non-Thai tokens
_PAT_NONTHAI = re.compile(
r"""(?x)
[-a-zA-Z]+| # Latin characters
\d[\d,\.]*| # number
[ \t]+| # space
\r?\n # newline
[-a-zA-Z]+| # Latin characters
\d+([,\.]\d+)*| # number
[ \t]+| # space
\r?\n # newline
"""
)

@@ -138,16 +138,23 @@ def segment(
custom_dict: Trie = DEFAULT_WORD_DICT_TRIE,
safe_mode: bool = False,
) -> List[str]:
"""
Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster boundaries.

:param str text: text to be tokenized to words
:param pythainlp.util.Trie custom_dict: dictionary for tokenization
:param bool safe_mode: True to help avoid long wait for text with long\
and continuous ambiguous breaking points. Long wait may still able\
to occur. Default is False.
:return: list of words, tokenized from the text
"""Maximal-matching word segmentation, Thai Character Cluster constrained.

A dictionary-based word segmentation using maximal matching algorithm,
constrained to Thai Character Cluster boundaries.

A custom dictionary can be supplied.

:param text: text to be tokenized
:type text: str
:param custom_dict: tokenization dictionary,\
defaults to DEFAULT_WORD_DICT_TRIE
:type custom_dict: Trie, optional
:param safe_mode: reduce chance for long processing time in long text\
with many ambiguous breaking points, defaults to False
:type safe_mode: bool, optional
:return: list of tokens
:rtype: List[str]
"""
if not text or not isinstance(text, str):
return []
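With the `segment()` docstring now spelling out `custom_dict` and `safe_mode`, here is a small usage sketch (assuming `pythainlp` is importable; the sample strings are illustrative and the expected output follows the new tests):

```python
from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE, newmm

# The updated _PAT_NONTHAI keeps well-formed numbers together
print(newmm.segment("USD1,984.42"))  # expected per the new tests: ['USD', '1,984.42']

# safe_mode=False is the default; True reduces the chance of long processing
# time on long text with many ambiguous breaking points
print(newmm.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย",
                    DEFAULT_WORD_DICT_TRIE, safe_mode=True))
```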
40 changes: 40 additions & 0 deletions tests/test_tokenize.py
@@ -450,6 +450,26 @@ def test_mm(self):
word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
)
self.assertEqual(
word_tokenize("19...", engine="mm"),
['19', '...'],
)
self.assertEqual(
word_tokenize("19.", engine="mm"),
['19', '.'],
)
self.assertEqual(
word_tokenize("19.84", engine="mm"),
['19.84'],
)
self.assertEqual(
word_tokenize("127.0.0.1", engine="mm"),
["127.0.0.1"],
)
self.assertEqual(
word_tokenize("USD1,984.42", engine="mm"),
['USD', '1,984.42'],
)

self.assertIsNotNone(multi_cut.mmcut("ทดสอบ"))

@@ -465,6 +485,26 @@ def test_newmm(self):
word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
)
self.assertEqual(
word_tokenize("19...", engine="newmm"),
['19', '...'],
)
self.assertEqual(
word_tokenize("19.", engine="newmm"),
['19', '.'],
)
self.assertEqual(
word_tokenize("19.84", engine="newmm"),
['19.84'],
)
self.assertEqual(
word_tokenize("127.0.0.1", engine="newmm"),
["127.0.0.1"],
)
self.assertEqual(
word_tokenize("USD1,984.42", engine="newmm"),
['USD', '1,984.42'],
)
self.assertEqual(
word_tokenize(
"สวัสดีครับ สบายดีไหมครับ",
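The `mm` engine helpers exercised by these tests can also be called directly; a short sketch, with outputs following the expectations the tests above encode:

```python
from pythainlp.tokenize import multi_cut

print(multi_cut.segment("127.0.0.1"))       # ['127.0.0.1'] with the new number pattern
print(multi_cut.mmcut("ทดสอบ"))             # picks the segmentation with the fewest words
print(multi_cut.find_all_segment("ทดสอบ"))  # every segmentation variant in the lattice
```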