1 change: 0 additions & 1 deletion docs/conf.py
@@ -16,7 +16,6 @@
# import sys
# sys.path.insert(0, os.path.abspath('.'))
from datetime import datetime
import sys, os

# -- Project information -----------------------------------------------------

239 changes: 119 additions & 120 deletions docs/pythainlp-dev-thai.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/tokenize.py
@@ -20,5 +20,5 @@
print(word_tokenize(text2))
# ['กฎหมายแรงงาน']

print(word_tokenize(text2, engine="longest-matching"))
print(word_tokenize(text2, engine="longest"))
# ['กฎหมาย', 'แรงงาน']
64 changes: 0 additions & 64 deletions pythainlp/MetaSound.py

This file was deleted.

2 changes: 1 addition & 1 deletion pythainlp/__init__.py
@@ -3,7 +3,7 @@
from pythainlp.collation import collation
from pythainlp.date import now
from pythainlp.keywords import find_keyword
from pythainlp.MetaSound import MetaSound
from pythainlp.metasound import metasound
from pythainlp.rank import rank
from pythainlp.romanization import romanize
from pythainlp.sentiment import sentiment
98 changes: 98 additions & 0 deletions pythainlp/metasound.py
@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
"""
MetaSound - Thai soundex system

References:
Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
Ontology for Analysing Names Given in Accordance with Thai Astrology.
https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
"""

_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์"
_THANTHAKHAT = "์" # \u0e4c
_C1 = "กขฃคฆฅ" # sound K -> coded letter 1
_C2 = "จฉชฌซฐทฒดฎตสศษ" # D -> 2
_C3 = "ฟฝพผภบป" # B -> 3
_C4 = "ง" # NG -> 4
_C5 = "ลฬรนณฦญ" # N -> 5
_C6 = "ม" # M -> 6
_C7 = "ย" # Y -> 7
_C8 = "ว" # W -> 8


def metasound(text, length=4):
"""
Thai MetaSound

:param str text: Thai text
:param int length: preferred length of the MetaSound (default is 4)
:return: MetaSound for the text
**Example**::
from pythainlp.metasound import metasound
metasound("ลัก") # 'ล100'
metasound("รัก") # 'ร100'
metasound("รักษ์") # 'ร100'
metasound("บูรณการ", 5)) # 'บ5515'
"""
    # keep only consonants and thanthakhat
    chars = []
    for ch in text:
        if ch in _CONS_THANTHAKHAT:
            chars.append(ch)

    # remove karan (thanthakhat and a consonant before it)
    i = 0
    while i < len(chars):
        if chars[i] == _THANTHAKHAT:
            if i > 0:
                chars[i - 1] = " "
            chars[i] = " "
        i += 1

    # retain first consonant, encode the rest
    chars = chars[:length]
    i = 1
    while i < len(chars):
        if chars[i] in _C1:
            chars[i] = "1"
        elif chars[i] in _C2:
            chars[i] = "2"
        elif chars[i] in _C3:
            chars[i] = "3"
        elif chars[i] in _C4:
            chars[i] = "4"
        elif chars[i] in _C5:
            chars[i] = "5"
        elif chars[i] in _C6:
            chars[i] = "6"
        elif chars[i] in _C7:
            chars[i] = "7"
        elif chars[i] in _C8:
            chars[i] = "8"
        else:
            chars[i] = "0"
        i += 1

    while len(chars) < length:
        chars.append("0")

    return "".join(chars)


if __name__ == "__main__":
    print(metasound("บูรณะ")) # บ550 (an example from the original paper [Figure 4])
    print(metasound("บูรณการ", 5)) # บ5515
    print(metasound("ลักษณะ")) # ล125
    print(metasound("ลัก")) # ล100
    print(metasound("รัก")) # ร100
    print(metasound("รักษ์")) # ร100
    print(metasound("")) # 0000

    print(metasound("คน"))
    print(metasound("คนA"))
    print(metasound("ดา"))
    print(metasound("ปา"))
    print(metasound("งา"))
    print(metasound("ลา"))
    print(metasound("มา"))
    print(metasound("วา"))
2 changes: 0 additions & 2 deletions pythainlp/romanization/pyicu.py
@@ -1,7 +1,5 @@
# -*- coding: utf-8 -*-

import sys

try:
import icu
except ImportError:
4 changes: 2 additions & 2 deletions pythainlp/sentiment/ulmfit_sent.py
@@ -3,7 +3,6 @@
Sentiment analyzer based on thai2vec ("ulmfit" engine)
Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
"""
import sys
from collections import defaultdict

from pythainlp.corpus import download, get_file
@@ -85,7 +84,8 @@ def about():
return """
Sentiment analyzer based on thai2vec
Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge.
89% accuracy based on 15% validation set compared to 72% of fastText and 52% most-frequent-class baseline.
89% accuracy based on 15% validation set compared to
72% of fastText and 52% most-frequent-class baseline.

Development: Charin Polpanumas
GitHub: https://github.com/cstorm125/thai2vec
7 changes: 3 additions & 4 deletions pythainlp/tag/__init__.py
@@ -2,9 +2,8 @@
"""
Part-Of-Speech tagger
"""
import sys

ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
_ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"


def pos_tag(words, engine="unigram", corpus="orchid"):
@@ -31,11 +30,11 @@ def _tag(text, corpus=None):
except ImportError:
from pythainlp.tools import install_package

install_package(ARTAGGER_URL)
install_package(_ARTAGGER_URL)
try:
from artagger import Tagger
except ImportError:
raise ImportError("Error: Try 'pip install " + ARTAGGER_URL + "'")
raise ImportError("Error: Try 'pip install " + _ARTAGGER_URL + "'")

words = Tagger().tag(" ".join(text))

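A minimal, hedged usage sketch for pos_tag as declared above; the sentence is illustrative only, and pos_tag expects a list of already-tokenized words.

from pythainlp.tag import pos_tag

words = ["ผม", "รัก", "ภาษา", "ไทย"]  # hypothetical tokenized input
print(pos_tag(words, engine="unigram", corpus="orchid"))
# expected shape: a list of (word, tag) tuples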
2 changes: 1 addition & 1 deletion pythainlp/tag/perceptron.py
@@ -24,7 +24,7 @@ def pud_data():
return model


def tag(text, corpus):
def tag(text, corpus="pud"):
"""
Accepts a ''list'' of words and returns a ''list'' of tuples, e.g. [('ข้อความ', 'ชนิดคำ')]"""
if corpus == "orchid":
88 changes: 48 additions & 40 deletions pythainlp/tokenize/__init__.py
@@ -16,42 +16,48 @@

def word_tokenize(text, engine="newmm", whitespaces=True):
"""
:param str text: the text to be tokenized
:param str engine: the engine to tokenize text
:param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai.
:param str text: text to be tokenized
:param str engine: tokenizer to be used
:param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai
:Parameters for engine:
* newmm - Maximum Matching algorithm + TCC
* icu - IBM ICU
* longest-matching - Longest matching
* mm - Maximum Matching algorithm
* pylexto - LexTo
* deepcut - Deep Neural Network
* wordcutpy - wordcutpy (https://github.com/veer66/wordcutpy)
:return: A list of words, tokenized from a text
* newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
* longest - dictionary-based, Longest Matching
* icu - wrapper for ICU, dictionary-based
* wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
* pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
* deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
* ulmfit - use newmm engine with a specific dictionary for use with thai2vec
:return: list of words, tokenized from the text

**Example**::
from pythainlp.tokenize import word_tokenize
text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
word_tokenize(text, engine="newmm") # ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
word_tokenize(text, engine="icu") # ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
>>> from pythainlp.tokenize import word_tokenize
>>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
>>> word_tokenize(text, engine="newmm")
['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
>>> word_tokenize(text, engine="icu")
['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
"""
if engine == "icu":
from .pyicu import segment
elif engine == "multi_cut" or engine == "mm":
from .multi_cut import segment
if engine == "newmm" or engine == "onecut":
from .newmm import mmcut as segment
elif engine == "longest" or engine == "longest-matching":
from .longest import segment
elif engine == "ulmfit":
from .newmm import mmcut

def segment(text):
return mmcut(text, trie=FROZEN_DICT_TRIE)
elif engine == "longest-matching":
from .longest import segment
elif engine == "pylexto":
from .pylexto import segment

elif engine == "icu":
from .pyicu import segment
elif engine == "deepcut":
from .deepcut import segment
elif engine == "wordcutpy":
from .wordcutpy import segment
else: # default, use "newmm" ("onecut") engine
elif engine == "pylexto":
from .pylexto import segment
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
else: # default, use "newmm" engine
from .newmm import mmcut as segment

if not whitespaces:
@@ -63,27 +69,28 @@ def segment(text):
def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
"""
:meth:`dict_word_tokenize` tokenizes text based on the dictionary you provide. The dictionary must be given as a trie data structure.

:param str text: the text to be tokenized
:param dict custom_dict_trie: a trie created by create_custom_dict_trie
:param str engine: tokenization engine to use (newmm, wordcutpy, mm, longest-matching)
:return: A list of words, tokenized from a text.
:param str text: text to be tokenized
:param dict custom_dict_trie: a dictionary trie
:param str engine: tokenization engine to use (newmm, longest, wordcutpy)
:return: list of words
**Example**::
>>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
>>> listword=['แมว',"ดี"]
>>> data_dict=create_custom_dict_trie(listword)
>>> dict_word_tokenize("แมวดีดีแมว",data_dict)
>>> listword = ["แมว", "ดี"]
>>> data_dict = create_custom_dict_trie(listword)
>>> dict_word_tokenize("แมวดีดีแมว", data_dict)
['แมว', 'ดี', 'ดี', 'แมว']
"""
if engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
elif engine == "longest-matching":
if engine == "newmm" or engine == "onecut":
from .newmm import mmcut as segment
elif engine == "longest" or engine == "longest-matching":
from .longest import segment
elif engine == "wordcutpy":
from .wordcutpy import segment

return segment(text, custom_dict_trie.keys())
else: # default, use "newmm" ("onecut") engine
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
else: # default, use "newmm" engine
from .newmm import mmcut as segment

return segment(text, custom_dict_trie)
@@ -167,11 +174,12 @@ def syllable_tokenize(text):


def create_custom_dict_trie(custom_dict_source):
"""The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html

:param string/list custom_dict_source: a list of vocabularies or a path to a source file
"""
The function is used to create a custom dict trie, which will be used by the word_tokenize() function.
For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html

:return: A trie created from custom dict input
:param string/list custom_dict_source: a list of vocabularies or a path to a source file
:return: a trie created from custom dictionary input
"""

if type(custom_dict_source) is str:
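A minimal sketch combining create_custom_dict_trie and dict_word_tokenize as documented above; the vocabulary list mirrors the docstring example.

from pythainlp.tokenize import create_custom_dict_trie, dict_word_tokenize

# Build a trie from a custom vocabulary (a file path could be passed instead).
trie = create_custom_dict_trie(["แมว", "ดี"])
print(dict_word_tokenize("แมวดีดีแมว", trie, engine="newmm"))
# ['แมว', 'ดี', 'ดี', 'แมว']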
1 change: 0 additions & 1 deletion pythainlp/tokenize/deepcut.py
@@ -2,7 +2,6 @@
"""
Wrapper for deepcut Thai word segmentation
"""
import sys

try:
import deepcut