1 change: 1 addition & 0 deletions .gitignore
@@ -58,6 +58,7 @@ target/

# Jupyter Notebook
.ipynb_checkpoints
Untitled*.ipynb

# IDE files
.idea
19 changes: 16 additions & 3 deletions examples/spell.py
@@ -1,8 +1,21 @@
# -*- coding: utf-8 -*-

from pythainlp.spell import spell
from pythainlp.spell.pn import spell as pn_tnc_spell
from pythainlp.spell.pn import correct as pn_tnc_correct
from pythainlp.spell.pn import NorvigSpellChecker
from pythainlp.corpus import ttc

a = spell("สี่เหลียม")
print(a) # ['สี่เหลี่ยม']
# checker from pythainlp.spell module (generic)
spell("สี่เหลียม") # ['สี่เหลี่ยม']
# spell("สี่เหลียม", engine="hunspell") # available in some Linux systems

# a = spell("สี่เหลียม", engine="hunspell") # available in some Linux systems
# checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's)
pn_tnc_spell("เหลืยม")
pn_tnc_correct("เหลืยม")

# checker from pythainlp.spell.pn module (specified algorithm, custom dictionary)
ttc_word_freqs = ttc.get_word_frequency_all()
pn_ttc_spell_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs)
pn_ttc_spell_checker.spell("เหลืยม")
pn_ttc_spell_checker.correct("เหลืยม")
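For readers trying the new example outside the repository, here is a minimal sketch of the custom-dictionary pattern it relies on. It assumes, as the example does, that NorvigSpellChecker accepts a list of (word, frequency) tuples for custom_dict; the words and counts below are made up.

# Minimal sketch, not part of this PR. Assumes NorvigSpellChecker takes a
# list of (word, frequency) tuples, as it does when fed
# ttc.get_word_frequency_all() in the example above.
from pythainlp.spell.pn import NorvigSpellChecker

word_freqs = [("สี่เหลี่ยม", 20), ("เหลี่ยม", 15)]  # hypothetical counts
checker = NorvigSpellChecker(custom_dict=word_freqs)
checker.spell("เหลืยม")    # candidate corrections
checker.correct("เหลืยม")  # single best candidate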
4 changes: 2 additions & 2 deletions pythainlp/corpus/__init__.py
@@ -3,16 +3,16 @@
import os

import requests
from future.moves.urllib.request import urlopen
from pythainlp.tools import get_path_data, get_path_db
from tinydb import Query, TinyDB
from tqdm import tqdm
from urllib.request import urlopen

CORPUS_DB_URL = (
"https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json"
)

# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
# __all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"]
path_db_ = get_path_db()


5 changes: 3 additions & 2 deletions pythainlp/corpus/tnc.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""
Word frequency from Thai National Corpus
Thai National Corpus word frequency

Credit: Korakot Chaovavanich
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
"""
@@ -57,6 +58,6 @@ def get_word_frequency_all():
listword = []
for line in lines:
listindata = line.split(" ")
listword.append((listindata[0], listindata[1]))
listword.append((listindata[0], int(listindata[1])))

return listword
7 changes: 4 additions & 3 deletions pythainlp/corpus/ttc.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""
TTC Thai word frequency
Thai Textbook Corpus (TTC) word frequency

Credit: Korakot Chaovavanich
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
"""
@@ -13,7 +14,7 @@

def get_word_frequency_all():
"""
Get word frequency data from TTC
Get word frequency data from the Thai Textbook Corpus (TTC)
The data format is List[Tuple]: [(word, frequency), ...]
"""
path = os.path.join(os.path.expanduser("~"), "pythainlp-data")
@@ -34,6 +35,6 @@ def get_word_frequency_all():
listword = []
for line in lines:
listindata = line.split(" ")
listword.append((listindata[0], listindata[1]))
listword.append((listindata[0], int(listindata[1])))

return listword
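Both tnc.py and ttc.py now cast the frequency column to int, so get_word_frequency_all() returns numeric counts instead of strings. The following self-contained sketch illustrates the parsing change using made-up lines in the same "word<space>count" layout the loops above assume.

# Hypothetical corpus lines in the "word<space>count" layout parsed above.
lines = ["ที่ 12230", "การ 11825"]

listword = []
for line in lines:
    listindata = line.split(" ")
    # int() is the fix introduced here; counts were previously kept as strings
    listword.append((listindata[0], int(listindata[1])))

print(listword)  # [('ที่', 12230), ('การ', 11825)]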
54 changes: 24 additions & 30 deletions pythainlp/ner/__init__.py
@@ -5,6 +5,7 @@
from pythainlp.corpus import download, get_file, stopwords
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize
from pythainlp.util import is_thaiword

try:
import sklearn_crfsuite
@@ -22,62 +23,55 @@
_STOPWORDS = stopwords.words("thai")


def _is_thaichar(ch):  # is this a Thai character?
ch_val = ord(ch)
if ch_val >= 3584 and ch_val <= 3711:
return True
return False


def _is_thaiword(word):  # does the word contain only Thai characters?
for ch in word:
if ch != "." and not _is_thaichar(ch):
return False
return True


def _is_stopword(word):  # check whether the word is a stopword
return word in _STOPWORDS


def _doc2features(doc, i):
word = doc[i][0]
postag = doc[i][1]

# Features from current word
features = {
"word.word": word,
"word.stopword": _is_stopword(word),
"word.isthai": _is_thaiword(word),
"word.isthai": is_thaiword(word),
"word.isspace": word.isspace(),
"postag": postag,
"word.isdigit()": word.isdigit(),
}

if word.isdigit() and len(word) == 5:
features["word.islen5"] = True

# Features from previous word
if i > 0:
prevword = doc[i - 1][0]
postag1 = doc[i - 1][1]
features["word.prevword"] = prevword
features["word.previsspace"] = prevword.isspace()
features["word.previsthai"] = _is_thaiword(prevword)
features["word.prevstopword"] = _is_stopword(prevword)
features["word.prepostag"] = postag1
features["word.prevwordisdigit"] = prevword.isdigit()
prevpostag = doc[i - 1][1]
prev_features = {
"word.prevword": prevword,
"word.previsspace": prevword.isspace(),
"word.previsthai": is_thaiword(prevword),
"word.prevstopword": _is_stopword(prevword),
"word.prevpostag": prevpostag,
"word.prevwordisdigit": prevword.isdigit(),
}
features.update(prev_features)
else:
features["BOS"] = True # Special "Beginning of Sequence" tag

# Features from next word
if i < len(doc) - 1:
nextword = doc[i + 1][0]
postag1 = doc[i + 1][1]
features["word.nextword"] = nextword
features["word.nextisspace"] = nextword.isspace()
features["word.nextpostag"] = postag1
features["word.nextisthai"] = _is_thaiword(nextword)
features["word.nextstopword"] = _is_stopword(nextword)
features["word.nextwordisdigit"] = nextword.isdigit()
nextpostag = doc[i + 1][1]
next_features = {
"word.nextword": nextword,
"word.nextisspace": nextword.isspace(),
"word.nextpostag": nextpostag,
"word.nextisthai": is_thaiword(nextword),
"word.nextstopword": _is_stopword(nextword),
"word.nextwordisdigit": nextword.isdigit(),
}
features.update(next_features)
else:
features["EOS"] = True # Special "End of Sequence" tag

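For context on the refactor above, here is a self-contained sketch (not the PR's code) of the pattern _doc2features now uses: the previous-word and next-word features are collected in small dicts and merged with features.update() instead of being assigned key by key. The tokens and POS tags are made up.

# Sketch of the features.update() pattern; token/POS pairs are hypothetical.
doc = [("แมว", "NCMN"), ("กิน", "VACT"), ("ปลา", "NCMN")]

def doc2features_sketch(doc, i):
    word, postag = doc[i]
    features = {"word.word": word, "postag": postag}

    if i > 0:  # features from the previous word
        prevword, prevpostag = doc[i - 1]
        features.update({
            "word.prevword": prevword,
            "word.prevpostag": prevpostag,
        })
    else:
        features["BOS"] = True  # beginning of sequence

    if i < len(doc) - 1:  # features from the next word
        nextword, nextpostag = doc[i + 1]
        features.update({
            "word.nextword": nextword,
            "word.nextpostag": nextpostag,
        })
    else:
        features["EOS"] = True  # end of sequence

    return features

print(doc2features_sketch(doc, 1))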