12 changes: 6 additions & 6 deletions notebooks/test_wsd.ipynb
@@ -30,7 +30,7 @@
}
],
"source": [
"print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))"
"print(get_sense(\"เขากำลังอบขนมคุกกี้\", \"คุกกี้\"))"
]
},
{
@@ -50,7 +50,7 @@
}
],
"source": [
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))"
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\", \"คุกกี้\"))"
]
},
{
@@ -68,7 +68,7 @@
}
],
"source": [
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\", \"คน\"))"
]
},
{
@@ -92,7 +92,7 @@
},
"outputs": [],
"source": [
"_w=thai_wsd_dict()"
"w = thai_wsd_dict()"
]
},
{
@@ -115,7 +115,7 @@
}
],
"source": [
"_w.keys()"
"w.keys()"
]
},
{
@@ -138,7 +138,7 @@
}
],
"source": [
"_w[\"word\"][0],_w[\"meaning\"][0]"
"w[\"word\"][0], w[\"meaning\"][0]"
]
},
{
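The notebook cells above only tighten argument spacing and drop the leading underscore from the variable name; the word-sense-disambiguation calls themselves are unchanged. A minimal sketch of the same API, assuming get_sense lives in pythainlp.wsd and thai_wsd_dict in pythainlp.corpus (import paths are not shown in this diff):

from pythainlp.corpus import thai_wsd_dict  # assumed import path
from pythainlp.wsd import get_sense  # assumed import path

# Disambiguate "คุกกี้" (baked cookie vs. browser cookie) from its context.
print(get_sense("เขากำลังอบขนมคุกกี้", "คุกกี้"))
print(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน", "คุกกี้"))

# The sense dictionary exposes parallel "word" and "meaning" lists.
w = thai_wsd_dict()
print(w.keys())
print(w["word"][0], w["meaning"][0])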
19 changes: 10 additions & 9 deletions pythainlp/ancient/aksonhan.py
@@ -2,22 +2,22 @@
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from pythainlp.util import Trie
-from pythainlp import thai_consonants,thai_tonemarks
+from pythainlp import thai_consonants, thai_tonemarks
from pythainlp.tokenize import Tokenizer
from pythainlp.corpus import thai_orst_words


_dict_aksonhan = {}
for i in list(thai_consonants):
-if i=="ร":
+if i == "ร":
continue
for j in list(thai_tonemarks):
-_dict_aksonhan[i+j+i] = "ั"+j+i
-_dict_aksonhan[i+i+j+i] = i+"ั"+j+i
-_dict_aksonhan[i+i] = "ั"+i
+_dict_aksonhan[i + j + i] = "ั" + j + i
+_dict_aksonhan[i + i + j + i] = i + "ั" + j + i
+_dict_aksonhan[i + i] = "ั" + i
_set_aksonhan = set(_dict_aksonhan.keys())
-_trie = Trie(list(_dict_aksonhan.keys())+list(thai_consonants))
-_tokenizer = Tokenizer(custom_dict=_trie,engine="mm")
+_trie = Trie(list(_dict_aksonhan.keys()) + list(thai_consonants))
+_tokenizer = Tokenizer(custom_dict=_trie, engine="mm")
_dict_thai = set(thai_orst_words()) # call Thai words


@@ -52,13 +52,14 @@ def aksonhan_to_current(word: str) -> str:
return word
elif word in _set_aksonhan:
return _dict_aksonhan[word]
-elif word in _dict_thai: # word in Thai words
+elif word in _dict_thai:  # word in Thai words
return word

_seg = _tokenizer.word_tokenize(word)
_w = []
for i in _seg:
if i in _set_aksonhan:
_w.append(_dict_aksonhan[i])
else:
_w.append(i)
-return ''.join(_w)
+return "".join(_w)
8 changes: 4 additions & 4 deletions pythainlp/tokenize/__init__.py
@@ -10,12 +10,12 @@
"Tokenizer",
"Trie",
"clause_tokenize",
"paragraph_tokenize",
"sent_tokenize",
"subword_tokenize",
"syllable_tokenize",
"word_tokenize",
"word_detokenize",
"paragraph_tokenize",
"word_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
@@ -33,12 +33,12 @@
from pythainlp.tokenize.core import (
Tokenizer,
clause_tokenize,
+paragraph_tokenize,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
-word_tokenize,
word_detokenize,
-paragraph_tokenize,
+word_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus
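The reordering above is alphabetical housekeeping only; nothing enters or leaves the public API of pythainlp.tokenize. A quick sketch of the unchanged import surface (the sample sentence is illustrative):

from pythainlp.tokenize import word_detokenize, word_tokenize

tokens = word_tokenize("ผมรักคุณ")  # likely ['ผม', 'รัก', 'คุณ'] with the default engine
print(word_detokenize(tokens))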
94 changes: 49 additions & 45 deletions pythainlp/tokenize/core.py
@@ -48,7 +48,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:

def word_detokenize(
segments: Union[List[List[str]], List[str]], output: str = "str"
-) -> Union[str, List[str]]:
+) -> Union[List[str], str]:
"""
Word detokenizer.

@@ -65,16 +65,18 @@
print(word_detokenize(["เรา", "เล่น"]))
# output: เราเล่น
"""
-_list_all = []
+list_all = []

if isinstance(segments[0], str):
segments = [segments]

from pythainlp import thai_characters

for i, s in enumerate(segments):
-_list_sents = []
-_add_index = []
-_space_index = []
-_mark_index = []
+list_sents = []
+add_index = []
+space_index = []
+mark_index = []
for j, w in enumerate(s):
if j > 0:
# previous word
@@ -85,35 +87,36 @@
and not w.isspace()
and not p_w.isspace()
):
-_list_sents.append(" ")
-_add_index.append(j)
+list_sents.append(" ")
+add_index.append(j)
# if previous word is number or other language and is not space
elif p_w[0] not in thai_characters and not p_w.isspace():
-_list_sents.append(" ")
-_add_index.append(j)
+list_sents.append(" ")
+add_index.append(j)
# if word is Thai iteration mark
elif w == "ๆ":
if not p_w.isspace():
-_list_sents.append(" ")
-_mark_index.append(j)
-elif w.isspace() and j - 1 not in _space_index:
-_space_index.append(j)
-elif j - 1 in _mark_index:
-_list_sents.append(" ")
-_list_sents.append(w)
-_list_all.append(_list_sents)
+list_sents.append(" ")
+mark_index.append(j)
+elif w.isspace() and j - 1 not in space_index:
+space_index.append(j)
+elif j - 1 in mark_index:
+list_sents.append(" ")
+list_sents.append(w)
+list_all.append(list_sents)

if output == "list":
-return _list_all
-else:
-_text = []
-for i in _list_all:
-_text.append("".join(i))
-return " ".join(_text)
+return list_all

+text = []
+for i in list_all:
+text.append("".join(i))
+return " ".join(text)


def word_tokenize(
text: str,
-custom_dict: Trie = None,
+custom_dict: Trie = Trie([]),
engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
keep_whitespace: bool = True,
join_broken_num: bool = True,
@@ -290,7 +293,7 @@ def word_tokenize(

if isinstance(custom_dict, str):
segments = segment(text, custom_dict=custom_dict)
-elif not isinstance(custom_dict, str) and custom_dict is not None:
+elif not isinstance(custom_dict, str) and not custom_dict:
raise ValueError(
f"""Tokenizer \"{engine}\":
custom_dict must be a str.
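With custom_dict defaulting to an empty Trie instead of None, the emptiness check above replaces the old is-not-None check. A sketch of passing a custom dictionary to word_tokenize (the added word is illustrative only):

from pythainlp.corpus import thai_words
from pythainlp.tokenize import Trie, word_tokenize

# Extend the stock lexicon with one extra word (illustrative).
custom = Trie(list(thai_words()) + ["ปัญญาประดิษฐ์"])
print(word_tokenize("ปัญญาประดิษฐ์มาแรง", custom_dict=custom, engine="newmm"))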
@@ -415,11 +418,12 @@ def sent_tokenize(
segments = segment.split_into_sentences(text)
elif engine.startswith("wtp"):
if "-" not in engine:
_size="mini"
_size = "mini"
else:
_size = engine.split("-")[-1]
from pythainlp.tokenize.wtsplit import tokenize as segment
-segments = segment(text,size=_size,tokenize="sentence")

+segments = segment(text, size=_size, tokenize="sentence")
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
@@ -435,8 +439,8 @@
def paragraph_tokenize(
text: str,
engine: str = "wtp-mini",
-paragraph_threshold:float=0.5,
-style:str='newline',
+paragraph_threshold: float = 0.5,
+style: str = "newline",
) -> List[List[str]]:
"""
Paragraph tokenizer.
@@ -479,23 +483,25 @@ def paragraph_tokenize(
"""
if engine.startswith("wtp"):
if "-" not in engine:
_size="mini"
size = "mini"
else:
-_size = engine.split("-")[-1]
+size = engine.split("-")[-1]

from pythainlp.tokenize.wtsplit import tokenize as segment
-segments = segment(
-text,
-size=_size,
-tokenize="paragraph",
-paragraph_threshold=paragraph_threshold,
-style=style,
-)

+segments = segment(
+text,
+size=size,
+tokenize="paragraph",
+paragraph_threshold=paragraph_threshold,
+style=style,
+)
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
It might be a typo; if not, please consult our document."""
)

return segments
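Both sent_tokenize and paragraph_tokenize route wtp engines through pythainlp.tokenize.wtsplit, falling back to the mini model size when the engine string has no suffix. A sketch of both calls (likely requires the optional wtpsplit dependency; the input text is illustrative):

from pythainlp.tokenize import paragraph_tokenize, sent_tokenize

text = "ฉันไปโรงเรียนทุกวัน วันนี้ฝนตกหนักมาก"  # illustrative input

print(sent_tokenize(text, engine="wtp"))  # no suffix, so size defaults to "mini"
print(paragraph_tokenize(text, engine="wtp-mini", paragraph_threshold=0.5, style="newline"))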


@@ -622,7 +628,7 @@ def subword_tokenize(

def syllable_tokenize(
text: str,
-engine: str=DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
keep_whitespace: bool = True,
) -> List[str]:
"""
@@ -652,9 +658,7 @@ def syllable_tokenize(
It might be a typo; if not, please consult our document."""
)
return subword_tokenize(
-text=text,
-engine=engine,
-keep_whitespace=keep_whitespace
+text=text, engine=engine, keep_whitespace=keep_whitespace
)
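syllable_tokenize stays a thin wrapper that forwards to subword_tokenize; the reflowed call above is cosmetic. A usage sketch (the word is illustrative and no particular segmentation is asserted, since the result depends on the default syllable engine):

from pythainlp.tokenize import syllable_tokenize

print(syllable_tokenize("อุตสาหกรรม"))  # forwarded to subword_tokenize with the default engine
print(syllable_tokenize("อุตสาหกรรม 4.0", keep_whitespace=False))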


@@ -727,7 +731,7 @@ class Tokenizer:

def __init__(
self,
-custom_dict: Union[Trie, Iterable[str], str] = None,
+custom_dict: Union[Trie, Iterable[str], str] = [],
engine: str = "newmm",
keep_whitespace: bool = True,
join_broken_num: bool = True,
@@ -743,7 +747,7 @@ def __init__(
:param bool keep_whitespace: True to keep whitespace, a common mark
for end of phrase in Thai
"""
-self.__trie_dict = None
+self.__trie_dict = Trie([])
if custom_dict:
self.__trie_dict = dict_trie(custom_dict)
else:
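With the new defaults, a Tokenizer built without a custom_dict starts from an empty Trie instead of None, while an iterable of words still goes through dict_trie. A usage sketch (the two dictionary words are illustrative):

from pythainlp.tokenize import Tokenizer

tok = Tokenizer(custom_dict=["ขนม", "คุกกี้"], engine="newmm")
print(tok.word_tokenize("ขนมคุกกี้"))  # likely ['ขนม', 'คุกกี้'] with this dictionary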
2 changes: 1 addition & 1 deletion pythainlp/tokenize/deepcut.py
@@ -21,7 +21,7 @@


def segment(
-text: str, custom_dict: Union[Trie, List[str], str] = None
+text: str, custom_dict: Union[Trie, List[str], str] = []
) -> List[str]:
if not text or not isinstance(text, str):
return []
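The deepcut wrapper gets the same empty-container default for custom_dict. It is normally reached through word_tokenize with engine="deepcut" and needs the optional deepcut package installed:

from pythainlp.tokenize import word_tokenize

# Requires the optional deepcut package (pip install deepcut).
print(word_tokenize("ตัดคำภาษาไทย", engine="deepcut"))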