
Commit 2015804

Merge pull request #884 from bact/dev
Fix wrong/incompatible types, code readability
2 parents bf61884 + c098749 commit 2015804

22 files changed: +598, -594 lines changed

notebooks/test_wsd.ipynb

Lines changed: 6 additions & 6 deletions
@@ -30,7 +30,7 @@
     }
    ],
    "source": [
-    "print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))"
+    "print(get_sense(\"เขากำลังอบขนมคุกกี้\", \"คุกกี้\"))"
    ]
   },
   {
@@ -50,7 +50,7 @@
     }
    ],
    "source": [
-    "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))"
+    "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\", \"คุกกี้\"))"
    ]
   },
   {
@@ -68,7 +68,7 @@
     }
    ],
    "source": [
-    "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
+    "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\", \"คน\"))"
    ]
   },
   {
@@ -92,7 +92,7 @@
    },
    "outputs": [],
    "source": [
-    "_w=thai_wsd_dict()"
+    "w = thai_wsd_dict()"
    ]
   },
   {
@@ -115,7 +115,7 @@
     }
    ],
    "source": [
-    "_w.keys()"
+    "w.keys()"
    ]
   },
   {
@@ -138,7 +138,7 @@
     }
    ],
    "source": [
-    "_w[\"word\"][0],_w[\"meaning\"][0]"
+    "w[\"word\"][0], w[\"meaning\"][0]"
    ]
   },
   {
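These cells exercise the word-sense-disambiguation helpers; for context, a minimal sketch of what the renamed calls do end to end, assuming get_sense and thai_wsd_dict are imported from pythainlp.wsd as in the notebook's earlier cells:

# Sketch of the notebook flow after the rename; assumes pythainlp is
# installed and that earlier cells import these names from pythainlp.wsd.
from pythainlp.wsd import get_sense, thai_wsd_dict

# Disambiguate "คุกกี้" (cookie): a baking context vs. a web context.
print(get_sense("เขากำลังอบขนมคุกกี้", "คุกกี้"))
print(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน", "คุกกี้"))

# The sense dictionary itself; "w" replaces "_w", since a notebook-level
# variable has no reason to look module-private.
w = thai_wsd_dict()
print(w.keys())
print(w["word"][0], w["meaning"][0])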

pythainlp/ancient/aksonhan.py

Lines changed: 10 additions & 9 deletions
@@ -2,22 +2,22 @@
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
 from pythainlp.util import Trie
-from pythainlp import thai_consonants,thai_tonemarks
+from pythainlp import thai_consonants, thai_tonemarks
 from pythainlp.tokenize import Tokenizer
 from pythainlp.corpus import thai_orst_words
 
 
 _dict_aksonhan = {}
 for i in list(thai_consonants):
-    if i=="ร":
+    if i == "ร":
         continue
     for j in list(thai_tonemarks):
-        _dict_aksonhan[i+j+i] = "ั"+j+i
-        _dict_aksonhan[i+i+j+i] = i+"ั"+j+i
-    _dict_aksonhan[i+i] = "ั"+i
+        _dict_aksonhan[i + j + i] = "ั" + j + i
+        _dict_aksonhan[i + i + j + i] = i + "ั" + j + i
+    _dict_aksonhan[i + i] = "ั" + i
 _set_aksonhan = set(_dict_aksonhan.keys())
-_trie = Trie(list(_dict_aksonhan.keys())+list(thai_consonants))
-_tokenizer = Tokenizer(custom_dict=_trie,engine="mm")
+_trie = Trie(list(_dict_aksonhan.keys()) + list(thai_consonants))
+_tokenizer = Tokenizer(custom_dict=_trie, engine="mm")
 _dict_thai = set(thai_orst_words())  # call Thai words
 
 
@@ -52,13 +52,14 @@ def aksonhan_to_current(word: str) -> str:
         return word
     elif word in _set_aksonhan:
         return _dict_aksonhan[word]
-    elif word in _dict_thai: # word in Thai words
+    elif word in _dict_thai:  # word in Thai words
         return word
+
     _seg = _tokenizer.word_tokenize(word)
     _w = []
     for i in _seg:
         if i in _set_aksonhan:
             _w.append(_dict_aksonhan[i])
         else:
             _w.append(i)
-    return ''.join(_w)
+    return "".join(_w)

pythainlp/tokenize/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -10,12 +10,12 @@
     "Tokenizer",
     "Trie",
     "clause_tokenize",
+    "paragraph_tokenize",
     "sent_tokenize",
     "subword_tokenize",
     "syllable_tokenize",
-    "word_tokenize",
     "word_detokenize",
-    "paragraph_tokenize",
+    "word_tokenize",
 ]
 
 from pythainlp.corpus import thai_syllables, thai_words
@@ -33,12 +33,12 @@
 from pythainlp.tokenize.core import (
     Tokenizer,
     clause_tokenize,
+    paragraph_tokenize,
     sent_tokenize,
     subword_tokenize,
     syllable_tokenize,
-    word_tokenize,
     word_detokenize,
-    paragraph_tokenize,
+    word_tokenize,
 )
 
 from pythainlp.corpus import get_corpus as _get_corpus

pythainlp/tokenize/core.py

Lines changed: 49 additions & 45 deletions
@@ -48,7 +48,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
 
 def word_detokenize(
     segments: Union[List[List[str]], List[str]], output: str = "str"
-) -> Union[str, List[str]]:
+) -> Union[List[str], str]:
     """
     Word detokenizer.
 
@@ -65,16 +65,18 @@ def word_detokenize(
     print(word_detokenize(["เรา", "เล่น"]))
     # output: เราเล่น
     """
-    _list_all = []
+    list_all = []
+
     if isinstance(segments[0], str):
         segments = [segments]
+
     from pythainlp import thai_characters
 
     for i, s in enumerate(segments):
-        _list_sents = []
-        _add_index = []
-        _space_index = []
-        _mark_index = []
+        list_sents = []
+        add_index = []
+        space_index = []
+        mark_index = []
         for j, w in enumerate(s):
             if j > 0:
                 # previous word
@@ -85,35 +87,36 @@ def word_detokenize(
                     and not w.isspace()
                     and not p_w.isspace()
                 ):
-                    _list_sents.append(" ")
-                    _add_index.append(j)
+                    list_sents.append(" ")
+                    add_index.append(j)
                 # if previous word is number or other language and is not space
                 elif p_w[0] not in thai_characters and not p_w.isspace():
-                    _list_sents.append(" ")
-                    _add_index.append(j)
+                    list_sents.append(" ")
+                    add_index.append(j)
                 # if word is Thai iteration mark
                 elif w == "ๆ":
                     if not p_w.isspace():
-                        _list_sents.append(" ")
-                    _mark_index.append(j)
-                elif w.isspace() and j - 1 not in _space_index:
-                    _space_index.append(j)
-                elif j - 1 in _mark_index:
-                    _list_sents.append(" ")
-            _list_sents.append(w)
-        _list_all.append(_list_sents)
+                        list_sents.append(" ")
+                    mark_index.append(j)
+                elif w.isspace() and j - 1 not in space_index:
+                    space_index.append(j)
+                elif j - 1 in mark_index:
+                    list_sents.append(" ")
+            list_sents.append(w)
+        list_all.append(list_sents)
+
     if output == "list":
-        return _list_all
-    else:
-        _text = []
-        for i in _list_all:
-            _text.append("".join(i))
-        return " ".join(_text)
+        return list_all
+
+    text = []
+    for i in list_all:
+        text.append("".join(i))
+    return " ".join(text)
 
 
 def word_tokenize(
     text: str,
-    custom_dict: Trie = None,
+    custom_dict: Trie = Trie([]),
     engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
     keep_whitespace: bool = True,
     join_broken_num: bool = True,
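The renames and the reordered return annotation do not change behavior; a quick usage sketch of word_detokenize with both output modes, following the docstring example above:

# Usage sketch for word_detokenize, based on its own docstring example.
from pythainlp.tokenize import word_detokenize

print(word_detokenize(["เรา", "เล่น"]))                 # "เราเล่น"
print(word_detokenize(["เรา", "เล่น"], output="list"))  # [["เรา", "เล่น"]]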
@@ -290,7 +293,7 @@ def word_tokenize(
 
     if isinstance(custom_dict, str):
         segments = segment(text, custom_dict=custom_dict)
-    elif not isinstance(custom_dict, str) and custom_dict is not None:
+    elif not isinstance(custom_dict, str) and not custom_dict:
         raise ValueError(
             f"""Tokenizer \"{engine}\":
             custom_dict must be a str.
@@ -415,11 +418,12 @@ def sent_tokenize(
         segments = segment.split_into_sentences(text)
     elif engine.startswith("wtp"):
         if "-" not in engine:
-            _size="mini"
+            _size = "mini"
         else:
             _size = engine.split("-")[-1]
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(text,size=_size,tokenize="sentence")
+
+        segments = segment(text, size=_size, tokenize="sentence")
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
@@ -435,8 +439,8 @@ def sent_tokenize(
 def paragraph_tokenize(
     text: str,
     engine: str = "wtp-mini",
-    paragraph_threshold:float=0.5,
-    style:str='newline',
+    paragraph_threshold: float = 0.5,
+    style: str = "newline",
 ) -> List[List[str]]:
     """
     Paragraph tokenizer.
@@ -479,23 +483,25 @@ def paragraph_tokenize(
     """
     if engine.startswith("wtp"):
         if "-" not in engine:
-            _size="mini"
+            size = "mini"
         else:
-            _size = engine.split("-")[-1]
+            size = engine.split("-")[-1]
+
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(
-            text,
-            size=_size,
-            tokenize="paragraph",
-            paragraph_threshold=paragraph_threshold,
-            style=style,
-        )
 
+        segments = segment(
+            text,
+            size=size,
+            tokenize="paragraph",
+            paragraph_threshold=paragraph_threshold,
+            style=style,
+        )
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
             It might be a typo; if not, please consult our document."""
         )
+
     return segments
 
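A hedged usage sketch of the reformatted signature; the sample text is illustrative, and the call assumes the underlying wtp model can be fetched on first use:

# Illustrative call mirroring the defaults in the signature above.
from pythainlp.tokenize import paragraph_tokenize

text = "ผมชอบกินข้าว\nเธอชอบอ่านหนังสือ"  # illustrative newline-separated text
paragraphs = paragraph_tokenize(
    text,
    engine="wtp-mini",
    paragraph_threshold=0.5,
    style="newline",
)
print(paragraphs)  # List[List[str]]: sentences grouped per paragraph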

@@ -622,7 +628,7 @@ def subword_tokenize(
 
 def syllable_tokenize(
     text: str,
-    engine: str=DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
     keep_whitespace: bool = True,
 ) -> List[str]:
     """
@@ -652,9 +658,7 @@ def syllable_tokenize(
         It might be a typo; if not, please consult our document."""
     )
     return subword_tokenize(
-        text=text,
-        engine=engine,
-        keep_whitespace=keep_whitespace
+        text=text, engine=engine, keep_whitespace=keep_whitespace
     )
 
@@ -727,7 +731,7 @@ class Tokenizer:
 
     def __init__(
         self,
-        custom_dict: Union[Trie, Iterable[str], str] = None,
+        custom_dict: Union[Trie, Iterable[str], str] = [],
         engine: str = "newmm",
         keep_whitespace: bool = True,
         join_broken_num: bool = True,
@@ -743,7 +747,7 @@ def __init__(
         :param bool keep_whitespace: True to keep whitespace, a common mark
                                      for end of phrase in Thai
         """
-        self.__trie_dict = None
+        self.__trie_dict = Trie([])
         if custom_dict:
             self.__trie_dict = dict_trie(custom_dict)
         else:
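The class-level change mirrors the function-level one: an empty container default instead of None, so the custom_dict branch stays consistent with the annotated types. A usage sketch, with the two-word dictionary illustrative:

# Usage sketch for Tokenizer; the custom dictionary is illustrative.
# Any iterable of words is converted to a Trie via dict_trie.
from pythainlp.tokenize import Tokenizer

tokenizer = Tokenizer(custom_dict=["ขนม", "คุกกี้"], engine="newmm")
print(tokenizer.word_tokenize("ขนมคุกกี้"))  # expected: ['ขนม', 'คุกกี้']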

pythainlp/tokenize/deepcut.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 
 
 def segment(
-    text: str, custom_dict: Union[Trie, List[str], str] = None
+    text: str, custom_dict: Union[Trie, List[str], str] = []
 ) -> List[str]:
     if not text or not isinstance(text, str):
         return []
