Skip to content

Commit 1bec0f2

Browse files
committed
Add pythainlp.corpus.thai_wsd_dict
1 parent 87106b5 commit 1bec0f2

File tree

5 files changed

+119
-16
lines changed

5 files changed

+119
-16
lines changed

docs/api/corpus.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Modules
1919
.. autofunction:: thai_dict
2020
.. autofunction:: thai_stopwords
2121
.. autofunction:: thai_words
22+
.. autofunction:: thai_wsd_dict
2223
.. autofunction:: thai_orst_words
2324
.. autofunction:: thai_syllables
2425
.. autofunction:: thai_negations

notebooks/test_wsd.ipynb

Lines changed: 82 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
},
1515
{
1616
"cell_type": "code",
17-
"execution_count": 2,
17+
"execution_count": 9,
1818
"id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16",
1919
"metadata": {
2020
"tags": []
@@ -24,8 +24,7 @@
2424
"name": "stdout",
2525
"output_type": "stream",
2626
"text": [
27-
"Setting ds_accelerator to cuda (auto detect)\n",
28-
"[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086)]\n"
27+
"[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n"
2928
]
3029
}
3130
],
@@ -35,7 +34,7 @@
3534
},
3635
{
3736
"cell_type": "code",
38-
"execution_count": 3,
37+
"execution_count": 10,
3938
"id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33",
4039
"metadata": {
4140
"tags": []
@@ -45,7 +44,7 @@
4544
"name": "stdout",
4645
"output_type": "stream",
4746
"text": [
48-
"[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584)]\n"
47+
"[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n"
4948
]
5049
}
5150
],
@@ -70,6 +69,84 @@
7069
"source": [
7170
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
7271
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 5,
76+
"id": "32fa3fe9-0e1a-4176-b8f3-18d666eb3162",
77+
"metadata": {
78+
"tags": []
79+
},
80+
"outputs": [],
81+
"source": [
82+
"from pythainlp.corpus import get_corpus_path, thai_wsd_dict"
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": 6,
88+
"id": "0f88ff4c-06db-4cba-8086-4bb2160bead0",
89+
"metadata": {
90+
"tags": []
91+
},
92+
"outputs": [],
93+
"source": [
94+
"_w=thai_wsd_dict()"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": 7,
100+
"id": "83642893-d9a6-4271-a1b7-5e57638a74d4",
101+
"metadata": {
102+
"tags": []
103+
},
104+
"outputs": [
105+
{
106+
"data": {
107+
"text/plain": [
108+
"dict_keys(['word', 'meaning'])"
109+
]
110+
},
111+
"execution_count": 7,
112+
"metadata": {},
113+
"output_type": "execute_result"
114+
}
115+
],
116+
"source": [
117+
"_w.keys()"
118+
]
119+
},
120+
{
121+
"cell_type": "code",
122+
"execution_count": 8,
123+
"id": "bb67c468-ce65-4581-adc6-832d70cfabab",
124+
"metadata": {
125+
"tags": []
126+
},
127+
"outputs": [
128+
{
129+
"data": {
130+
"text/plain": [
131+
"('เดิน', ['ยกเท้าก้าวไป', 'เคลื่อนไปด้วยกำลังต่าง ๆ'])"
132+
]
133+
},
134+
"execution_count": 8,
135+
"metadata": {},
136+
"output_type": "execute_result"
137+
}
138+
],
139+
"source": [
140+
"_w[\"word\"][0],_w[\"meaning\"][0]"
141+
]
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": null,
146+
"id": "27fbe522-019f-4157-a9a8-50ae62b50727",
147+
"metadata": {},
148+
"outputs": [],
149+
"source": []
73150
}
74151
],
75152
"metadata": {

pythainlp/corpus/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
"thai_stopwords",
4141
"thai_syllables",
4242
"thai_words",
43+
"thai_wsd_dict",
4344
"thai_orst_words",
4445
"path_pythainlp_corpus",
4546
"get_path_folder_corpus",
@@ -114,4 +115,5 @@ def corpus_db_path() -> str:
114115
thai_words,
115116
thai_orst_words,
116117
thai_dict,
118+
thai_wsd_dict
117119
)

pythainlp/corpus/common.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"thai_syllables",
2828
"thai_words",
2929
"thai_dict",
30+
"thai_wsd_dict",
3031
]
3132

3233
from typing import FrozenSet, List, Union
@@ -62,6 +63,7 @@
6263
_THAI_ORST_WORDS = set()
6364

6465
_THAI_DICT = {}
66+
_THAI_WSD_DICT = {}
6567

6668

6769
def countries() -> FrozenSet[str]:
@@ -260,6 +262,7 @@ def thai_male_names() -> FrozenSet[str]:
260262

261263
return _THAI_MALE_NAMES
262264

265+
263266
def thai_dict() -> dict:
264267
"""
265268
Return Thai dictionary with definition from wiktionary.
@@ -280,3 +283,29 @@ def thai_dict() -> dict:
280283
_THAI_DICT["meaning"].append(row["meaning"])
281284

282285
return _THAI_DICT
286+
287+
288+
def thai_wsd_dict() -> dict:
    """
    Return Thai Word Sense Disambiguation dictionary with definitions
    taken from the data returned by :func:`thai_dict`.
    \n(See: `thai_dict\
    <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_)

    Only words that have more than one distinct definition are kept,
    since a word with a single sense needs no disambiguation.

    The result is built once and cached in the module-level
    ``_THAI_WSD_DICT``; later calls return the same object.

    :return: dict with two parallel lists: ``"word"`` (the Thai words) and
             ``"meaning"`` (for each word, the list of its distinct
             definitions, in first-seen order)
    :rtype: dict
    """
    # Local import so this function does not depend on the file's
    # top-level import block.
    import ast

    global _THAI_WSD_DICT
    if not _THAI_WSD_DICT:
        source = thai_dict()
        words = []
        meanings = []
        for word, raw_meaning in zip(source["word"], source["meaning"]):
            # Each "meaning" entry is the textual form of a dict literal
            # (presumably keyed by part-of-speech -- confirm against the
            # corpus). literal_eval parses literals only, so corpus data
            # can never execute arbitrary code the way eval() could.
            grouped = ast.literal_eval(raw_meaning)
            # dict.fromkeys removes duplicates while keeping first-seen
            # order, so the output is the same on every run (a set's
            # order changes with string hash randomisation).
            senses = list(
                dict.fromkeys(
                    sense for group in grouped.values() for sense in group
                )
            )
            if len(senses) > 1:
                words.append(word)
                meanings.append(senses)
        # Publish the cache only after the whole build succeeded, so an
        # exception above cannot leave a half-filled cache behind.
        _THAI_WSD_DICT = {"word": words, "meaning": meanings}

    return _THAI_WSD_DICT

pythainlp/wsd/core.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,13 @@
1818
from pythainlp.corpus import thai_words
1919
from pythainlp.tokenize import Tokenizer
2020
from pythainlp.util.trie import Trie, dict_trie
21-
from pythainlp.corpus import get_corpus_path, thai_dict
21+
from pythainlp.corpus import get_corpus_path, thai_wsd_dict
2222

23-
_thai_wsd = thai_dict()
23+
_wsd_dict = thai_wsd_dict()
2424
_mean_all = {}
25-
for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]):
26-
_all_value = list(eval(j).values())
27-
_use = []
28-
for k in _all_value:
29-
_use.extend(k)
30-
_use=list(set(_use))
31-
if len(_use)>1:
32-
_mean_all[i]=_use
33-
_all_word=set(list(_mean_all.keys()))
25+
for i,j in zip(_wsd_dict["word"], _wsd_dict["meaning"]):
26+
_mean_all[i]=j
27+
_all_word = set(list(_mean_all.keys()))
3428
_TRIE = Trie(list(_all_word))
3529
_word_cut = Tokenizer(custom_dict=_TRIE)
3630

0 commit comments

Comments
 (0)