From 3e9b28b54e552502dfefb6734295135f58d244bb Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 12 Jul 2023 01:26:28 -0700 Subject: [PATCH 1/8] Add pythainlp.wsd --- docs/api/wsd.rst | 12 ++++ notebooks/test_wsd.ipynb | 104 ++++++++++++++++++++++++++++++++ pythainlp/wsd/__init__.py | 19 ++++++ pythainlp/wsd/core.py | 122 ++++++++++++++++++++++++++++++++++++++ tests/test_wsd.py | 9 +++ 5 files changed, 266 insertions(+) create mode 100644 docs/api/wsd.rst create mode 100644 notebooks/test_wsd.ipynb create mode 100644 pythainlp/wsd/__init__.py create mode 100644 pythainlp/wsd/core.py create mode 100644 tests/test_wsd.py diff --git a/docs/api/wsd.rst b/docs/api/wsd.rst new file mode 100644 index 000000000..d62691e5b --- /dev/null +++ b/docs/api/wsd.rst @@ -0,0 +1,12 @@ +.. currentmodule:: pythainlp.wsd + +pythainlp.wsd +============= + +The :class:`pythainlp.wsd` contains get word sense function for Thai Word Sense Disambiguation (WSD). + + +Modules +------- + +.. autofunction:: get_sense \ No newline at end of file diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb new file mode 100644 index 000000000..19a2684f2 --- /dev/null +++ b/notebooks/test_wsd.ipynb @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "70e6b5ba-063d-4e53-a312-2380b49bc3a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pythainlp.wsd import get_sense" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting ds_accelerator to cuda (auto detect)\n" + ] + }, + { + "data": { + "text/plain": [ + "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',\n", + " 0.0974416732788086),\n", + " ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',\n", + " 0.09319090843200684)]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',\n", + " 0.1005704402923584),\n", + " ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',\n", + " 0.12473666667938232)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ee35fc-f26e-4bce-b6fa-0e1efc863ae4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pythainlp/wsd/__init__.py b/pythainlp/wsd/__init__.py new file mode 100644 index 000000000..e1ca154b8 --- /dev/null +++ b/pythainlp/wsd/__init__.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Thai Word Sense Disambiguation (WSD) +""" +__all__ = ["get_sense"] +from pythainlp.wsd.core import get_sense \ No newline at end of file diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py new file mode 100644 index 000000000..5c87272fb --- /dev/null +++ b/pythainlp/wsd/core.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import csv +from typing import List, Tuple, Union + +from pythainlp.corpus import thai_words +from pythainlp.tokenize import Tokenizer +from pythainlp.util.trie import Trie, dict_trie +from pythainlp.corpus import get_corpus_path + +_thai_wsd = {"word":[], "meaning":[]} +with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile, delimiter=",") + for row in reader: + _thai_wsd["word"].append(row["word"]) + _thai_wsd["meaning"].append(row["meaning"]) +_mean_all = {} +for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]): + _all_value = list(eval(j).values()) + _use = [] + for k in _all_value: + _use.extend(k) + _use=list(set(_use)) + if len(_use)>1: + _mean_all[i]=_use +_all_word=set(list(_mean_all.keys())) +_TRIE = Trie(list(_all_word)) +_word_cut = Tokenizer(custom_dict=_TRIE) + + +class _SentenceTransformersModel: + def __init__(self, model:str="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", device:str="cpu"): + from sentence_transformers import SentenceTransformer + self.device = device + self.model_name = model + self.model = SentenceTransformer(self.model_name, device=self.device) + def change_device(self, device: str): + from sentence_transformers import SentenceTransformer + self.device = device + self.model = SentenceTransformer(self.model_name, device=self.device) + def get_score(self, sentences1: str,sentences2: str)->float: + from sentence_transformers import util + embedding_1= self.model.encode(sentences1, convert_to_tensor=True) + embedding_2 = self.model.encode(sentences2, convert_to_tensor=True) + return 1-util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item() + +_MODEL = None + + +def get_sense( + sentence: str, + word: str, + device:str="cpu", + custom_dict: dict=_mean_all, + custom_tokenizer: Tokenizer=_word_cut, +) -> Union[List[Tuple[str, float]], None]: + """ + Get word sense from the sentence. + This function will get definition and distance from context in sentence. + + :param str sentence: Thai sentence + :param str word: Thai word + :param str device: device for running model. + :param dict custom_dict: Thai dictionary {"word":{"part-of-speech":["definition"]}} + :param Tokenizer custom_tokenizer: Tokenizer for tokenize words from sentence. + + We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised Word Sense Disambiguation `_ to build get_sense function. + + For Thai dictionary, We use Thai dictionary from wiktionary. + See more `thai_dict `_. + + For the model, We use Sentence Transformers model from + sentence-transformers/paraphrase-multilingual-mpnet-base-v2. + + :Example: + :: + + from pythainlp.wsd import get_sense + print(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้")) + # output: + # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', + # 0.0974416732788086), + # ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', + # 0.09319090843200684)] + + print(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้")) + # output: + # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', + # 0.1005704402923584), + # ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', + # 0.12473666667938232)] + """ + global _MODEL + _w = custom_tokenizer.word_tokenize(sentence) + if word not in _w: + return None + if _MODEL == None: + _MODEL = _SentenceTransformersModel(device=device) + if _MODEL.device!=device: + _MODEL.change_device(device=device) + _temp_mean = custom_dict[word] + _temp =[] + for i in _temp_mean: + _temp_2 = [] + for j in _w: + if j == word: + j = word+f" ({word} ความหมาย '"+i.replace('(',"").replace(')',"")+"') " + _temp_2.append(j) + _temp.append((i,_MODEL.get_score(sentence,''.join(_temp_2)))) + return _temp \ No newline at end of file diff --git a/tests/test_wsd.py b/tests/test_wsd.py new file mode 100644 index 000000000..01dfd0186 --- /dev/null +++ b/tests/test_wsd.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +import unittest +from pythainlp.wsd import get_sense + + +class TestWsdPackage(unittest.TestCase): + def test_get_sense(self): + self.assertIsNotNone(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้")) + self.assertIsNotNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้")) From 2216899854e5f9b55f5c393d9de1ed7ec8d81864 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 12 Jul 2023 01:39:10 -0700 Subject: [PATCH 2/8] Add wsd requirements --- docker_requirements.txt | 1 + docs/notes/installation.rst | 1 + setup.py | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/docker_requirements.txt b/docker_requirements.txt index 4cd8b63f9..c5fe9591e 100644 --- a/docker_requirements.txt +++ b/docker_requirements.txt @@ -37,3 +37,4 @@ ufal.chu-liu-edmonds==1.0.2 wtpsplit==1.0.1 fastcoref==2.1.6 panphon==0.20.0 +sentence-transformers==2.2.2 diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst index fa5bdb896..92bbd436e 100644 --- a/docs/notes/installation.rst +++ b/docs/notes/installation.rst @@ -36,6 +36,7 @@ where ``extras`` can be - ``transformers_ud`` (to support transformers_ud engine) - ``dependency_parsing`` (to support dependency parsing with all engine) - ``coreference_resolution`` (to support coreference esolution with all engine) + - ``wsd`` (to support pythainlp.wsd) - ``full`` (install everything) For dependency details, look at `extras` variable in `setup.py `_. diff --git a/setup.py b/setup.py index 7da84e696..16bb96c1a 100644 --- a/setup.py +++ b/setup.py @@ -117,6 +117,9 @@ "word_approximation":{ "panphon>=0.20.0" }, + "wsd":{ + "sentence-transformers>=2.2.2" + }, "full": [ "PyYAML>=5.3.1", "attacut>=1.0.4", @@ -150,6 +153,7 @@ "fastcoref>=2.1.5", "ufal.chu-liu-edmonds>=1.0.2", "panphon>=0.20.0", + "sentence-transformers>=2.2.2", ], } From 233ba2ff1944bd7ba730aab55bee3fc66201c009 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 12 Jul 2023 01:44:29 -0700 Subject: [PATCH 3/8] Improve get_sense docs --- pythainlp/wsd/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py index 5c87272fb..1c28808b9 100644 --- a/pythainlp/wsd/core.py +++ b/pythainlp/wsd/core.py @@ -75,6 +75,8 @@ def get_sense( :param str device: device for running model. :param dict custom_dict: Thai dictionary {"word":{"part-of-speech":["definition"]}} :param Tokenizer custom_tokenizer: Tokenizer for tokenize words from sentence. + :return: list of definition and distance or None (If word is not in the dictionary) + :rtype: Union[List[Tuple[str, float]], None] We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised Word Sense Disambiguation `_ to build get_sense function. From 07531747bd32460c144e656788b7e3852c62db30 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 12 Jul 2023 02:19:39 -0700 Subject: [PATCH 4/8] Update get_sense docs --- pythainlp/wsd/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py index 1c28808b9..c5b4a43cf 100644 --- a/pythainlp/wsd/core.py +++ b/pythainlp/wsd/core.py @@ -73,7 +73,7 @@ def get_sense( :param str sentence: Thai sentence :param str word: Thai word :param str device: device for running model. - :param dict custom_dict: Thai dictionary {"word":{"part-of-speech":["definition"]}} + :param dict custom_dict: Thai dictionary {"word":["definition",..]} :param Tokenizer custom_tokenizer: Tokenizer for tokenize words from sentence. :return: list of definition and distance or None (If word is not in the dictionary) :rtype: Union[List[Tuple[str, float]], None] From d14d2b8d20978ba794c452ad77aa375d944ba5d5 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 12 Jul 2023 02:30:31 -0700 Subject: [PATCH 5/8] Update pythainlp.wsd --- notebooks/test_wsd.ipynb | 52 +++++++++++++++++----------------------- pythainlp/wsd/core.py | 2 +- tests/test_wsd.py | 1 + 3 files changed, 24 insertions(+), 31 deletions(-) diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb index 19a2684f2..a905f6a45 100644 --- a/notebooks/test_wsd.ipynb +++ b/notebooks/test_wsd.ipynb @@ -24,25 +24,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Setting ds_accelerator to cuda (auto detect)\n" + "Setting ds_accelerator to cuda (auto detect)\n", + "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n" ] - }, - { - "data": { - "text/plain": [ - "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',\n", - " 0.0974416732788086),\n", - " ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',\n", - " 0.09319090843200684)]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\")" + "print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))" ] }, { @@ -54,30 +42,34 @@ }, "outputs": [ { - "data": { - "text/plain": [ - "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',\n", - " 0.1005704402923584),\n", - " ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',\n", - " 0.12473666667938232)]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n" + ] } ], "source": [ - "get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\")" + "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "b0ee35fc-f26e-4bce-b6fa-0e1efc863ae4", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))" + ] } ], "metadata": { diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py index c5b4a43cf..63440ea8c 100644 --- a/pythainlp/wsd/core.py +++ b/pythainlp/wsd/core.py @@ -106,7 +106,7 @@ def get_sense( """ global _MODEL _w = custom_tokenizer.word_tokenize(sentence) - if word not in _w: + if word not in set(custom_dict.keys()) or word not in sentence: return None if _MODEL == None: _MODEL = _SentenceTransformersModel(device=device) diff --git a/tests/test_wsd.py b/tests/test_wsd.py index 01dfd0186..b58fe76fa 100644 --- a/tests/test_wsd.py +++ b/tests/test_wsd.py @@ -7,3 +7,4 @@ class TestWsdPackage(unittest.TestCase): def test_get_sense(self): self.assertIsNotNone(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้")) self.assertIsNotNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้")) + self.assertIsNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คน")) From 87106b564e0cfd277c748d81cf00a5bb0fa7aa94 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 12 Jul 2023 05:00:53 -0700 Subject: [PATCH 6/8] Add pythainlp.corpus.thai_dict --- docs/api/corpus.rst | 1 + notebooks/test_wsd.ipynb | 4 ++-- pythainlp/corpus/__init__.py | 2 ++ pythainlp/corpus/common.py | 26 +++++++++++++++++++++++++- pythainlp/wsd/core.py | 9 ++------- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index 9f74e645a..7da4c81a2 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -16,6 +16,7 @@ Modules .. autofunction:: download .. autofunction:: remove .. autofunction:: provinces +.. autofunction:: thai_dict .. autofunction:: thai_stopwords .. autofunction:: thai_words .. autofunction:: thai_orst_words diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb index a905f6a45..2de6dda74 100644 --- a/notebooks/test_wsd.ipynb +++ b/notebooks/test_wsd.ipynb @@ -25,7 +25,7 @@ "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\n", - "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n" + "[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086)]\n" ] } ], @@ -45,7 +45,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n" + "[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584)]\n" ] } ], diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 49f8e1920..6c81f1a9f 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -32,6 +32,7 @@ "get_corpus_path", "provinces", "remove", + "thai_dict", "thai_family_names", "thai_female_names", "thai_male_names", @@ -112,4 +113,5 @@ def corpus_db_path() -> str: thai_syllables, thai_words, thai_orst_words, + thai_dict, ) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 2a99fb89c..bb87e3dd1 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -26,11 +26,12 @@ "thai_stopwords", "thai_syllables", "thai_words", + "thai_dict", ] from typing import FrozenSet, List, Union -from pythainlp.corpus import get_corpus +from pythainlp.corpus import get_corpus, get_corpus_path _THAI_COUNTRIES = set() _THAI_COUNTRIES_FILENAME = "countries_th.txt" @@ -60,6 +61,8 @@ _THAI_ORST_WORDS = set() +_THAI_DICT = {} + def countries() -> FrozenSet[str]: """ @@ -256,3 +259,24 @@ def thai_male_names() -> FrozenSet[str]: _THAI_MALE_NAMES = get_corpus(_THAI_MALE_NAMES_FILENAME) return _THAI_MALE_NAMES + +def thai_dict() -> dict: + """ + Return Thai dictionary with definition from wiktionary. + \n(See: `thai_dict\ + `_) + + :return: Thai word with part-of-speech type and definition + :rtype: :class:`frozenset` + """ + global _THAI_DICT + if _THAI_DICT == {}: + import csv + _THAI_DICT = {"word":[], "meaning":[]} + with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile, delimiter=",") + for row in reader: + _THAI_DICT["word"].append(row["word"]) + _THAI_DICT["meaning"].append(row["meaning"]) + + return _THAI_DICT diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py index 63440ea8c..49685c079 100644 --- a/pythainlp/wsd/core.py +++ b/pythainlp/wsd/core.py @@ -18,14 +18,9 @@ from pythainlp.corpus import thai_words from pythainlp.tokenize import Tokenizer from pythainlp.util.trie import Trie, dict_trie -from pythainlp.corpus import get_corpus_path +from pythainlp.corpus import get_corpus_path, thai_dict -_thai_wsd = {"word":[], "meaning":[]} -with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile: - reader = csv.DictReader(csvfile, delimiter=",") - for row in reader: - _thai_wsd["word"].append(row["word"]) - _thai_wsd["meaning"].append(row["meaning"]) +_thai_wsd = thai_dict() _mean_all = {} for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]): _all_value = list(eval(j).values()) From 1bec0f2f76fa19913ee8a705fa48bd1e8c5591fd Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 12 Jul 2023 05:42:28 -0700 Subject: [PATCH 7/8] Add pythainlp.corpus.thai_wsd_dict --- docs/api/corpus.rst | 1 + notebooks/test_wsd.ipynb | 87 +++++++++++++++++++++++++++++++++--- pythainlp/corpus/__init__.py | 2 + pythainlp/corpus/common.py | 29 ++++++++++++ pythainlp/wsd/core.py | 16 +++---- 5 files changed, 119 insertions(+), 16 deletions(-) diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index 7da4c81a2..ff8617358 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -19,6 +19,7 @@ Modules .. autofunction:: thai_dict .. autofunction:: thai_stopwords .. autofunction:: thai_words +.. autofunction:: thai_wsd_dict .. autofunction:: thai_orst_words .. autofunction:: thai_syllables .. autofunction:: thai_negations diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb index 2de6dda74..9a2fc7027 100644 --- a/notebooks/test_wsd.ipynb +++ b/notebooks/test_wsd.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16", "metadata": { "tags": [] @@ -24,8 +24,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Setting ds_accelerator to cuda (auto detect)\n", - "[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086)]\n" + "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n" ] } ], @@ -35,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33", "metadata": { "tags": [] @@ -45,7 +44,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584)]\n" + "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n" ] } ], @@ -70,6 +69,84 @@ "source": [ "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))" ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "32fa3fe9-0e1a-4176-b8f3-18d666eb3162", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pythainlp.corpus import get_corpus_path, thai_wsd_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0f88ff4c-06db-4cba-8086-4bb2160bead0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "_w=thai_wsd_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "83642893-d9a6-4271-a1b7-5e57638a74d4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['word', 'meaning'])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_w.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bb67c468-ce65-4581-adc6-832d70cfabab", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('เดิน', ['ยกเท้าก้าวไป', 'เคลื่อนไปด้วยกำลังต่าง ๆ'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_w[\"word\"][0],_w[\"meaning\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27fbe522-019f-4157-a9a8-50ae62b50727", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 6c81f1a9f..ca6e047d4 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -40,6 +40,7 @@ "thai_stopwords", "thai_syllables", "thai_words", + "thai_wsd_dict", "thai_orst_words", "path_pythainlp_corpus", "get_path_folder_corpus", @@ -114,4 +115,5 @@ def corpus_db_path() -> str: thai_words, thai_orst_words, thai_dict, + thai_wsd_dict ) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index bb87e3dd1..15007b5e3 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -27,6 +27,7 @@ "thai_syllables", "thai_words", "thai_dict", + "thai_wsd_dict", ] from typing import FrozenSet, List, Union @@ -62,6 +63,7 @@ _THAI_ORST_WORDS = set() _THAI_DICT = {} +_THAI_WSD_DICT = {} def countries() -> FrozenSet[str]: @@ -260,6 +262,7 @@ def thai_male_names() -> FrozenSet[str]: return _THAI_MALE_NAMES + def thai_dict() -> dict: """ Return Thai dictionary with definition from wiktionary. @@ -280,3 +283,29 @@ def thai_dict() -> dict: _THAI_DICT["meaning"].append(row["meaning"]) return _THAI_DICT + + +def thai_wsd_dict() -> dict: + """ + Return Thai Word Sense Disambiguation dictionary with definition from wiktionary. + \n(See: `thai_dict\ + `_) + + :return: Thai word with part-of-speech type and definition + :rtype: :class:`frozenset` + """ + global _THAI_WSD_DICT + if _THAI_WSD_DICT == {}: + _thai_wsd = thai_dict() + _THAI_WSD_DICT = {"word":[],"meaning":[]} + for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]): + _all_value = list(eval(j).values()) + _use = [] + for k in _all_value: + _use.extend(k) + _use=list(set(_use)) + if len(_use)>1: + _THAI_WSD_DICT["word"].append(i) + _THAI_WSD_DICT["meaning"].append(_use) + + return _THAI_WSD_DICT \ No newline at end of file diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py index 49685c079..e5a984065 100644 --- a/pythainlp/wsd/core.py +++ b/pythainlp/wsd/core.py @@ -18,19 +18,13 @@ from pythainlp.corpus import thai_words from pythainlp.tokenize import Tokenizer from pythainlp.util.trie import Trie, dict_trie -from pythainlp.corpus import get_corpus_path, thai_dict +from pythainlp.corpus import get_corpus_path, thai_wsd_dict -_thai_wsd = thai_dict() +_wsd_dict = thai_wsd_dict() _mean_all = {} -for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]): - _all_value = list(eval(j).values()) - _use = [] - for k in _all_value: - _use.extend(k) - _use=list(set(_use)) - if len(_use)>1: - _mean_all[i]=_use -_all_word=set(list(_mean_all.keys())) +for i,j in zip(_wsd_dict["word"], _wsd_dict["meaning"]): + _mean_all[i]=j +_all_word = set(list(_mean_all.keys())) _TRIE = Trie(list(_all_word)) _word_cut = Tokenizer(custom_dict=_TRIE) From f38c51a8ea97a2e2fa2f6b1cbf53ea21ad418741 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 12 Jul 2023 22:00:55 -0700 Subject: [PATCH 8/8] Update pythainlp.wsd docs --- pythainlp/wsd/core.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py index e5a984065..17dfee873 100644 --- a/pythainlp/wsd/core.py +++ b/pythainlp/wsd/core.py @@ -67,13 +67,15 @@ def get_sense( :return: list of definition and distance or None (If word is not in the dictionary) :rtype: Union[List[Tuple[str, float]], None] - We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised Word Sense Disambiguation `_ to build get_sense function. + We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised \ + Word Sense Disambiguation `_ to build get_sense function. For Thai dictionary, We use Thai dictionary from wiktionary. See more `thai_dict `_. - For the model, We use Sentence Transformers model from - sentence-transformers/paraphrase-multilingual-mpnet-base-v2. + For the model, We use Sentence Transformers model from \ + sentence-transformers/paraphrase-multilingual-mpnet-base-v2 for \ + Unsupervised Word Sense Disambiguation. :Example: ::