diff --git a/docker_requirements.txt b/docker_requirements.txt index 4cd8b63f9..c5fe9591e 100644 --- a/docker_requirements.txt +++ b/docker_requirements.txt @@ -37,3 +37,4 @@ ufal.chu-liu-edmonds==1.0.2 wtpsplit==1.0.1 fastcoref==2.1.6 panphon==0.20.0 +sentence-transformers==2.2.2 diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index 9f74e645a..ff8617358 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -16,8 +16,10 @@ Modules .. autofunction:: download .. autofunction:: remove .. autofunction:: provinces +.. autofunction:: thai_dict .. autofunction:: thai_stopwords .. autofunction:: thai_words +.. autofunction:: thai_wsd_dict .. autofunction:: thai_orst_words .. autofunction:: thai_syllables .. autofunction:: thai_negations diff --git a/docs/api/wsd.rst b/docs/api/wsd.rst new file mode 100644 index 000000000..d62691e5b --- /dev/null +++ b/docs/api/wsd.rst @@ -0,0 +1,12 @@ +.. currentmodule:: pythainlp.wsd + +pythainlp.wsd +============= + +The :class:`pythainlp.wsd` module contains a function for getting the sense of a word, for Thai Word Sense Disambiguation (WSD). + + +Modules +------- + +.. autofunction:: get_sense \ No newline at end of file diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst index fa5bdb896..92bbd436e 100644 --- a/docs/notes/installation.rst +++ b/docs/notes/installation.rst @@ -36,6 +36,7 @@ where ``extras`` can be - ``transformers_ud`` (to support transformers_ud engine) - ``dependency_parsing`` (to support dependency parsing with all engine) - ``coreference_resolution`` (to support coreference esolution with all engine) + - ``wsd`` (to support pythainlp.wsd) - ``full`` (install everything) For dependency details, look at `extras` variable in `setup.py `_. 
diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb new file mode 100644 index 000000000..9a2fc7027 --- /dev/null +++ b/notebooks/test_wsd.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "70e6b5ba-063d-4e53-a312-2380b49bc3a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pythainlp.wsd import get_sense" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n" + ] + } + ], + "source": [ + "print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n" + ] + } + ], + "source": [ + "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b0ee35fc-f26e-4bce-b6fa-0e1efc863ae4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "32fa3fe9-0e1a-4176-b8f3-18d666eb3162", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pythainlp.corpus import get_corpus_path, thai_wsd_dict" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "id": "0f88ff4c-06db-4cba-8086-4bb2160bead0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "_w=thai_wsd_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "83642893-d9a6-4271-a1b7-5e57638a74d4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['word', 'meaning'])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_w.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bb67c468-ce65-4581-adc6-832d70cfabab", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('เดิน', ['ยกเท้าก้าวไป', 'เคลื่อนไปด้วยกำลังต่าง ๆ'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_w[\"word\"][0],_w[\"meaning\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27fbe522-019f-4157-a9a8-50ae62b50727", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 49f8e1920..ca6e047d4 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -32,6 +32,7 @@ "get_corpus_path", "provinces", "remove", + "thai_dict", "thai_family_names", "thai_female_names", "thai_male_names", @@ -39,6 +40,7 @@ "thai_stopwords", "thai_syllables", "thai_words", + "thai_wsd_dict", "thai_orst_words", "path_pythainlp_corpus", "get_path_folder_corpus", @@ -112,4 +114,6 @@ def 
corpus_db_path() -> str: thai_syllables, thai_words, thai_orst_words, + thai_dict, + thai_wsd_dict ) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 2a99fb89c..15007b5e3 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -26,11 +26,13 @@ "thai_stopwords", "thai_syllables", "thai_words", + "thai_dict", + "thai_wsd_dict", ] from typing import FrozenSet, List, Union -from pythainlp.corpus import get_corpus +from pythainlp.corpus import get_corpus, get_corpus_path _THAI_COUNTRIES = set() _THAI_COUNTRIES_FILENAME = "countries_th.txt" @@ -60,6 +62,9 @@ _THAI_ORST_WORDS = set() +_THAI_DICT = {} +_THAI_WSD_DICT = {} + def countries() -> FrozenSet[str]: """ @@ -256,3 +261,51 @@ def thai_male_names() -> FrozenSet[str]: _THAI_MALE_NAMES = get_corpus(_THAI_MALE_NAMES_FILENAME) return _THAI_MALE_NAMES + + +def thai_dict() -> dict: + """ + Return Thai dictionary with definition from wiktionary. + \n(See: `thai_dict\ + `_) + + :return: Thai word with part-of-speech type and definition + :rtype: :class:`dict` + """ + global _THAI_DICT + if _THAI_DICT == {}: + import csv + _THAI_DICT = {"word":[], "meaning":[]} + with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile, delimiter=",") + for row in reader: + _THAI_DICT["word"].append(row["word"]) + _THAI_DICT["meaning"].append(row["meaning"]) + + return _THAI_DICT + + +def thai_wsd_dict() -> dict: + """ + Return Thai Word Sense Disambiguation dictionary with definition from wiktionary. 
+ \n(See: `thai_dict\ + `_) + + :return: Thai word with part-of-speech type and definition + :rtype: :class:`dict` + """ + global _THAI_WSD_DICT + if _THAI_WSD_DICT == {}: + _thai_wsd = thai_dict() + _THAI_WSD_DICT = {"word":[],"meaning":[]} + for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]): + _all_value = list(eval(j).values()) + _use = [] + for k in _all_value: + _use.extend(k) + _use=list(set(_use)) + if len(_use)>1: + _THAI_WSD_DICT["word"].append(i) + _THAI_WSD_DICT["meaning"].append(_use) + + return _THAI_WSD_DICT \ No newline at end of file diff --git a/pythainlp/wsd/__init__.py b/pythainlp/wsd/__init__.py new file mode 100644 index 000000000..e1ca154b8 --- /dev/null +++ b/pythainlp/wsd/__init__.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Thai Word Sense Disambiguation (WSD) +""" +__all__ = ["get_sense"] +from pythainlp.wsd.core import get_sense \ No newline at end of file diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py new file mode 100644 index 000000000..17dfee873 --- /dev/null +++ b/pythainlp/wsd/core.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import csv +from typing import List, Tuple, Union + +from pythainlp.corpus import thai_words +from pythainlp.tokenize import Tokenizer +from pythainlp.util.trie import Trie, dict_trie +from pythainlp.corpus import get_corpus_path, thai_wsd_dict + +_wsd_dict = thai_wsd_dict() +_mean_all = {} +for i,j in zip(_wsd_dict["word"], _wsd_dict["meaning"]): + _mean_all[i]=j +_all_word = set(list(_mean_all.keys())) +_TRIE = Trie(list(_all_word)) +_word_cut = Tokenizer(custom_dict=_TRIE) + + +class _SentenceTransformersModel: + def __init__(self, model:str="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", device:str="cpu"): + from sentence_transformers import SentenceTransformer + self.device = device + self.model_name = model + self.model = SentenceTransformer(self.model_name, device=self.device) + def change_device(self, device: str): + from sentence_transformers import SentenceTransformer + self.device = device + self.model = SentenceTransformer(self.model_name, device=self.device) + def get_score(self, sentences1: str,sentences2: str)->float: + from sentence_transformers import util + embedding_1= self.model.encode(sentences1, convert_to_tensor=True) + embedding_2 = self.model.encode(sentences2, convert_to_tensor=True) + return 1-util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item() + +_MODEL = None + + +def get_sense( + sentence: str, + word: str, + device:str="cpu", + custom_dict: dict=_mean_all, + custom_tokenizer: Tokenizer=_word_cut, +) -> Union[List[Tuple[str, float]], None]: + """ + Get word sense from the sentence. 
+ This function will get definition and distance from context in sentence. + + :param str sentence: Thai sentence + :param str word: Thai word + :param str device: device for running model. + :param dict custom_dict: Thai dictionary {"word":["definition",..]} + :param Tokenizer custom_tokenizer: Tokenizer for tokenize words from sentence. + :return: list of definition and distance or None (If word is not in the dictionary) + :rtype: Union[List[Tuple[str, float]], None] + + We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised \ + Word Sense Disambiguation `_ to build get_sense function. + + For Thai dictionary, We use Thai dictionary from wiktionary. + See more `thai_dict `_. + + For the model, We use Sentence Transformers model from \ + sentence-transformers/paraphrase-multilingual-mpnet-base-v2 for \ + Unsupervised Word Sense Disambiguation. + + :Example: + :: + + from pythainlp.wsd import get_sense + print(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้")) + # output: + # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', + # 0.0974416732788086), + # ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', + # 0.09319090843200684)] + + print(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้")) + # output: + # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', + # 0.1005704402923584), + # ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', + # 0.12473666667938232)] + """ + global _MODEL + _w = custom_tokenizer.word_tokenize(sentence) + if word not in set(custom_dict.keys()) or word not in sentence: + return None + if _MODEL == None: + _MODEL = _SentenceTransformersModel(device=device) + if _MODEL.device!=device: + _MODEL.change_device(device=device) + _temp_mean = custom_dict[word] + _temp =[] + for i in _temp_mean: + _temp_2 = [] + for j in _w: + if j == word: + j = word+f" ({word} ความหมาย '"+i.replace('(',"").replace(')',"")+"') " + _temp_2.append(j) + 
_temp.append((i,_MODEL.get_score(sentence,''.join(_temp_2)))) + return _temp \ No newline at end of file diff --git a/setup.py b/setup.py index 7da84e696..16bb96c1a 100644 --- a/setup.py +++ b/setup.py @@ -117,6 +117,9 @@ "word_approximation":{ "panphon>=0.20.0" }, + "wsd":{ + "sentence-transformers>=2.2.2" + }, "full": [ "PyYAML>=5.3.1", "attacut>=1.0.4", @@ -150,6 +153,7 @@ "fastcoref>=2.1.5", "ufal.chu-liu-edmonds>=1.0.2", "panphon>=0.20.0", + "sentence-transformers>=2.2.2", ], } diff --git a/tests/test_wsd.py b/tests/test_wsd.py new file mode 100644 index 000000000..b58fe76fa --- /dev/null +++ b/tests/test_wsd.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +import unittest +from pythainlp.wsd import get_sense + + +class TestWsdPackage(unittest.TestCase): + def test_get_sense(self): + self.assertIsNotNone(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้")) + self.assertIsNotNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้")) + self.assertIsNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คน"))