From 3e9b28b54e552502dfefb6734295135f58d244bb Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 12 Jul 2023 01:26:28 -0700
Subject: [PATCH 1/8] Add pythainlp.wsd

---
 docs/api/wsd.rst          |  12 ++++
 notebooks/test_wsd.ipynb  | 104 ++++++++++++++++++++++++++++++++
 pythainlp/wsd/__init__.py |  19 ++++++
 pythainlp/wsd/core.py     | 122 ++++++++++++++++++++++++++++++++++++++
 tests/test_wsd.py         |   9 +++
 5 files changed, 266 insertions(+)
 create mode 100644 docs/api/wsd.rst
 create mode 100644 notebooks/test_wsd.ipynb
 create mode 100644 pythainlp/wsd/__init__.py
 create mode 100644 pythainlp/wsd/core.py
 create mode 100644 tests/test_wsd.py

diff --git a/docs/api/wsd.rst b/docs/api/wsd.rst
new file mode 100644
index 000000000..d62691e5b
--- /dev/null
+++ b/docs/api/wsd.rst
@@ -0,0 +1,12 @@
+.. currentmodule:: pythainlp.wsd
+
+pythainlp.wsd
+=============
+
+The :mod:`pythainlp.wsd` module provides a word sense function for Thai Word Sense Disambiguation (WSD).
+
+
+Modules
+-------
+
+.. autofunction:: get_sense
\ No newline at end of file
diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb
new file mode 100644
index 000000000..19a2684f2
--- /dev/null
+++ b/notebooks/test_wsd.ipynb
@@ -0,0 +1,104 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "70e6b5ba-063d-4e53-a312-2380b49bc3a9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pythainlp.wsd import get_sense"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Setting ds_accelerator to cuda (auto detect)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',\n",
+       "  0.0974416732788086),\n",
+       " ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',\n",
+       "  0.09319090843200684)]"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',\n",
+       "  0.1005704402923584),\n",
+       " ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',\n",
+       "  0.12473666667938232)]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0ee35fc-f26e-4bce-b6fa-0e1efc863ae4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pythainlp/wsd/__init__.py b/pythainlp/wsd/__init__.py
new file mode 100644
index 000000000..e1ca154b8
--- /dev/null
+++ b/pythainlp/wsd/__init__.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Thai Word Sense Disambiguation (WSD)
+"""
+__all__ = ["get_sense"]
+from pythainlp.wsd.core import get_sense
\ No newline at end of file
diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py
new file mode 100644
index 000000000..5c87272fb
--- /dev/null
+++ b/pythainlp/wsd/core.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import csv
+from typing import List, Tuple, Union
+
+from pythainlp.corpus import thai_words
+from pythainlp.tokenize import Tokenizer
+from pythainlp.util.trie import Trie, dict_trie
+from pythainlp.corpus import get_corpus_path
+
+_thai_wsd = {"word":[], "meaning":[]}
+with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile:
+    reader = csv.DictReader(csvfile, delimiter=",")
+    for row in reader:
+        _thai_wsd["word"].append(row["word"])
+        _thai_wsd["meaning"].append(row["meaning"])
+_mean_all = {}
+for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]):
+    _all_value = list(eval(j).values())
+    _use = []
+    for k in _all_value:
+        _use.extend(k)
+    _use=list(set(_use))
+    if len(_use)>1:
+        _mean_all[i]=_use
+_all_word=set(list(_mean_all.keys()))
+_TRIE = Trie(list(_all_word))
+_word_cut = Tokenizer(custom_dict=_TRIE)
+
+
+class _SentenceTransformersModel:
+    def __init__(self, model:str="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", device:str="cpu"):
+        from sentence_transformers import SentenceTransformer
+        self.device = device
+        self.model_name = model
+        self.model = SentenceTransformer(self.model_name, device=self.device)
+    def change_device(self, device: str):
+        from sentence_transformers import SentenceTransformer
+        self.device = device
+        self.model = SentenceTransformer(self.model_name, device=self.device)
+    def get_score(self, sentences1: str,sentences2: str)->float:
+        from sentence_transformers import util
+        embedding_1= self.model.encode(sentences1, convert_to_tensor=True)
+        embedding_2 = self.model.encode(sentences2, convert_to_tensor=True)
+        return 1-util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item()
+
+_MODEL = None
+
+
+def get_sense(
+    sentence: str,
+    word: str,
+    device:str="cpu",
+    custom_dict: dict=_mean_all,
+    custom_tokenizer: Tokenizer=_word_cut,
+) -> Union[List[Tuple[str, float]], None]:
+    """
+    Get word sense from the sentence.
+    This function returns each definition of the word with its distance from the context of the sentence.
+    
+    :param str sentence: Thai sentence
+    :param str word: Thai word
+    :param str device: device for running model.
+    :param dict custom_dict: Thai dictionary {"word":{"part-of-speech":["definition"]}}
+    :param Tokenizer custom_tokenizer: Tokenizer for tokenize words from sentence.
+    
+    We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised Word Sense Disambiguation <https://arxiv.org/abs/2305.03520>`_ to build get_sense function.
+
+    For Thai dictionary, We use Thai dictionary from wiktionary.
+    See more `thai_dict <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_.
+    
+    For the model, We use Sentence Transformers model from 
+    sentence-transformers/paraphrase-multilingual-mpnet-base-v2.
+    
+    :Example:
+    ::
+
+        from pythainlp.wsd import get_sense
+        print(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้"))
+        # output:
+        # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',
+        #   0.0974416732788086),
+        #  ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',
+        #   0.09319090843200684)]
+
+        print(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้"))
+        # output:
+        # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',
+        #   0.1005704402923584),
+        #  ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',
+        #   0.12473666667938232)]
+    """
+    global _MODEL
+    _w = custom_tokenizer.word_tokenize(sentence)
+    if word not in _w:
+        return None
+    if _MODEL == None:
+        _MODEL = _SentenceTransformersModel(device=device)
+    if _MODEL.device!=device:
+        _MODEL.change_device(device=device)
+    _temp_mean = custom_dict[word]
+    _temp =[]
+    for i in _temp_mean:
+        _temp_2 = []
+        for j in _w:
+            if j == word:
+                j = word+f" ({word} ความหมาย '"+i.replace('(',"").replace(')',"")+"') "
+            _temp_2.append(j)
+        _temp.append((i,_MODEL.get_score(sentence,''.join(_temp_2))))
+    return _temp
\ No newline at end of file
diff --git a/tests/test_wsd.py b/tests/test_wsd.py
new file mode 100644
index 000000000..01dfd0186
--- /dev/null
+++ b/tests/test_wsd.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+import unittest
+from pythainlp.wsd import get_sense
+
+
+class TestWsdPackage(unittest.TestCase):
+    def test_get_sense(self):
+        self.assertIsNotNone(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้"))
+        self.assertIsNotNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้"))

From 2216899854e5f9b55f5c393d9de1ed7ec8d81864 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 12 Jul 2023 01:39:10 -0700
Subject: [PATCH 2/8] Add wsd requirements

---
 docker_requirements.txt     | 1 +
 docs/notes/installation.rst | 1 +
 setup.py                    | 4 ++++
 3 files changed, 6 insertions(+)

diff --git a/docker_requirements.txt b/docker_requirements.txt
index 4cd8b63f9..c5fe9591e 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -37,3 +37,4 @@ ufal.chu-liu-edmonds==1.0.2
 wtpsplit==1.0.1
 fastcoref==2.1.6
 panphon==0.20.0
+sentence-transformers==2.2.2
diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst
index fa5bdb896..92bbd436e 100644
--- a/docs/notes/installation.rst
+++ b/docs/notes/installation.rst
@@ -36,6 +36,7 @@ where ``extras`` can be
   - ``transformers_ud`` (to support transformers_ud engine)
   - ``dependency_parsing`` (to support dependency parsing with all engine)
   - ``coreference_resolution`` (to support coreference esolution with all engine)
+  - ``wsd`` (to support pythainlp.wsd)
   - ``full`` (install everything)
 
 For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
diff --git a/setup.py b/setup.py
index 7da84e696..16bb96c1a 100644
--- a/setup.py
+++ b/setup.py
@@ -117,6 +117,9 @@
     "word_approximation":{
         "panphon>=0.20.0"
     },
+    "wsd":{
+        "sentence-transformers>=2.2.2"
+    },
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -150,6 +153,7 @@
         "fastcoref>=2.1.5",
         "ufal.chu-liu-edmonds>=1.0.2",
         "panphon>=0.20.0",
+        "sentence-transformers>=2.2.2",
     ],
 }
 

From 233ba2ff1944bd7ba730aab55bee3fc66201c009 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 12 Jul 2023 01:44:29 -0700
Subject: [PATCH 3/8] Improve get_sense docs

---
 pythainlp/wsd/core.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py
index 5c87272fb..1c28808b9 100644
--- a/pythainlp/wsd/core.py
+++ b/pythainlp/wsd/core.py
@@ -75,6 +75,8 @@ def get_sense(
     :param str device: device for running model.
     :param dict custom_dict: Thai dictionary {"word":{"part-of-speech":["definition"]}}
     :param Tokenizer custom_tokenizer: Tokenizer for tokenize words from sentence.
+    :return: list of definition and distance or None (If word is not in the dictionary)
+    :rtype: Union[List[Tuple[str, float]], None]
     
     We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised Word Sense Disambiguation <https://arxiv.org/abs/2305.03520>`_ to build get_sense function.
 

From 07531747bd32460c144e656788b7e3852c62db30 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 12 Jul 2023 02:19:39 -0700
Subject: [PATCH 4/8] Update get_sense docs

---
 pythainlp/wsd/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py
index 1c28808b9..c5b4a43cf 100644
--- a/pythainlp/wsd/core.py
+++ b/pythainlp/wsd/core.py
@@ -73,7 +73,7 @@ def get_sense(
     :param str sentence: Thai sentence
     :param str word: Thai word
     :param str device: device for running model.
-    :param dict custom_dict: Thai dictionary {"word":{"part-of-speech":["definition"]}}
+    :param dict custom_dict: Thai dictionary {"word":["definition",..]}
     :param Tokenizer custom_tokenizer: Tokenizer for tokenize words from sentence.
     :return: list of definition and distance or None (If word is not in the dictionary)
     :rtype: Union[List[Tuple[str, float]], None]

From d14d2b8d20978ba794c452ad77aa375d944ba5d5 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 12 Jul 2023 02:30:31 -0700
Subject: [PATCH 5/8] Update pythainlp.wsd

---
 notebooks/test_wsd.ipynb | 52 +++++++++++++++++-----------------------
 pythainlp/wsd/core.py    |  2 +-
 tests/test_wsd.py        |  1 +
 3 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb
index 19a2684f2..a905f6a45 100644
--- a/notebooks/test_wsd.ipynb
+++ b/notebooks/test_wsd.ipynb
@@ -24,25 +24,13 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Setting ds_accelerator to cuda (auto detect)\n"
+      "Setting ds_accelerator to cuda (auto detect)\n",
+      "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n"
      ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',\n",
-       "  0.0974416732788086),\n",
-       " ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',\n",
-       "  0.09319090843200684)]"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
-    "get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\")"
+    "print(get_sense(\"เขากำลังอบขนมคุกกี้\",\"คุกกี้\"))"
    ]
   },
   {
@@ -54,30 +42,34 @@
    },
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',\n",
-       "  0.1005704402923584),\n",
-       " ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',\n",
-       "  0.12473666667938232)]"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n"
+     ]
     }
    ],
    "source": [
-    "get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\")"
+    "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คุกกี้\"))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "b0ee35fc-f26e-4bce-b6fa-0e1efc863ae4",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
+   ]
   }
  ],
  "metadata": {
diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py
index c5b4a43cf..63440ea8c 100644
--- a/pythainlp/wsd/core.py
+++ b/pythainlp/wsd/core.py
@@ -106,7 +106,7 @@ def get_sense(
     """
     global _MODEL
     _w = custom_tokenizer.word_tokenize(sentence)
-    if word not in _w:
+    if word not in set(custom_dict.keys()) or word not in sentence:
         return None
     if _MODEL == None:
         _MODEL = _SentenceTransformersModel(device=device)
diff --git a/tests/test_wsd.py b/tests/test_wsd.py
index 01dfd0186..b58fe76fa 100644
--- a/tests/test_wsd.py
+++ b/tests/test_wsd.py
@@ -7,3 +7,4 @@ class TestWsdPackage(unittest.TestCase):
     def test_get_sense(self):
         self.assertIsNotNone(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้"))
         self.assertIsNotNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้"))
+        self.assertIsNone(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คน"))

From 87106b564e0cfd277c748d81cf00a5bb0fa7aa94 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 12 Jul 2023 05:00:53 -0700
Subject: [PATCH 6/8] Add pythainlp.corpus.thai_dict

---
 docs/api/corpus.rst          |  1 +
 notebooks/test_wsd.ipynb     |  4 ++--
 pythainlp/corpus/__init__.py |  2 ++
 pythainlp/corpus/common.py   | 26 +++++++++++++++++++++++++-
 pythainlp/wsd/core.py        |  9 ++-------
 5 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst
index 9f74e645a..7da4c81a2 100644
--- a/docs/api/corpus.rst
+++ b/docs/api/corpus.rst
@@ -16,6 +16,7 @@ Modules
 .. autofunction:: download
 .. autofunction:: remove
 .. autofunction:: provinces
+.. autofunction:: thai_dict
 .. autofunction:: thai_stopwords
 .. autofunction:: thai_words
 .. autofunction:: thai_orst_words
diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb
index a905f6a45..2de6dda74 100644
--- a/notebooks/test_wsd.ipynb
+++ b/notebooks/test_wsd.ipynb
@@ -25,7 +25,7 @@
      "output_type": "stream",
      "text": [
       "Setting ds_accelerator to cuda (auto detect)\n",
-      "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n"
+      "[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086)]\n"
      ]
     }
    ],
@@ -45,7 +45,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n"
+      "[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584)]\n"
      ]
     }
    ],
diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py
index 49f8e1920..6c81f1a9f 100644
--- a/pythainlp/corpus/__init__.py
+++ b/pythainlp/corpus/__init__.py
@@ -32,6 +32,7 @@
     "get_corpus_path",
     "provinces",
     "remove",
+    "thai_dict",
     "thai_family_names",
     "thai_female_names",
     "thai_male_names",
@@ -112,4 +113,5 @@ def corpus_db_path() -> str:
     thai_syllables,
     thai_words,
     thai_orst_words,
+    thai_dict,
 )
diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py
index 2a99fb89c..bb87e3dd1 100644
--- a/pythainlp/corpus/common.py
+++ b/pythainlp/corpus/common.py
@@ -26,11 +26,12 @@
     "thai_stopwords",
     "thai_syllables",
     "thai_words",
+    "thai_dict",
 ]
 
 from typing import FrozenSet, List, Union
 
-from pythainlp.corpus import get_corpus
+from pythainlp.corpus import get_corpus, get_corpus_path
 
 _THAI_COUNTRIES = set()
 _THAI_COUNTRIES_FILENAME = "countries_th.txt"
@@ -60,6 +61,8 @@
 
 _THAI_ORST_WORDS = set()
 
+_THAI_DICT = {}
+
 
 def countries() -> FrozenSet[str]:
     """
@@ -256,3 +259,24 @@ def thai_male_names() -> FrozenSet[str]:
         _THAI_MALE_NAMES = get_corpus(_THAI_MALE_NAMES_FILENAME)
 
     return _THAI_MALE_NAMES
+
+def thai_dict() -> dict:
+    """
+    Return Thai dictionary with definition from wiktionary.
+    \n(See: `thai_dict\
+    <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_)
+
+    :return: dictionary with two parallel lists, ``"word"`` and ``"meaning"``
+    :rtype: dict
+    """
+    global _THAI_DICT
+    if _THAI_DICT == {}:
+        import csv
+        _THAI_DICT = {"word":[], "meaning":[]}
+        with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile:
+            reader = csv.DictReader(csvfile, delimiter=",")
+            for row in reader:
+                _THAI_DICT["word"].append(row["word"])
+                _THAI_DICT["meaning"].append(row["meaning"])
+
+    return _THAI_DICT
diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py
index 63440ea8c..49685c079 100644
--- a/pythainlp/wsd/core.py
+++ b/pythainlp/wsd/core.py
@@ -18,14 +18,9 @@
 from pythainlp.corpus import thai_words
 from pythainlp.tokenize import Tokenizer
 from pythainlp.util.trie import Trie, dict_trie
-from pythainlp.corpus import get_corpus_path
+from pythainlp.corpus import get_corpus_path, thai_dict
 
-_thai_wsd = {"word":[], "meaning":[]}
-with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile:
-    reader = csv.DictReader(csvfile, delimiter=",")
-    for row in reader:
-        _thai_wsd["word"].append(row["word"])
-        _thai_wsd["meaning"].append(row["meaning"])
+_thai_wsd = thai_dict()
 _mean_all = {}
 for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]):
     _all_value = list(eval(j).values())

From 1bec0f2f76fa19913ee8a705fa48bd1e8c5591fd Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 12 Jul 2023 05:42:28 -0700
Subject: [PATCH 7/8] Add pythainlp.corpus.thai_wsd_dict

---
 docs/api/corpus.rst          |  1 +
 notebooks/test_wsd.ipynb     | 87 +++++++++++++++++++++++++++++++++---
 pythainlp/corpus/__init__.py |  2 +
 pythainlp/corpus/common.py   | 29 ++++++++++++
 pythainlp/wsd/core.py        | 16 +++----
 5 files changed, 119 insertions(+), 16 deletions(-)

diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst
index 7da4c81a2..ff8617358 100644
--- a/docs/api/corpus.rst
+++ b/docs/api/corpus.rst
@@ -19,6 +19,7 @@ Modules
 .. autofunction:: thai_dict
 .. autofunction:: thai_stopwords
 .. autofunction:: thai_words
+.. autofunction:: thai_wsd_dict
 .. autofunction:: thai_orst_words
 .. autofunction:: thai_syllables
 .. autofunction:: thai_negations
diff --git a/notebooks/test_wsd.ipynb b/notebooks/test_wsd.ipynb
index 2de6dda74..9a2fc7027 100644
--- a/notebooks/test_wsd.ipynb
+++ b/notebooks/test_wsd.ipynb
@@ -14,7 +14,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 9,
    "id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16",
    "metadata": {
     "tags": []
@@ -24,8 +24,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Setting ds_accelerator to cuda (auto detect)\n",
-      "[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086)]\n"
+      "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n"
      ]
     }
    ],
@@ -35,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 10,
    "id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33",
    "metadata": {
     "tags": []
@@ -45,7 +44,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584)]\n"
+      "[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n"
      ]
     }
    ],
@@ -70,6 +69,84 @@
    "source": [
     "print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "32fa3fe9-0e1a-4176-b8f3-18d666eb3162",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pythainlp.corpus import get_corpus_path, thai_wsd_dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "0f88ff4c-06db-4cba-8086-4bb2160bead0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "_w=thai_wsd_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "83642893-d9a6-4271-a1b7-5e57638a74d4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['word', 'meaning'])"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "_w.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "bb67c468-ce65-4581-adc6-832d70cfabab",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('เดิน', ['ยกเท้าก้าวไป', 'เคลื่อนไปด้วยกำลังต่าง ๆ'])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "_w[\"word\"][0],_w[\"meaning\"][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27fbe522-019f-4157-a9a8-50ae62b50727",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py
index 6c81f1a9f..ca6e047d4 100644
--- a/pythainlp/corpus/__init__.py
+++ b/pythainlp/corpus/__init__.py
@@ -40,6 +40,7 @@
     "thai_stopwords",
     "thai_syllables",
     "thai_words",
+    "thai_wsd_dict",
     "thai_orst_words",
     "path_pythainlp_corpus",
     "get_path_folder_corpus",
@@ -114,4 +115,5 @@ def corpus_db_path() -> str:
     thai_words,
     thai_orst_words,
     thai_dict,
+    thai_wsd_dict
 )
diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py
index bb87e3dd1..15007b5e3 100644
--- a/pythainlp/corpus/common.py
+++ b/pythainlp/corpus/common.py
@@ -27,6 +27,7 @@
     "thai_syllables",
     "thai_words",
     "thai_dict",
+    "thai_wsd_dict",
 ]
 
 from typing import FrozenSet, List, Union
@@ -62,6 +63,7 @@
 _THAI_ORST_WORDS = set()
 
 _THAI_DICT = {}
+_THAI_WSD_DICT = {}
 
 
 def countries() -> FrozenSet[str]:
@@ -260,6 +262,7 @@ def thai_male_names() -> FrozenSet[str]:
 
     return _THAI_MALE_NAMES
 
+
 def thai_dict() -> dict:
     """
     Return Thai dictionary with definition from wiktionary.
@@ -280,3 +283,29 @@ def thai_dict() -> dict:
                 _THAI_DICT["meaning"].append(row["meaning"])
 
     return _THAI_DICT
+
+
+def thai_wsd_dict() -> dict:
+    """
+    Return Thai Word Sense Disambiguation dictionary with definition from wiktionary.
+    \n(See: `thai_dict\
+    <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_)
+
+    :return: dictionary with two parallel lists, ``"word"`` and ``"meaning"`` (only words with more than one definition are kept)
+    :rtype: dict
+    """
+    global _THAI_WSD_DICT
+    if _THAI_WSD_DICT == {}:
+        _thai_wsd = thai_dict()
+        _THAI_WSD_DICT = {"word":[],"meaning":[]}
+        for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]):
+            _all_value = list(eval(j).values())
+            _use = []
+            for k in _all_value:
+                _use.extend(k)
+            _use=list(set(_use))
+            if len(_use)>1:
+                _THAI_WSD_DICT["word"].append(i)
+                _THAI_WSD_DICT["meaning"].append(_use)
+
+    return _THAI_WSD_DICT
\ No newline at end of file
diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py
index 49685c079..e5a984065 100644
--- a/pythainlp/wsd/core.py
+++ b/pythainlp/wsd/core.py
@@ -18,19 +18,13 @@
 from pythainlp.corpus import thai_words
 from pythainlp.tokenize import Tokenizer
 from pythainlp.util.trie import Trie, dict_trie
-from pythainlp.corpus import get_corpus_path, thai_dict
+from pythainlp.corpus import get_corpus_path, thai_wsd_dict
 
-_thai_wsd = thai_dict()
+_wsd_dict = thai_wsd_dict()
 _mean_all = {}
-for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]):
-    _all_value = list(eval(j).values())
-    _use = []
-    for k in _all_value:
-        _use.extend(k)
-    _use=list(set(_use))
-    if len(_use)>1:
-        _mean_all[i]=_use
-_all_word=set(list(_mean_all.keys()))
+for i,j in zip(_wsd_dict["word"], _wsd_dict["meaning"]):
+    _mean_all[i]=j
+_all_word = set(list(_mean_all.keys()))
 _TRIE = Trie(list(_all_word))
 _word_cut = Tokenizer(custom_dict=_TRIE)
 

From f38c51a8ea97a2e2fa2f6b1cbf53ea21ad418741 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 12 Jul 2023 22:00:55 -0700
Subject: [PATCH 8/8] Update pythainlp.wsd docs

---
 pythainlp/wsd/core.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py
index e5a984065..17dfee873 100644
--- a/pythainlp/wsd/core.py
+++ b/pythainlp/wsd/core.py
@@ -67,13 +67,15 @@ def get_sense(
     :return: list of definition and distance or None (If word is not in the dictionary)
     :rtype: Union[List[Tuple[str, float]], None]
     
-    We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised Word Sense Disambiguation <https://arxiv.org/abs/2305.03520>`_ to build get_sense function.
+    This function builds on the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised \
+    Word Sense Disambiguation <https://arxiv.org/abs/2305.03520>`_.
 
     For Thai dictionary, We use Thai dictionary from wiktionary.
     See more `thai_dict <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_.
     
-    For the model, We use Sentence Transformers model from 
-    sentence-transformers/paraphrase-multilingual-mpnet-base-v2.
+    For the model, we use the Sentence Transformers model \
+    sentence-transformers/paraphrase-multilingual-mpnet-base-v2 for \
+    unsupervised word sense disambiguation.
     
     :Example:
     ::