From e9a4862f4aa70535070870a35a784950195077a5 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 12 Dec 2023 18:11:03 +0700 Subject: [PATCH 1/8] Add pythainlp.corpus.find_synonym Find synonyms --- docs/api/corpus.rst | 5 +++++ pythainlp/corpus/__init__.py | 2 ++ pythainlp/corpus/common.py | 24 ++++++++++++++++++++++++ tests/test_corpus.py | 5 +++++ 4 files changed, 36 insertions(+) diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index ddb81e8f6..8f5a98188 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -12,6 +12,11 @@ countries .. autofunction:: countries :noindex: +find_synonym +~~~~~~~~~~~~ +.. autofunction:: find_synonym + :noindex: + get_corpus ~~~~~~~~~~ .. autofunction:: get_corpus diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 7fff98d85..d5e8ea39b 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -14,6 +14,7 @@ "corpus_path", "countries", "download", + "find_synonyms", "get_corpus", "get_corpus_as_is", "get_corpus_db", @@ -101,6 +102,7 @@ def corpus_db_path() -> str: ) # these imports must come before other pythainlp.corpus.* imports from pythainlp.corpus.common import ( countries, + find_synonyms, provinces, thai_dict, thai_family_names, diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 36a8c718c..9ce937827 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -7,6 +7,7 @@ __all__ = [ "countries", + "find_synonyms", "provinces", "thai_family_names", "thai_female_names", @@ -336,3 +337,26 @@ def thai_synonyms() -> dict: def thai_synonym() -> dict: warnings.warn("Deprecated: Use thai_synonyms() instead.", DeprecationWarning) return thai_synonyms() + + +def find_synonyms(word) -> Union[List[str], None]: + """ + Find synonyms + + :param str word: Thai word + :return: List synonyms of word or None if it isn't exist. + :rtype: Union[List[str], None] + + :Example: + :: + + from pythainlp.corpus import find_synonyms + + print(find_synonyms("หมู")) + # output: ['จรุก', 'วราห์', 'วราหะ', 'ศูกร', 'สุกร'] + """ + _temp = thai_synonyms() + if word in _temp["word"]: + _idx = _temp["word"].index(word) + return _temp["synonym"][_idx] + return None diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 01c0c77ef..ac8c5c820 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -13,6 +13,7 @@ conceptnet, countries, download, + find_synonyms, get_corpus_db, get_corpus_db_detail, get_corpus_default_db, @@ -204,3 +205,7 @@ def test_zip(self): p = get_corpus_path("test_zip") self.assertEqual(os.path.isdir(p), True) self.assertEqual(remove("test_zip"), True) + + def test_find_synonyms(self): + self.assertIsInstance(find_synonyms("หมู"), list) + self.assertIsInstance(find_synonyms("1"), None) From 7f5251cbf18c5ccf627c5143f9fedb2304892c25 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 12 Dec 2023 18:27:42 +0700 Subject: [PATCH 2/8] Change None to List in pythainlp.corpus.find_synonyms --- pythainlp/corpus/common.py | 6 +++--- tests/test_corpus.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 9ce937827..2c3251115 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -339,13 +339,13 @@ def thai_synonym() -> dict: return thai_synonyms() -def find_synonyms(word) -> Union[List[str], None]: +def find_synonyms(word) -> List[str]: """ Find synonyms :param str word: Thai word :return: List synonyms of word or None if it isn't exist. - :rtype: Union[List[str], None] + :rtype: List[str] :Example: :: @@ -359,4 +359,4 @@ def find_synonyms(word) -> Union[List[str], None]: if word in _temp["word"]: _idx = _temp["word"].index(word) return _temp["synonym"][_idx] - return None + return [] diff --git a/tests/test_corpus.py b/tests/test_corpus.py index ac8c5c820..2ddc8f3fa 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -207,5 +207,8 @@ def test_zip(self): self.assertEqual(remove("test_zip"), True) def test_find_synonyms(self): - self.assertIsInstance(find_synonyms("หมู"), list) - self.assertIsInstance(find_synonyms("1"), None) + self.assertEqual( + find_synonyms("หมู"), + ['จรุก', 'วราห์', 'วราหะ', 'ศูกร', 'สุกร'] + ) + self.assertEqual(find_synonyms("1"), []) From dc0caaa3a221d5627126cdb07e50b248db6eac47 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 12 Dec 2023 18:32:34 +0700 Subject: [PATCH 3/8] Update common.py --- pythainlp/corpus/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 2c3251115..e5540885f 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -355,8 +355,8 @@ def find_synonyms(word) -> List[str]: print(find_synonyms("หมู")) # output: ['จรุก', 'วราห์', 'วราหะ', 'ศูกร', 'สุกร'] """ - _temp = thai_synonyms() - if word in _temp["word"]: - _idx = _temp["word"].index(word) - return _temp["synonym"][_idx] + synonyms = thai_synonyms() + if word in synonyms["word"]: + idx = synonyms["word"].index(word) + return synonyms["synonym"][idx] return [] From a5aa12c8a35ac847512e4fddda85f9ba4c3dc9a9 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 12 Dec 2023 11:55:03 +0000 Subject: [PATCH 4/8] Add arg type to find_synonyms() --- pythainlp/corpus/common.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index e5540885f..50df9279a 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 + """ Common lists of words. """ @@ -339,12 +340,12 @@ def thai_synonym() -> dict: return thai_synonyms() -def find_synonyms(word) -> List[str]: +def find_synonyms(word: str) -> List[str]: """ Find synonyms :param str word: Thai word - :return: List synonyms of word or None if it isn't exist. + :return: List of synonyms of the input word or an empty list if it isn't exist. :rtype: List[str] :Example: @@ -355,8 +356,11 @@ def find_synonyms(word) -> List[str]: print(find_synonyms("หมู")) # output: ['จรุก', 'วราห์', 'วราหะ', 'ศูกร', 'สุกร'] """ - synonyms = thai_synonyms() + synonyms = thai_synonyms() # get a dictionary of {word, synonym} + if word in synonyms["word"]: + # returns the position of the first occurrence of the word idx = synonyms["word"].index(word) return synonyms["synonym"][idx] + return [] From cf6647e4137ced18feebee8deaeae68073c2ecf0 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 12 Dec 2023 20:34:17 +0700 Subject: [PATCH 5/8] Update pythainlp.corpus.find_synonyms --- pythainlp/corpus/common.py | 10 +++++----- tests/test_corpus.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 50df9279a..9547ac640 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -357,10 +357,10 @@ def find_synonyms(word: str) -> List[str]: # output: ['จรุก', 'วราห์', 'วราหะ', 'ศูกร', 'สุกร'] """ synonyms = thai_synonyms() # get a dictionary of {word, synonym} + list_synonym = [] - if word in synonyms["word"]: - # returns the position of the first occurrence of the word - idx = synonyms["word"].index(word) - return synonyms["synonym"][idx] + for idx, words in enumerate(synonyms["word"]): + if word in words: + list_synonym.extend(synonyms["synonym"][idx]) - return [] + return sorted(list(set(list_synonym))) diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 2ddc8f3fa..5a0ece6f3 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -209,6 +209,6 @@ def test_zip(self): def test_find_synonyms(self): self.assertEqual( find_synonyms("หมู"), - ['จรุก', 'วราห์', 'วราหะ', 'ศูกร', 'สุกร'] + ['จรุก', 'วราหะ', 'วราห์', 'ศูกร', 'สุกร'] ) self.assertEqual(find_synonyms("1"), []) From 92ad41a8dad0f928f0ce2316f6f5cbd0375b6636 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 12 Dec 2023 20:48:06 +0700 Subject: [PATCH 6/8] Update common.py --- pythainlp/corpus/common.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 9547ac640..5bf9ac4d7 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -354,13 +354,20 @@ def find_synonyms(word: str) -> List[str]: from pythainlp.corpus import find_synonyms print(find_synonyms("หมู")) - # output: ['จรุก', 'วราห์', 'วราหะ', 'ศูกร', 'สุกร'] + # output: ['จรุก', 'วราหะ', 'วราห์', 'ศูกร', 'สุกร'] """ synonyms = thai_synonyms() # get a dictionary of {word, synonym} list_synonym = [] - for idx, words in enumerate(synonyms["word"]): + if word in synonyms["word"]: # find by word + list_synonym.extend(synonyms["synonym"][synonyms["word"].index(word)]) + + for idx, words in enumerate(synonyms["synonym"]): # find by synonym if word in words: list_synonym.extend(synonyms["synonym"][idx]) + list_synonym.append(synonyms["word"][idx]) + + if word in list_synonym: # remove same word + list_synonym.remove(word) return sorted(list(set(list_synonym))) From 2e2f8785209876e6a3dbb666567e6a2c2109d4c6 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 12 Dec 2023 20:50:13 +0700 Subject: [PATCH 7/8] Update common.py --- pythainlp/corpus/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 5bf9ac4d7..a2183397f 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -359,15 +359,15 @@ def find_synonyms(word: str) -> List[str]: synonyms = thai_synonyms() # get a dictionary of {word, synonym} list_synonym = [] - if word in synonyms["word"]: # find by word + if word in synonyms["word"]: # find by word list_synonym.extend(synonyms["synonym"][synonyms["word"].index(word)]) - for idx, words in enumerate(synonyms["synonym"]): # find by synonym + for idx, words in enumerate(synonyms["synonym"]): # find by synonym if word in words: list_synonym.extend(synonyms["synonym"][idx]) list_synonym.append(synonyms["word"][idx]) - if word in list_synonym: # remove same word + if word in list_synonym: # remove same word list_synonym.remove(word) return sorted(list(set(list_synonym))) From d9aa851968df938a0f41c63b776211d405ce3bcb Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 12 Dec 2023 20:51:35 +0700 Subject: [PATCH 8/8] Update common.py --- pythainlp/corpus/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index a2183397f..985811817 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -367,7 +367,9 @@ def find_synonyms(word: str) -> List[str]: list_synonym.extend(synonyms["synonym"][idx]) list_synonym.append(synonyms["word"][idx]) + list_synonym = sorted(list(set(list_synonym))) + if word in list_synonym: # remove same word list_synonym.remove(word) - return sorted(list(set(list_synonym))) + return list_synonym