diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index ddb81e8f6..8f5a98188 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -12,6 +12,11 @@ countries .. autofunction:: countries :noindex: +find_synonyms +~~~~~~~~~~~~~ +.. autofunction:: find_synonyms + :noindex: + get_corpus ~~~~~~~~~~ .. autofunction:: get_corpus diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 7fff98d85..d5e8ea39b 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -14,6 +14,7 @@ "corpus_path", "countries", "download", + "find_synonyms", "get_corpus", "get_corpus_as_is", "get_corpus_db", @@ -101,6 +102,7 @@ def corpus_db_path() -> str: ) # these imports must come before other pythainlp.corpus.* imports from pythainlp.corpus.common import ( countries, + find_synonyms, provinces, thai_dict, thai_family_names, diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 36a8c718c..985811817 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 + """ Common lists of words. """ __all__ = [ "countries", + "find_synonyms", "provinces", "thai_family_names", "thai_female_names", @@ -336,3 +338,38 @@ def thai_synonyms() -> dict: def thai_synonym() -> dict: warnings.warn("Deprecated: Use thai_synonyms() instead.", DeprecationWarning) return thai_synonyms() + + +def find_synonyms(word: str) -> List[str]: + """ + Find synonyms + + :param str word: Thai word + :return: List of synonyms of the input word or an empty list if the word is not found. 
+ :rtype: List[str] + + :Example: + :: + + from pythainlp.corpus import find_synonyms + + print(find_synonyms("หมู")) + # output: ['จรุก', 'วราหะ', 'วราห์', 'ศูกร', 'สุกร'] + """ + synonyms = thai_synonyms() # get a dictionary of {word, synonym} + list_synonym = [] + + if word in synonyms["word"]: # find by word + list_synonym.extend(synonyms["synonym"][synonyms["word"].index(word)]) + + for idx, words in enumerate(synonyms["synonym"]): # find by synonym + if word in words: + list_synonym.extend(synonyms["synonym"][idx]) + list_synonym.append(synonyms["word"][idx]) + + list_synonym = sorted(list(set(list_synonym))) + + if word in list_synonym: # remove same word + list_synonym.remove(word) + + return list_synonym diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 01c0c77ef..5a0ece6f3 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -13,6 +13,7 @@ conceptnet, countries, download, + find_synonyms, get_corpus_db, get_corpus_db_detail, get_corpus_default_db, @@ -204,3 +205,10 @@ def test_zip(self): p = get_corpus_path("test_zip") self.assertEqual(os.path.isdir(p), True) self.assertEqual(remove("test_zip"), True) + + def test_find_synonyms(self): + self.assertEqual( + find_synonyms("หมู"), + ['จรุก', 'วราหะ', 'วราห์', 'ศูกร', 'สุกร'] + ) + self.assertEqual(find_synonyms("1"), [])