diff --git a/docs/api/morpheme.rst b/docs/api/morpheme.rst new file mode 100644 index 000000000..b227cae9f --- /dev/null +++ b/docs/api/morpheme.rst @@ -0,0 +1,13 @@ +.. currentmodule:: pythainlp.morpheme + +pythainlp.morpheme +================== + +The `pythainlp.morpheme` module collects functions for morpheme analysis, word formation, and more for the Thai language. + +.. autofunction:: nighit + +.. autofunction:: is_native_thai + :noindex: + + The `is_native_thai` function is a heuristic check of whether a word is a native Thai word (คำไทยแท้). It is based on simple rules and cannot be entirely reliable. diff --git a/docs/api/util.rst b/docs/api/util.rst index 063fd1ab1..8f3e3110a 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -77,11 +77,6 @@ Modules The `ipa_to_rtgs` function focuses on converting International Phonetic Alphabet (IPA) transcriptions into Royal Thai General System of Transcription (RTGS) format. This is valuable for phonetic analysis and pronunciation guides. -.. autofunction:: is_native_thai - :noindex: - - The `is_native_thai` function is a language detection tool that identifies whether text is predominantly in the Thai language or not. It aids in language identification and text categorization tasks. - .. 
autofunction:: isthai :noindex: diff --git a/notebooks/create_words.ipynb b/notebooks/create_words.ipynb new file mode 100644 index 000000000..d8d3ced83 --- /dev/null +++ b/notebooks/create_words.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pythainlp.transliterate import pronunciate\n", + "from pythainlp import thai_consonants" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'พุด-ทะ'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pronunciate(\"พุทธ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'บู-ชา'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pronunciate(\"บูชา\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'อะ-นุก'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pronunciate(\"อนุค\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def nighit(w1,w2): # read: https://www.trueplookpanya.com/learning/detail/1180\n", + " if not str(w1).endswith('ํ') and len(w1)!=2:\n", + " raise NotImplementedError(f\"The function doesn't support {w1}.\")\n", + " list_w1 = list(w1)\n", + " list_w2 = list(w2)\n", + " newword = list()\n", + " newword.append(list_w1[0])\n", + " newword.append(\"ั\")\n", + " consonant_start = [i for i in list_w2 if i in set(thai_consonants)][0]\n", + " if consonant_start in [\"ก\",\"ช\",\"ค\",\"ข\",\"ง\"]:\n", + " newword.append(\"ง\")\n", + " elif consonant_start in [\"จ\",\"ฉ\",\"ช\",\"ฌ\"]:\n", + " newword.append(\"ญ\")\n", + " elif 
consonant_start in [\"ฎ\",\"ฐ\",\"ฑ\",\"ณ\"]:\n", + " newword.append(\"ณ\")\n", + " elif consonant_start in [\"ด\",\"ถ\",\"ท\",\"ธ\",\"น\"]:\n", + " newword.append(\"น\")\n", + " elif consonant_start in [\"ป\",\"ผ\",\"พ\",\"ภ\"]:\n", + " newword.append(\"ม\")\n", + " elif consonant_start in [\"ย\",\"ร\",\"ล\",\"ฬ\",\"ว\",\"ศ\",\"ษ\",\"ส\",\"ห\"]:\n", + " newword.append(\"ง\")\n", + " else:\n", + " raise NotImplementedError(f\"The function doesn't support {w1} and {w2}.\")\n", + " newword.extend(list_w2)\n", + " return ''.join(newword)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "assert nighit(\"สํ\",\"คีต\")==\"สังคีต\"\n", + "assert nighit(\"สํ\",\"จร\")==\"สัญจร\"\n", + "assert nighit(\"สํ\",\"ฐาน\")==\"สัณฐาน\"\n", + "assert nighit(\"สํ\",\"นิษฐาน\")==\"สันนิษฐาน\"\n", + "assert nighit(\"สํ\",\"ปทา\")==\"สัมปทา\"\n", + "assert nighit(\"สํ\",\"โยค\")==\"สังโยค\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.13 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "a1d6ff38954a1cdba4cf61ffa51e42f4658fc35985cd256cd89123cae8466a39" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pythainlp/morpheme/__init__.py b/pythainlp/morpheme/__init__.py new file mode 100644 index 000000000..d04bcb5ba --- /dev/null +++ b/pythainlp/morpheme/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 + +""" +PyThaiNLP morpheme +""" 
import re

_THANTHAKHAT_CHAR = "\u0e4c"  # Thanthakhat (cancellation of sound)

# Characters treated by the heuristic as markers of a non-native word
_TH_NON_NATIVE_CHARS = {
    "ฆ",
    "ณ",
    "ฌ",
    "ฎ",
    "ฏ",
    "ฐ",
    "ฑ",
    "ฒ",
    "ธ",
    "ศ",
    "ษ",
    "ฬ",
    _THANTHAKHAT_CHAR,
}

# Native Thai final consonants
_TH_NATIVE_FINALS = {"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"}

# Known native Thai words (exceptions to the character rule above)
_TH_NATIVE_WORDS = {
    "ฆ่า",
    "เฆี่ยน",
    "ศึก",
    "ศอก",
    "เศิก",
    "เศร้า",
    "ธ",
    "ณ",
    "ฯพณฯ",
    "ใหญ่",
    "หญ้า",
    "ควาย",
    "ความ",
    "กริ่งเกรง",
    "ผลิ",
}

# Prefixes that can start a native Thai word.
# Only an exact, whole-word match is checked below (see the note there).
_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"}

# Thai consonant filter
# O ANG (U+0E2D) is omitted, as it can be considered as vowel
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)


def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้").

    This function is based on a simple heuristic algorithm
    and cannot be entirely reliable.

    The rules are applied in this order; the first one that matches decides:

    1. not a string, empty, or whitespace only -> False
    2. listed as a known native word (exception list) -> True
    3. contains a character from the non-native set -> False
    4. contains no Thai consonant (O ANG is not counted) -> False
    5. contains exactly one Thai consonant -> True
    6. last character is a native final consonant -> True
    7. is exactly one of "กะ", "กระ", "ปะ", "ประ" -> True
    8. anything else -> False

    :param str word: word; surrounding whitespace is ignored
    :return: True if the word passes the heuristic, False otherwise
    :rtype: bool

    :Example:

    English word::

        from pythainlp.morpheme import is_native_thai

        is_native_thai("Avocado")
        # output: False

    Native Thai word::

        is_native_thai("มะม่วง")
        # output: True
        is_native_thai("ตะวัน")
        # output: True

    Non-native Thai word::

        is_native_thai("สามารถ")
        # output: False
        is_native_thai("อิสริยาภรณ์")
        # output: False
    """
    if not isinstance(word, str):
        return False

    word = word.strip()
    if not word:
        return False

    # Known native Thai words (exceptions) win over every other rule
    if word in _TH_NATIVE_WORDS:
        return True

    # Any character from the non-native set rules the word out
    if not _TH_NON_NATIVE_CHARS.isdisjoint(word):
        return False

    # Without any Thai consonant it cannot be a Thai word
    consonants = _TH_CONSONANTS_PATTERN.findall(word)
    if not consonants:
        return False

    # A single Thai consonant -> it can be a native Thai word
    if len(consonants) == 1:
        return True

    # A word ending with a native final can be a native Thai word
    if word[-1] in _TH_NATIVE_FINALS:
        return True

    # Note: this compares the whole word, not its prefix, so it only
    # accepts the bare prefixes themselves. Prefix-sensitive tokenization
    # would be required to check real prefixed words.
    return word in _TH_PREFIX_DIPHTHONG
_NIGHIT_CHAR = "\u0e4d"  # NIKHAHIT (ํ)
_MAI_HAN_AKAT = "\u0e31"  # MAI HAN-AKAT (ั), the vowel written in place of nighit

# Consonant groups and the nasal that replaces the nighit before each group.
# Every consonant belongs to exactly one group.
_NIGHIT_NASAL_GROUPS = (
    ("กขคฆง", "ง"),
    ("จฉชฌญ", "ญ"),
    ("ฎฏฐฑฒณ", "ณ"),
    ("ดตถทธน", "น"),
    ("ปผพภม", "ม"),
    ("ยรลฬวศษสห", "ง"),
)
_NIGHIT_NASAL_BY_CONSONANT = {
    consonant: nasal
    for group, nasal in _NIGHIT_NASAL_GROUPS
    for consonant in group
}


def nighit(w1: str, w2: str) -> str:
    """
    Join two Pali-derived words using the nighit (นิคหิต or ํ) rule.

    Nighit is the niggahita sign used in Thai for words of Pali origin.
    This function uses a simple method: the nighit of the first word is
    replaced by mai han-akat (ั) followed by a nasal consonant chosen from
    the first Thai consonant of the second word, then the second word is
    appended unchanged.

    Read more: https://www.trueplookpanya.com/learning/detail/1180

    :param str w1: A two-character Thai word: one character followed by \
        a nighit (e.g. "สํ").
    :param str w2: A Thai word.
    :return: The combined Thai word.
    :rtype: str
    :raises NotImplementedError: if ``w1`` is not exactly one character \
        followed by a nighit, or if ``w2`` has no Thai consonant or its \
        first consonant has no nasal defined by the rule.

    :Example:
    ::

        from pythainlp.morpheme import nighit

        assert nighit("สํ","คีต")=="สังคีต"
        assert nighit("สํ","จร")=="สัญจร"
        assert nighit("สํ","ชาติ")=="สัญชาติ"
        assert nighit("สํ","ฐาน")=="สัณฐาน"
        assert nighit("สํ","นิษฐาน")=="สันนิษฐาน"
        assert nighit("สํ","ปทา")=="สัมปทา"
        assert nighit("สํ","โยค")=="สังโยค"
    """
    # Both conditions must hold: exactly two characters AND a trailing
    # nighit. Checked first so that bad input never reaches the lookup.
    if len(w1) != 2 or not str(w1).endswith(_NIGHIT_CHAR):
        raise NotImplementedError(f"The function doesn't support {w1}.")

    consonants = set(thai_consonants)
    # First Thai consonant of w2; leading vowels such as "โ" are skipped.
    first_consonant = next((ch for ch in w2 if ch in consonants), None)

    nasal = _NIGHIT_NASAL_BY_CONSONANT.get(first_consonant)
    if nasal is None:
        # Covers both "no consonant in w2" and "consonant without a rule".
        raise NotImplementedError(
            f"The function doesn't support {w1} and {w2}."
        )

    return "".join((w1[0], _MAI_HAN_AKAT, nasal, w2))
import warnings


def is_native_thai(word: str) -> bool:
    """
    Deprecated alias of :func:`pythainlp.morpheme.is_native_thai`.

    Emits a :class:`DeprecationWarning` attributed to the caller, then
    delegates to the implementation in ``pythainlp.morpheme``.

    :param str word: word
    :return: result of ``pythainlp.morpheme.is_native_thai(word)``
    :rtype: bool
    """
    warnings.warn(
        "pythainlp.util.is_native_thai is renamed to "
        "pythainlp.morpheme.is_native_thai. "
        "This function will be removed in PyThaiNLP 5.1.",
        DeprecationWarning,
        # Point the warning at the code calling this wrapper,
        # not at this line.
        stacklevel=2,
    )
    # Kept as a function-scope import, as in the original.
    # NOTE(review): presumably this avoids a circular import between
    # pythainlp.util and pythainlp.morpheme — confirm.
    from pythainlp.morpheme import is_native_thai as check

    return check(word)
class TestMorphemePackage(unittest.TestCase):
    """Tests for the public functions of ``pythainlp.morpheme``."""

    def test_nighit(self):
        """nighit() joins "สํ" with each second word as expected."""
        expected_by_second_word = (
            ("คีต", "สังคีต"),
            ("จร", "สัญจร"),
            ("ฐาน", "สัณฐาน"),
            ("นิษฐาน", "สันนิษฐาน"),
            ("ปทา", "สัมปทา"),
            ("โยค", "สังโยค"),
        )
        for second_word, expected in expected_by_second_word:
            self.assertEqual(nighit("สํ", second_word), expected)

    def test_is_native_thai(self):
        """is_native_thai() returns the expected verdict for each input."""
        verdict_by_input = (
            (None, False),
            ("", False),
            ("116", False),
            ("abc", False),
            ("ตา", True),
            ("ยา", True),
            ("ฆ่า", True),
            ("คน", True),
            ("กะ", True),
            ("มอ", True),
            ("กะ", True),
            ("กระ", True),
            ("ประท้วง", True),
            ("ศา", False),
            ("ลักษ์", False),
            ("มาร์ค", False),
            ("เลข", False),
            ("เทเวศน์", False),
            ("เทเวศร์", False),
        )
        for candidate, verdict in verdict_by_input:
            self.assertEqual(is_native_thai(candidate), verdict)