diff --git a/docs/api/translate.rst b/docs/api/translate.rst index 29bb132d6..89b4537ac 100644 --- a/docs/api/translate.rst +++ b/docs/api/translate.rst @@ -12,3 +12,9 @@ Modules :members: translate .. autoclass:: ThEnTranslator :members: translate +.. autoclass:: ThZhTranslator + :members: translate +.. autoclass:: ZhThTranslator + :members: translate +.. autoclass:: Translate + :members: diff --git a/pythainlp/translate/__init__.py b/pythainlp/translate/__init__.py index 8664f22ec..93a473277 100644 --- a/pythainlp/translate/__init__.py +++ b/pythainlp/translate/__init__.py @@ -6,11 +6,20 @@ __all__ = [ "EnThTranslator", "ThEnTranslator", - "download_model_all" + "download_model_all", + "ThZhTranslator", + "ZhThTranslator", + "Translate" ] -from pythainlp.translate.core import ( +from pythainlp.translate.core import Translate + +from pythainlp.translate.en_th import ( EnThTranslator, ThEnTranslator, download_model_all, ) +from pythainlp.translate.zh_th import ( + ThZhTranslator, + ZhThTranslator, +) diff --git a/pythainlp/translate/core.py b/pythainlp/translate/core.py index 72fc9a6a4..48702ee43 100644 --- a/pythainlp/translate/core.py +++ b/pythainlp/translate/core.py @@ -1,114 +1,75 @@ # -*- coding: utf-8 -*- -import os -import tarfile -from collections import defaultdict -from pythainlp.corpus import download, get_corpus_path -from pythainlp.tools import get_full_data_path, get_pythainlp_data_path -from fairseq.models.transformer import TransformerModel -from sacremoses import MosesTokenizer +class Translate: + """ + Machine Translation + :param str src_lang: source language + :param str target_lang: target language -_EN_TH_MODEL_NAME = "scb_1m_en-th_moses" -# SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz -_EN_TH_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0" + **Options for source & target language** + * *th* - *en* - Thai to English + * *en* - *th* - English to Thai + * *th* - *zh* - Thai to Chinese + * *zh* - *th* - 
Chinese to Thai -_TH_EN_MODEL_NAME = "scb_1m_th-en_spm" -# SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz -_TH_EN_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0" + :Example: + Translate text from Thai to English:: -def _get_translate_path(model: str, *path: str) -> str: - return os.path.join(get_full_data_path(model), *path) + from pythainlp.translate import Translate + th2en = Translate('th', 'en') + th2en.translate("ฉันรักแมว") + # output: I love cat. + """ + def __init__(self, src_lang: str, target_lang: str) -> None: + """ + :param str src_lang: source language + :param str target_lang: target language -def _download_install(name: str) -> None: - if get_corpus_path(name) is None: - download(name, force=True, version="1.0") - tar = tarfile.open(get_corpus_path(name), "r:gz") - tar.extractall() - tar.close() - if not os.path.exists(get_full_data_path(name)): - os.mkdir(get_full_data_path(name)) - with tarfile.open(get_corpus_path(name)) as tar: - tar.extractall(path=get_full_data_path(name)) + **Options for source & target language** + * *th* - *en* - Thai to English + * *en* - *th* - English to Thai + * *th* - *zh* - Thai to Chinese + * *zh* - *th* - Chinese to Thai + :Example: -def download_model_all() -> None: - """ - Download all translation models in advanced - """ - _download_install(_EN_TH_MODEL_NAME) - _download_install(_TH_EN_MODEL_NAME) - - -class EnThTranslator: - def __init__(self): - self._tokenizer = MosesTokenizer("en") - - self._model_name = _EN_TH_MODEL_NAME - - _download_install(self._model_name) - self._model = TransformerModel.from_pretrained( - model_name_or_path=_get_translate_path( - self._model_name, - _EN_TH_FILE_NAME, - "models", - ), - checkpoint_file="checkpoint.pt", - data_name_or_path=_get_translate_path( - self._model_name, - _EN_TH_FILE_NAME, - "vocab", - ), - ) - - def translate(self, text: str) -> str: - """ - Translate text from English to Thai + Translate text from Thai to English:: - :param str 
text: input text in source language - :return: translated text in target language - :rtype: str + from pythainlp.translate import Translate + th2en = Translate('th', 'en') + + th2en.translate("ฉันรักแมว") + # output: I love cat. """ - tokens = " ".join(self._tokenizer.tokenize(text)) - translated = self._model.translate(tokens) - return translated.replace(" ", "").replace("▁", " ").strip() - - -class ThEnTranslator: - def __init__(self): - self._model_name = _TH_EN_MODEL_NAME - - _download_install(self._model_name) - self._model = TransformerModel.from_pretrained( - model_name_or_path=_get_translate_path( - self._model_name, - _TH_EN_FILE_NAME, - "models", - ), - checkpoint_file="checkpoint.pt", - data_name_or_path=_get_translate_path( - self._model_name, - _TH_EN_FILE_NAME, - "vocab", - ), - bpe="sentencepiece", - sentencepiece_model=_get_translate_path( - self._model_name, - _TH_EN_FILE_NAME, - "bpe", - "spm.th.model", - ), - ) - - def translate(self, text: str) -> str: + self.model = None + self.load_model(src_lang, target_lang) + + def load_model(self, src_lang: str, target_lang: str): + if src_lang == "th" and target_lang == "en": + from pythainlp.translate.en_th import ThEnTranslator + self.model = ThEnTranslator() + elif src_lang == "en" and target_lang == "th": + from pythainlp.translate.en_th import EnThTranslator + self.model = EnThTranslator() + elif src_lang == "th" and target_lang == "zh": + from pythainlp.translate.zh_th import ThZhTranslator + self.model = ThZhTranslator() + elif src_lang == "zh" and target_lang == "th": + from pythainlp.translate.zh_th import ZhThTranslator + self.model = ZhThTranslator() + else: + raise ValueError("Not support language!") + + def translate(self, text) -> str: """ - Translate text from Thai to English + Translate text :param str text: input text in source language :return: translated text in target language :rtype: str """ - return self._model.translate(text) + return self.model.translate(text) diff --git 
a/pythainlp/translate/en_th.py b/pythainlp/translate/en_th.py new file mode 100644 index 000000000..38492a980 --- /dev/null +++ b/pythainlp/translate/en_th.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +""" +English-Thai Machine Translation + +from VISTEC-depa Thailand Artificial Intelligence Research Institute + +Website: https://airesearch.in.th/releases/machine-translation-models/ +""" +import os +import tarfile +from collections import defaultdict + +from pythainlp.corpus import download, get_corpus_path +from pythainlp.tools import get_full_data_path, get_pythainlp_data_path + +from fairseq.models.transformer import TransformerModel +from sacremoses import MosesTokenizer + + +_EN_TH_MODEL_NAME = "scb_1m_en-th_moses" +# SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz +_EN_TH_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0" + +_TH_EN_MODEL_NAME = "scb_1m_th-en_spm" +# SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz +_TH_EN_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0" + + +def _get_translate_path(model: str, *path: str) -> str: + return os.path.join(get_full_data_path(model), *path) + + +def _download_install(name: str) -> None: + if get_corpus_path(name) is None: + download(name, force=True, version="1.0") + tar = tarfile.open(get_corpus_path(name), "r:gz") + tar.extractall() + tar.close() + if not os.path.exists(get_full_data_path(name)): + os.mkdir(get_full_data_path(name)) + with tarfile.open(get_corpus_path(name)) as tar: + tar.extractall(path=get_full_data_path(name)) + + +def download_model_all() -> None: + """ + Download all translation models in advance + """ + _download_install(_EN_TH_MODEL_NAME) + _download_install(_TH_EN_MODEL_NAME) + + +class EnThTranslator: + def __init__(self): + self._tokenizer = MosesTokenizer("en") + + self._model_name = _EN_TH_MODEL_NAME + + _download_install(self._model_name) + self._model = TransformerModel.from_pretrained( + 
model_name_or_path=_get_translate_path( + self._model_name, + _EN_TH_FILE_NAME, + "models", + ), + checkpoint_file="checkpoint.pt", + data_name_or_path=_get_translate_path( + self._model_name, + _EN_TH_FILE_NAME, + "vocab", + ), + ) + + def translate(self, text: str) -> str: + """ + Translate text from English to Thai + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from English to Thai:: + + from pythainlp.translate import EnThTranslator + + enth = EnThTranslator() + + enth.translate("I love cat.") + # output: ฉันรักแมว + + """ + tokens = " ".join(self._tokenizer.tokenize(text)) + translated = self._model.translate(tokens) + return translated.replace(" ", "").replace("▁", " ").strip() + + +class ThEnTranslator: + def __init__(self): + self._model_name = _TH_EN_MODEL_NAME + + _download_install(self._model_name) + self._model = TransformerModel.from_pretrained( + model_name_or_path=_get_translate_path( + self._model_name, + _TH_EN_FILE_NAME, + "models", + ), + checkpoint_file="checkpoint.pt", + data_name_or_path=_get_translate_path( + self._model_name, + _TH_EN_FILE_NAME, + "vocab", + ), + bpe="sentencepiece", + sentencepiece_model=_get_translate_path( + self._model_name, + _TH_EN_FILE_NAME, + "bpe", + "spm.th.model", + ), + ) + + def translate(self, text: str) -> str: + """ + Translate text from Thai to English + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from Thai to English:: + + from pythainlp.translate import ThEnTranslator + + then = ThEnTranslator() + + then.translate("ฉันรักแมว") + # output: I love cat. 
+ + """ + return self._model.translate(text) diff --git a/pythainlp/translate/zh_th.py b/pythainlp/translate/zh_th.py new file mode 100644 index 000000000..fd100a57d --- /dev/null +++ b/pythainlp/translate/zh_th.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +""" +Lalita Chinese-Thai Machine Translation + +from Ai builder + +- GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth +- Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822 +""" +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + + +class ThZhTranslator: + def __init__(self, pretrained: str = "Lalita/marianmt-th-zh_cn") -> None: + self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained) + self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + + def translate(self, text: str) -> str: + """ + Translate text from Thai to Chinese + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from Thai to Chinese:: + + from pythainlp.translate import ThZhTranslator + + thzh = ThZhTranslator() + + thzh.translate("ผมรักคุณ") + # output: 我爱你 + + """ + self.translated = self.model_thzh.generate( + **self.tokenizer_thzh(text, return_tensors="pt", padding=True) + ) + return [ + self.tokenizer_thzh.decode( + t, skip_special_tokens=True + ) for t in self.translated + ][0] + + +class ZhThTranslator: + def __init__(self, pretrained: str = "Lalita/marianmt-zh_cn-th") -> None: + self.tokenizer_zhth = AutoTokenizer.from_pretrained(pretrained) + self.model_zhth = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + + def translate(self, text: str) -> str: + """ + Translate text from Chinese to Thai + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from Chinese to Thai:: + + from pythainlp.translate import ZhThTranslator + + zhth = ZhThTranslator() + + zhth.translate("我爱你") + # output: ผมรักคุณนะ + 
+ """ + self.translated = self.model_zhth.generate( + **self.tokenizer_zhth(text, return_tensors="pt", padding=True) + ) + return [ + self.tokenizer_zhth.decode( + t, skip_special_tokens=True + ) for t in self.translated + ][0] diff --git a/setup.py b/setup.py index 0e05f0387..2d2e98302 100644 --- a/setup.py +++ b/setup.py @@ -54,9 +54,10 @@ "sacremoses>=0.0.41", "sentencepiece>=0.1.91", "torch>=1.0.0", + "transformers>=4.6.0", ], "wangchanberta": ["transformers", "sentencepiece"], - "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"], + "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], "sefr_cut": ["sefr_cut"], "full": [ @@ -74,7 +75,7 @@ "sentencepiece>=0.1.91", "ssg>=0.0.6", "torch>=1.0.0", - "transformers>=4.1.1", + "transformers>=4.6.0", "sefr_cut" ], } diff --git a/tests/test_translate.py b/tests/test_translate.py index f0df58a2a..7da13e7f5 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -2,8 +2,14 @@ import unittest -from pythainlp.translate import EnThTranslator, ThEnTranslator -from pythainlp.translate.core import download_model_all +from pythainlp.translate import ( + EnThTranslator, + ThEnTranslator, + ThZhTranslator, + ZhThTranslator, + download_model_all, + Translate +) class TestTranslatePackage(unittest.TestCase): @@ -21,3 +27,41 @@ def test_translate(self): "the cat eats fish.", ) ) + self.th_zh_translator = ThZhTranslator() + self.assertIsNotNone( + self.th_zh_translator.translate( + "ผมรักคุณ", + ) + ) + self.zh_th_translator = ZhThTranslator() + self.assertIsNotNone( + self.zh_th_translator.translate( + "我爱你", + ) + ) + self.th_en_translator = Translate('th', 'en') + self.assertIsNotNone( + self.th_en_translator.translate( + "แมวกินปลา", + ) + ) + self.en_th_translator = Translate('en', 'th') + self.assertIsNotNone( + self.en_th_translator.translate( + "the cat eats fish.", + ) + ) + self.th_zh_translator = Translate('th', 'zh') + self.assertIsNotNone( + self.th_zh_translator.translate( 
+ "ผมรักคุณ", + ) + ) + self.zh_th_translator = Translate('zh', 'th') + self.assertIsNotNone( + self.zh_th_translator.translate( + "我爱你", + ) + ) + with self.assertRaises(ValueError): + self.th_cat_translator = Translate('th', 'cat')