# -*- coding: utf-8 -*-
"""
Thai-French Machine Translation

Trained by OPUS Corpus

Model from Language Technology Research Group at the University of Helsinki

BLEU 20.4

- Huggingface https://huggingface.co/Helsinki-NLP/opus-mt-th-fr
"""
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 | 15 | + | 
 | 16 | +class ThFrTranslator:  | 
 | 17 | +    """  | 
 | 18 | +    Thai-French Machine Translation  | 
 | 19 | +
  | 
 | 20 | +    Trained by OPUS Corpus  | 
 | 21 | +
  | 
 | 22 | +    Model from Language Technology Research Group at the University of Helsinki  | 
 | 23 | +
  | 
 | 24 | +    BLEU 20.4  | 
 | 25 | +
  | 
 | 26 | +    - Huggingface https://huggingface.co/Helsinki-NLP/opus-mt-th-fr  | 
 | 27 | +    """  | 
 | 28 | +    def __init__(self, pretrained: str = "Helsinki-NLP/opus-mt-th-fr") -> None:  | 
 | 29 | +        self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained)  | 
 | 30 | +        self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained)  | 
 | 31 | + | 
 | 32 | +    def translate(self, text: str) -> str:  | 
 | 33 | +        """  | 
 | 34 | +        Translate text from Thai to French  | 
 | 35 | +
  | 
 | 36 | +        :param str text: input text in source language  | 
 | 37 | +        :return: translated text in target language  | 
 | 38 | +        :rtype: str  | 
 | 39 | +
  | 
 | 40 | +        :Example:  | 
 | 41 | +
  | 
 | 42 | +        Translate text from Thai to French::  | 
 | 43 | +
  | 
 | 44 | +            from pythainlp.translate.th_fr import ThFrTranslator  | 
 | 45 | +
  | 
 | 46 | +            thfr = ThFrTranslator()  | 
 | 47 | +
  | 
 | 48 | +            thfr.translate("ทดสอบระบบ")  | 
 | 49 | +            # output: "Test du système."  | 
 | 50 | +
  | 
 | 51 | +        """  | 
 | 52 | +        self.translated = self.model_thzh.generate(  | 
 | 53 | +            **self.tokenizer_thzh(text, return_tensors="pt", padding=True)  | 
 | 54 | +        )  | 
 | 55 | +        return [  | 
 | 56 | +            self.tokenizer_thzh.decode(  | 
 | 57 | +                t, skip_special_tokens=True  | 
 | 58 | +            ) for t in self.translated  | 
 | 59 | +        ][0]  | 