25 changes: 25 additions & 0 deletions docs/api/augment.rst
@@ -0,0 +1,25 @@
.. currentmodule:: pythainlp.augment

pythainlp.augment
=================

The :mod:`pythainlp.augment` module provides Thai text augmentation: classes that generate new variants of a Thai sentence by replacing words with similar ones.

Modules
-------

.. autoclass:: WordNetAug
:members:
.. autofunction:: postype2wordnet
.. autoclass:: pythainlp.augment.word2vec.Word2VecAug
:members:
.. autoclass:: pythainlp.augment.word2vec.Thai2fitAug
:members:
.. autoclass:: pythainlp.augment.word2vec.LTW2VAug
:members:
.. autoclass:: pythainlp.augment.lm.FastTextAug
:members:
.. autoclass:: pythainlp.augment.lm.Thai2transformersAug
:members:
.. autoclass:: pythainlp.augment.word2vec.bpemb_wv.BPEmbAug
:members:
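
Example
-------

A minimal usage sketch, assuming BPEmb downloads its Thai subword
embeddings on first use (the sample sentence is arbitrary)::

    from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug

    aug = BPEmbAug(lang="th")
    aug.augment("ผมเรียนภาษาไทย", n_sent=2, p=0.5)
    # up to two augmented sentences, returned as strings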
8 changes: 8 additions & 0 deletions pythainlp/augment/__init__.py
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
"""
Thai text augmentation
"""

__all__ = ["WordNetAug"]

from pythainlp.augment.wordnet import WordNetAug
12 changes: 12 additions & 0 deletions pythainlp/augment/lm/__init__.py
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
"""
Language model-based text augmentation
"""

__all__ = [
"FastTextAug",
"Thai2transformersAug",
]

from pythainlp.augment.lm.fasttext import FastTextAug
from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
77 changes: 77 additions & 0 deletions pythainlp/augment/lm/fasttext.py
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple
from gensim.models.fasttext import FastText as FastText_gensim
from gensim.models.fasttext import load_facebook_vectors
from gensim.models.keyedvectors import KeyedVectors
from pythainlp.tokenize import word_tokenize
import itertools


class FastTextAug:
    """
    Text augmentation using FastText word vectors
    """
    def __init__(self, model_path: str):
        """
        :param str model_path: path of the model file
        """
        if model_path.endswith('.bin'):
            # Facebook-format binary; load_facebook_vectors is a
            # module-level function in gensim.models.fasttext.
            self.model = load_facebook_vectors(model_path)
        elif model_path.endswith('.vec'):
            self.model = KeyedVectors.load_word2vec_format(model_path)
        else:
            # A model saved with gensim's FastText.save(); keep only
            # its word vectors for lookup and similarity queries.
            self.model = FastText_gensim.load(model_path).wv
        self.dict_wv = list(self.model.key_to_index.keys())

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize Thai text for FastText

        :param str text: Thai text

        :return: list of words
        :rtype: List[str]
        """
        return word_tokenize(text, engine='icu')

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        :param List[str] sent: list of words in the sentence
        :param float p: minimum similarity score for a replacement

        :rtype: List[List[str]]
        """
        list_sent_new = []
        for word in sent:
            if word in self.dict_wv:
                # Keep only neighbors whose similarity is at least p;
                # fall back to the original word if none qualify.
                w = [
                    w_ for w_, v in self.model.most_similar(word) if v >= p
                ]
                list_sent_new.append(w if w else [word])
            else:
                list_sent_new.append([word])
        return list_sent_new

    def augment(
        self, sentence: str, n_sent: int = 1, p: float = 0.7
    ) -> List[Tuple[str, ...]]:
        """
        Text augmentation from FastText

        You can download a Thai model from
        https://fasttext.cc/docs/en/crawl-vectors.html.

        :param str sentence: Thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: minimum similarity score for a replacement

        :return: list of augmented sentences, as tuples of words
        :rtype: List[Tuple[str, ...]]
        """
        self.sentence = self.tokenize(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # Take the first n_sent combinations of candidate replacements.
        return list(
            itertools.islice(itertools.product(*self.list_synonym), n_sent)
        )
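A minimal usage sketch for FastTextAug, assuming a Thai FastText model has already been downloaded to a local path (the path below is hypothetical; cc.th.300.bin is the filename Facebook distributes):

from pythainlp.augment.lm import FastTextAug

# Hypothetical path to a downloaded Facebook FastText binary.
aug = FastTextAug("/path/to/cc.th.300.bin")
# Replace words only with neighbors whose similarity is >= 0.7
# and keep at most two of the generated combinations.
aug.augment("ผมเรียนภาษาไทย", n_sent=2, p=0.7)
# returns a list of token tuples, one tuple per augmented sentence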
79 changes: 79 additions & 0 deletions pythainlp/augment/lm/wangchanberta.py
@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-

# transformers
from transformers import (
CamembertTokenizer,
pipeline,
)
import random
from typing import List

model_name = "airesearch/wangchanberta-base-att-spm-uncased"


class Thai2transformersAug:
def __init__(self):
self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
self.target_tokenizer = CamembertTokenizer
self.tokenizer = CamembertTokenizer.from_pretrained(
self.model_name,
revision='main')
self.tokenizer.additional_special_tokens = [
'<s>NOTUSED',
'</s>NOTUSED',
'<_>'
]
self.fill_mask = pipeline(
task='fill-mask',
tokenizer=self.tokenizer,
model=f'{self.model_name}',
revision='main'
)

    def generate(self, sentence: str, num_replace_tokens: int = 3):
        """
        Mask random tokens in the sentence, one at a time, and collect
        the fill-mask pipeline's predictions.
        """
        self.sent2 = []
        self.input_text = sentence
        sent = [
            i for i in self.tokenizer.tokenize(self.input_text) if i != '▁'
        ]
        if len(sent) < num_replace_tokens:
            num_replace_tokens = len(sent)
        masked_text = self.input_text
        for i in range(num_replace_tokens):
            # Pick a random token and mask its first occurrence.
            replace_token = sent.pop(random.randrange(len(sent)))
            masked_text = masked_text.replace(
                replace_token, f"{self.fill_mask.tokenizer.mask_token}", 1
            )
            self.sent2 += [
                str(j['sequence']).replace('<s> ', '').replace('</s>', '')
                for j in self.fill_mask(masked_text + '<pad>')
                if j['sequence'] not in self.sent2
            ]
            masked_text = self.input_text
        return self.sent2

    def augment(
        self, sentence: str, num_replace_tokens: int = 3
    ) -> List[str]:
        """
        Text augmentation from WangchanBERTa

        :param str sentence: Thai sentence
        :param int num_replace_tokens: number of tokens to replace

        :return: list of augmented sentences
        :rtype: List[str]
        """
        self.sent2 = []
        try:
            self.sent2 = self.generate(sentence, num_replace_tokens)
            if self.sent2 == []:
                # Retry once if the first pass produced nothing.
                self.sent2 = self.generate(sentence, num_replace_tokens)
            return self.sent2
        except Exception:
            # Return whatever was generated before the failure.
            return self.sent2
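A minimal usage sketch for Thai2transformersAug; constructing it pulls the airesearch/wangchanberta-base-att-spm-uncased checkpoint from the Hugging Face hub, so the first run needs network access:

from pythainlp.augment.lm import Thai2transformersAug

aug = Thai2transformersAug()  # downloads the checkpoint on first use
# Mask up to three random tokens, one at a time, and collect the
# fill-mask pipeline's predictions as augmented sentences.
aug.augment("ผมเรียนภาษาไทย", num_replace_tokens=3)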
14 changes: 14 additions & 0 deletions pythainlp/augment/word2vec/__init__.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
"""
Word2vec-based text augmentation
"""

__all__ = [
"Word2VecAug",
"Thai2fitAug",
"LTW2VAug"
]

from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.augment.word2vec.thai2fit import Thai2fitAug
from pythainlp.augment.word2vec.ltw2v import LTW2VAug
55 changes: 55 additions & 0 deletions pythainlp/augment/word2vec/bpemb_wv.py
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
from pythainlp.augment.word2vec.core import Word2VecAug
from bpemb import BPEmb
from typing import List


class BPEmbAug:
"""
Thai Text Augment using word2vec from BPEmb

BPEmb:
`github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
"""
def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
self.model = self.bpemb_temp.emb
self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        :param str text: Thai text

        :return: list of subword tokens
        :rtype: List[str]
        """
        return self.bpemb_temp.encode(text)

def load_w2v(self):
"""
Load BPEmb model
"""
self.aug = Word2VecAug(
self.model, tokenize=self.tokenizer, type="model"
)

    def augment(
        self, sentence: str, n_sent: int = 1, p: float = 0.7
    ) -> List[str]:
        """
        Text augmentation using word2vec from BPEmb

        :param str sentence: Thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: minimum similarity score for a replacement

        :return: list of augmented sentences
        :rtype: List[str]
        """
        self.sentence = sentence.replace(" ", "▁")
        self.temp = self.aug.augment(self.sentence, n_sent, p=p)
        self.temp_new = []
        for i in self.temp:
            # Join the subword tokens back into a plain string.
            self.t = ""
            for j in i:
                self.t += j.replace('▁', '')
            self.temp_new.append(self.t)
        return self.temp_new
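A short sketch of the subword round-trip this class performs; BPEmb fetches its Thai model on first use, and the sentence is an arbitrary example:

from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug

aug = BPEmbAug(lang="th", vs=100000, dim=300)
# Text is split into BPE subword pieces for the word2vec lookup...
tokens = aug.tokenizer("ผมเรียนภาษาไทย")
# ...and augment() strips the "▁" markers when it joins the pieces
# of each generated variant back into a plain string.
aug.augment("ผมเรียนภาษาไทย", n_sent=1, p=0.5)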
66 changes: 66 additions & 0 deletions pythainlp/augment/word2vec/core.py
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple
import gensim.models.keyedvectors as word2vec
import itertools


class Word2VecAug:
    def __init__(
        self, model: str, tokenize: object, type: str = "file"
    ) -> None:
        """
        :param str model: path to the model file, or a loaded
            KeyedVectors object if type is "model"
        :param object tokenize: tokenize function
        :param str type: model type ("file", "binary", or "model")
        """
        self.tokenizer = tokenize
        if type == "file":
            self.model = word2vec.KeyedVectors.load_word2vec_format(model)
        elif type == "binary":
            self.model = word2vec.KeyedVectors.load_word2vec_format(
                model, binary=True, unicode_errors='ignore'
            )
        else:
            self.model = model
        self.dict_wv = list(self.model.key_to_index.keys())

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        :param List[str] sent: list of words in the sentence
        :param float p: minimum similarity score for a replacement

        :rtype: List[List[str]]
        """
        list_sent_new = []
        for word in sent:
            if word in self.dict_wv:
                # Keep only neighbors whose similarity is at least p;
                # fall back to the original word if none qualify.
                w = [
                    w_ for w_, v in self.model.most_similar(word) if v >= p
                ]
                list_sent_new.append(w if w else [word])
            else:
                list_sent_new.append([word])
        return list_sent_new

    def augment(
        self,
        sentence: str,
        n_sent: int = 1,
        p: float = 0.7
    ) -> List[Tuple[str, ...]]:
        """
        :param str sentence: text sentence
        :param int n_sent: maximum number of augmented sentences
        :param float p: minimum similarity score for a replacement

        :return: list of augmented sentences, as tuples of words
        :rtype: List[Tuple[str, ...]]
        """
        self.sentence = self.tokenizer(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # Take the first n_sent combinations of candidate replacements.
        return list(
            itertools.islice(itertools.product(*self.list_synonym), n_sent)
        )
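A sketch of how the wrapper classes in this PR wire up Word2VecAug: pass a tokenize function plus either a model path or an already-loaded KeyedVectors object (the path below is hypothetical):

from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text, engine="newmm")

# type="file" expects word2vec text format, type="binary" the binary
# format, and type="model" a preloaded KeyedVectors object.
aug = Word2VecAug("/path/to/vectors.vec", tokenize=tokenize, type="file")
aug.augment("ผมเรียนภาษาไทย", n_sent=2, p=0.7)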
48 changes: 48 additions & 0 deletions pythainlp/augment/word2vec/ltw2v.py
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.corpus import get_corpus_path
from pythainlp.tokenize import word_tokenize
from typing import List, Tuple


class LTW2VAug:
"""
Text Augment using word2vec from LTW2V

LTW2V:
`github.com/PyThaiNLP/large-thaiword2vec <https://github.com/PyThaiNLP/large-thaiword2vec>`_
"""
def __init__(self):
self.ltw2v_wv = get_corpus_path('ltw2v')
self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        :param str text: Thai text

        :return: list of words
        :rtype: List[str]
        """
        return word_tokenize(text, engine='newmm')

    def load_w2v(self):
        """
        Load the LTW2V word2vec model
        """
        self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary")

    def augment(
        self,
        sentence: str,
        n_sent: int = 1,
        p: float = 0.7
    ) -> List[Tuple[str, ...]]:
        """
        Text augmentation using word2vec from LTW2V

        :param str sentence: Thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: minimum similarity score for a replacement

        :return: list of augmented sentences
        :rtype: List[Tuple[str, ...]]
        """
        return self.aug.augment(sentence, n_sent, p)
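A minimal usage sketch for LTW2VAug; the constructor resolves the 'ltw2v' corpus through PyThaiNLP's corpus downloader, so the model may be fetched on first use:

from pythainlp.augment.word2vec import LTW2VAug

aug = LTW2VAug()  # loads the LTW2V binary via get_corpus_path('ltw2v')
aug.augment("ผมเรียนภาษาไทย", n_sent=2, p=0.7)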