diff --git a/docs/api/augment.rst b/docs/api/augment.rst
new file mode 100644
index 000000000..220cc21c8
--- /dev/null
+++ b/docs/api/augment.rst
@@ -0,0 +1,25 @@
+.. currentmodule:: pythainlp.augment
+
+pythainlp.augment
+=================
+
+The :mod:`pythainlp.augment` module provides Thai text augmentation tools:
+it generates alternative versions of a Thai text for text augmentation tasks.
+
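+A minimal usage sketch with :class:`WordNetAug` (the generated sentences
+depend on the Thai WordNet data installed on your system):
+
+.. code-block:: python
+
+    from pythainlp.augment import WordNetAug
+
+    aug = WordNetAug()
+    aug.augment("เรารักคุณมากที่สุดในโลก")
+    # -> a list of augmented sentences, each a tuple of words
+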
+Modules
+-------
+
+.. autoclass:: WordNetAug
+ :members:
+.. autofunction:: pythainlp.augment.wordnet.postype2wordnet
+.. autoclass:: pythainlp.augment.word2vec.Word2VecAug
+ :members:
+.. autoclass:: pythainlp.augment.word2vec.Thai2fitAug
+ :members:
+.. autoclass:: pythainlp.augment.word2vec.LTW2VAug
+ :members:
+.. autoclass:: pythainlp.augment.lm.FastTextAug
+ :members:
+.. autoclass:: pythainlp.augment.lm.Thai2transformersAug
+ :members:
+.. autoclass:: pythainlp.augment.word2vec.bpemb_wv.BPEmbAug
+ :members:
\ No newline at end of file
diff --git a/pythainlp/augment/__init__.py b/pythainlp/augment/__init__.py
new file mode 100644
index 000000000..039935081
--- /dev/null
+++ b/pythainlp/augment/__init__.py
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Thai text augmentation
+"""
+
+__all__ = ["WordNetAug"]
+
+from pythainlp.augment.wordnet import WordNetAug
diff --git a/pythainlp/augment/lm/__init__.py b/pythainlp/augment/lm/__init__.py
new file mode 100644
index 000000000..c90a81a32
--- /dev/null
+++ b/pythainlp/augment/lm/__init__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+Language model-based text augmentation
+"""
+
+__all__ = [
+ "FastTextAug",
+ "Thai2transformersAug",
+]
+
+from pythainlp.augment.lm.fasttext import FastTextAug
+from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
diff --git a/pythainlp/augment/lm/fasttext.py b/pythainlp/augment/lm/fasttext.py
new file mode 100644
index 000000000..cc1379e57
--- /dev/null
+++ b/pythainlp/augment/lm/fasttext.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+from typing import List, Tuple
+from gensim.models.fasttext import (
+    FastText as FastText_gensim,
+    load_facebook_vectors,
+)
+from pythainlp.tokenize import word_tokenize
+from gensim.models.keyedvectors import KeyedVectors
+import itertools
+
+
+class FastTextAug:
+ """
+ Text Augment from FastText
+ """
+    def __init__(self, model_path: str):
+        """
+        :param str model_path: path of the model file
+            (``.bin``, ``.vec``, or a saved gensim FastText model)
+        """
+        if model_path.endswith('.bin'):
+            # Facebook-format binary models are loaded with the
+            # module-level helper function, not a FastText classmethod.
+            self.model = load_facebook_vectors(model_path)
+        elif model_path.endswith('.vec'):
+            self.model = KeyedVectors.load_word2vec_format(model_path)
+        else:
+            self.model = FastText_gensim.load(model_path)
+            if hasattr(self.model, "wv"):
+                # A full FastText model keeps its vectors under ``wv``.
+                self.model = self.model.wv
+        self.dict_wv = list(self.model.key_to_index.keys())
+
+ def tokenize(self, text: str) -> List[str]:
+ """
+ Thai text tokenize for fasttext
+
+ :param str text: thai text
+
+ :return: list of word
+ :rtype: List[str]
+ """
+ return word_tokenize(text, engine='icu')
+
+    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
+        """
+        :param List[str] sent: list of words in the sentence
+        :param float p: minimum similarity score for candidate words
+        :rtype: List[List[str]]
+        """
+ list_sent_new = []
+ for i in sent:
+ if i in self.dict_wv:
+ w = [
+ j for j, v in self.model.most_similar(i) if v >= p
+ ]
+ if w == []:
+ list_sent_new.append([i])
+ else:
+ list_sent_new.append(w)
+ else:
+ list_sent_new.append([i])
+ return list_sent_new
+
+ def augment(
+ self, sentence: str, n_sent: int = 1, p: float = 0.7
+ ) -> List[Tuple[str]]:
+ """
+ Text Augment from FastText
+
+ You wants to download thai model
+ from https://fasttext.cc/docs/en/crawl-vectors.html.
+
+ :param str sentence: thai sentence
+ :param int n_sent: number sentence
+ :param float p: Probability of word
+
+ :return: list of synonyms
+ :rtype: List[Tuple[str]]
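+
+        :Example:
+
+        A minimal usage sketch, assuming the Thai vectors from the link
+        above have been downloaded and extracted to ``cc.th.300.vec`` in
+        the working directory; the generated sentences will vary with
+        the model:
+        ::
+
+            from pythainlp.augment.lm import FastTextAug
+
+            aug = FastTextAug("cc.th.300.vec")
+            aug.augment("เรารักคุณมากที่สุดในโลก", n_sent=3, p=0.5)
+            # -> up to 3 augmented sentences as tuples of words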
+ """
+ self.sentence = self.tokenize(sentence)
+ self.list_synonym = self.modify_sent(self.sentence, p=p)
+ new_sentences = []
+ for x in list(itertools.product(*self.list_synonym))[0:n_sent]:
+ new_sentences.append(x)
+ return new_sentences
diff --git a/pythainlp/augment/lm/wangchanberta.py b/pythainlp/augment/lm/wangchanberta.py
new file mode 100644
index 000000000..0322f4d16
--- /dev/null
+++ b/pythainlp/augment/lm/wangchanberta.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+
+# transformers
+from transformers import (
+ CamembertTokenizer,
+ pipeline,
+)
+import random
+from typing import List
+
+model_name = "airesearch/wangchanberta-base-att-spm-uncased"
+
+
+class Thai2transformersAug:
+ def __init__(self):
+ self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
+ self.target_tokenizer = CamembertTokenizer
+ self.tokenizer = CamembertTokenizer.from_pretrained(
+ self.model_name,
+ revision='main')
+        self.tokenizer.additional_special_tokens = [
+            '<s>NOTUSED',
+            '</s>NOTUSED',
+            '<_>'
+        ]
+ self.fill_mask = pipeline(
+ task='fill-mask',
+ tokenizer=self.tokenizer,
+ model=f'{self.model_name}',
+ revision='main'
+ )
+
+ def generate(self, sentence: str, num_replace_tokens: int = 3):
+ self.sent2 = []
+ self.input_text = sentence
+ sent = [
+ i for i in self.tokenizer.tokenize(self.input_text) if i != '▁'
+ ]
+ if len(sent) < num_replace_tokens:
+ num_replace_tokens = len(sent)
+ masked_text = self.input_text
+ for i in range(num_replace_tokens):
+            replace_token = sent.pop(random.randrange(len(sent)))
+ masked_text = masked_text.replace(
+ replace_token, f"{self.fill_mask.tokenizer.mask_token}", 1
+ )
+            self.sent2 += [
+                str(j['sequence']).replace('<s> ', '').replace('</s>', '')
+                for j in self.fill_mask(masked_text + '<pad>')
+                if j['sequence'] not in self.sent2
+            ]
+ masked_text = self.input_text
+ return self.sent2
+
+ def augment(
+ self, sentence: str, num_replace_tokens: int = 3
+ ) -> List[str]:
+ """
+ Text Augment from wangchanberta
+
+ :param str sentence: thai sentence
+ :param int num_replace_tokens: number replace tokens
+
+ :return: list of text augment
+ :rtype: List[str]
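+
+        :Example:
+
+        A minimal usage sketch; the WangchanBERTa weights are downloaded
+        from the Hugging Face Hub on first use, and the generated
+        sentences will vary between runs:
+        ::
+
+            from pythainlp.augment.lm import Thai2transformersAug
+
+            aug = Thai2transformersAug()
+            aug.augment("เราอยู่ที่มหาวิทยาลัยขอนแก่น", num_replace_tokens=1)
+            # -> list of sentences with one token replaced by the model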
+ """
+ self.sent2 = []
+        try:
+            self.sent2 = self.generate(sentence, num_replace_tokens)
+            if self.sent2 == []:
+                self.sent2 = self.generate(sentence, num_replace_tokens)
+            return self.sent2
+        except Exception:
+            # Return whatever was generated before the error occurred.
+            return self.sent2
diff --git a/pythainlp/augment/word2vec/__init__.py b/pythainlp/augment/word2vec/__init__.py
new file mode 100644
index 000000000..2d94435b3
--- /dev/null
+++ b/pythainlp/augment/word2vec/__init__.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+"""
+Word2Vec-based text augmentation
+"""
+
+__all__ = [
+ "Word2VecAug",
+ "Thai2fitAug",
+ "LTW2VAug"
+]
+
+from pythainlp.augment.word2vec.core import Word2VecAug
+from pythainlp.augment.word2vec.thai2fit import Thai2fitAug
+from pythainlp.augment.word2vec.ltw2v import LTW2VAug
diff --git a/pythainlp/augment/word2vec/bpemb_wv.py b/pythainlp/augment/word2vec/bpemb_wv.py
new file mode 100644
index 000000000..6c5d44db6
--- /dev/null
+++ b/pythainlp/augment/word2vec/bpemb_wv.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+from pythainlp.augment.word2vec.core import Word2VecAug
+from bpemb import BPEmb
+from typing import List
+
+
+class BPEmbAug:
+ """
+ Thai Text Augment using word2vec from BPEmb
+
+ BPEmb:
+ `github.com/bheinzerling/bpemb `_
+ """
+    def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
+        """
+        :param str lang: language code (default: ``th`` for Thai)
+        :param int vs: vocabulary size of the BPEmb model
+        :param int dim: dimension of the embeddings
+        """
+        self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
+        self.model = self.bpemb_temp.emb
+        self.load_w2v()
+
+ def tokenizer(self, text: str) -> List[str]:
+ """
+        :param str text: Thai text
+ :rtype: List[str]
+ """
+ return self.bpemb_temp.encode(text)
+
+ def load_w2v(self):
+ """
+ Load BPEmb model
+ """
+ self.aug = Word2VecAug(
+ self.model, tokenize=self.tokenizer, type="model"
+ )
+
+    def augment(
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
+    ) -> List[str]:
+        """
+        Text Augment using word2vec from BPEmb
+
+        :param str sentence: Thai sentence
+        :param int n_sent: number of sentences to generate
+        :param float p: minimum similarity score for replacement words
+
+        :return: list of augmented sentences
+        :rtype: List[str]
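+
+        :Example:
+
+        A minimal usage sketch; the BPEmb Thai subword embeddings are
+        downloaded automatically on first use, and the generated
+        sentences will vary with the model:
+        ::
+
+            from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug
+
+            aug = BPEmbAug()
+            aug.augment("เราอยู่ที่มหาวิทยาลัยขอนแก่น", n_sent=3, p=0.5)
+            # -> up to 3 augmented sentences as strings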
+ """
+ self.sentence = sentence.replace(" ", "▁")
+ self.temp = self.aug.augment(self.sentence, n_sent, p=p)
+ self.temp_new = []
+ for i in self.temp:
+ self.t = ""
+ for j in i:
+ self.t += j.replace('▁', '')
+ self.temp_new.append(self.t)
+ return self.temp_new
diff --git a/pythainlp/augment/word2vec/core.py b/pythainlp/augment/word2vec/core.py
new file mode 100644
index 000000000..c98bcd39b
--- /dev/null
+++ b/pythainlp/augment/word2vec/core.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+from typing import List, Tuple
+import gensim.models.keyedvectors as word2vec
+import itertools
+
+
+class Word2VecAug:
+ def __init__(
+ self, model: str, tokenize: object, type: str = "file"
+ ) -> None:
+ """
+ :param str model: path model
+ :param object tokenize: tokenize function
+ :param str type: moodel type (file, binary)
+ """
+ self.tokenizer = tokenize
+ if type == "file":
+ self.model = word2vec.KeyedVectors.load_word2vec_format(model)
+ elif type == "binary":
+ self.model = word2vec.KeyedVectors.load_word2vec_format(
+ model, binary=True, unicode_errors='ignore'
+ )
+ else:
+ self.model = model
+ self.dict_wv = list(self.model.key_to_index.keys())
+
+    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
+        """
+        :param List[str] sent: list of words in the sentence
+        :param float p: minimum similarity score for candidate words
+        :rtype: List[List[str]]
+        """
+ list_sent_new = []
+ for i in sent:
+ if i in self.dict_wv:
+ w = [
+ j for j, v in self.model.most_similar(i) if v >= p
+ ]
+ if w == []:
+ list_sent_new.append([i])
+ else:
+ list_sent_new.append(w)
+ else:
+ list_sent_new.append([i])
+ return list_sent_new
+
+ def augment(
+ self,
+ sentence: str,
+ n_sent: int = 1,
+ p: float = 0.7
+ ) -> List[Tuple[str]]:
+ """
+ :param str sentence: text sentence
+ :param int n_sent: max number for synonyms sentence
+ :param int p: probability
+
+ :return: list of synonyms
+ :rtype: List[Tuple[str]]
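+
+        :Example:
+
+        A minimal sketch; ``thai_w2v.bin`` is a hypothetical word2vec
+        model in binary format (this class is normally used through
+        Thai2fitAug, LTW2VAug, or BPEmbAug):
+        ::
+
+            from pythainlp.augment.word2vec.core import Word2VecAug
+            from pythainlp.tokenize import word_tokenize
+
+            # "thai_w2v.bin" is a placeholder path for your own model
+            aug = Word2VecAug("thai_w2v.bin", word_tokenize, type="binary")
+            aug.augment("เรารักคุณมากที่สุดในโลก", n_sent=2, p=0.5)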
+ """
+ self.sentence = self.tokenizer(sentence)
+ self.list_synonym = self.modify_sent(self.sentence, p=p)
+ new_sentences = []
+ for x in list(itertools.product(*self.list_synonym))[0:n_sent]:
+ new_sentences.append(x)
+ return new_sentences
diff --git a/pythainlp/augment/word2vec/ltw2v.py b/pythainlp/augment/word2vec/ltw2v.py
new file mode 100644
index 000000000..61b2c83db
--- /dev/null
+++ b/pythainlp/augment/word2vec/ltw2v.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+from pythainlp.augment.word2vec.core import Word2VecAug
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tokenize import word_tokenize
+from typing import List, Tuple
+
+
+class LTW2VAug:
+ """
+ Text Augment using word2vec from LTW2V
+
+ LTW2V:
+ `github.com/PyThaiNLP/large-thaiword2vec `_
+ """
+ def __init__(self):
+ self.ltw2v_wv = get_corpus_path('ltw2v')
+ self.load_w2v()
+
+ def tokenizer(self, text: str) -> List[str]:
+ """
+        :param str text: Thai text
+ :rtype: List[str]
+ """
+ return word_tokenize(text, engine='newmm')
+
+ def load_w2v(self): # insert substitute
+ """
+ Load ltw2v word2vec model
+ """
+ self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary")
+
+ def augment(
+ self,
+ sentence: str,
+ n_sent: int = 1,
+ p: float = 0.7
+ ) -> List[Tuple[str]]:
+ """
+ Text Augment using word2vec from Thai2Fit
+
+ :param str sentence: thai sentence
+ :param int n_sent: number sentence
+ :param float p: Probability of word
+
+ :return: list of text augment
+ :rtype: List[Tuple[str]]
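+
+        :Example:
+
+        A minimal usage sketch; assumes the ``ltw2v`` word-vector corpus
+        can be loaded through :mod:`pythainlp.corpus`, and the generated
+        sentences will vary with the model:
+        ::
+
+            from pythainlp.augment.word2vec import LTW2VAug
+
+            aug = LTW2VAug()
+            aug.augment("เรารักคุณมากที่สุดในโลก", n_sent=3, p=0.5)
+            # -> up to 3 augmented sentences as tuples of words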
+ """
+ return self.aug.augment(sentence, n_sent, p)
diff --git a/pythainlp/augment/word2vec/thai2fit.py b/pythainlp/augment/word2vec/thai2fit.py
new file mode 100644
index 000000000..8a553f0f2
--- /dev/null
+++ b/pythainlp/augment/word2vec/thai2fit.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+from pythainlp.augment.word2vec.core import Word2VecAug
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tokenize import THAI2FIT_TOKENIZER
+from typing import List, Tuple
+
+
+class Thai2fitAug:
+ """
+ Text Augment using word2vec from Thai2Fit
+
+ Thai2Fit:
+ `github.com/cstorm125/thai2fit `_
+ """
+ def __init__(self):
+ self.thai2fit_wv = get_corpus_path('thai2fit_wv')
+ self.load_w2v()
+
+ def tokenizer(self, text: str) -> List[str]:
+ """
+        :param str text: Thai text
+ :rtype: List[str]
+ """
+ return THAI2FIT_TOKENIZER.word_tokenize(text)
+
+ def load_w2v(self): # insert substitute
+ """
+ Load thai2fit word2vec model
+ """
+ self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")
+
+ def augment(
+ self,
+ sentence: str,
+ n_sent: int = 1,
+ p: float = 0.7
+ ) -> List[Tuple[str]]:
+ """
+ Text Augment using word2vec from Thai2Fit
+
+ :param str sentence: thai sentence
+ :param int n_sent: number sentence
+ :param float p: Probability of word
+
+ :return: list of text augment
+ :rtype: List[Tuple[str]]
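+
+        :Example:
+
+        A minimal usage sketch; assumes the ``thai2fit_wv`` word-vector
+        corpus is available through :mod:`pythainlp.corpus`, and the
+        generated sentences will vary with the model:
+        ::
+
+            from pythainlp.augment.word2vec import Thai2fitAug
+
+            aug = Thai2fitAug()
+            aug.augment("เรารักคุณมากที่สุดในโลก", n_sent=3, p=0.5)
+            # -> up to 3 augmented sentences as tuples of words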
+ """
+ return self.aug.augment(sentence, n_sent, p)
diff --git a/pythainlp/augment/wordnet.py b/pythainlp/augment/wordnet.py
new file mode 100644
index 000000000..14793a302
--- /dev/null
+++ b/pythainlp/augment/wordnet.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+"""
+Thank https://dev.to/ton_ami/text-data-augmentation-synonym-replacement-4h8l
+"""
+__all__ = [
+ "WordNetAug",
+ "postype2wordnet",
+]
+
+from pythainlp.corpus import wordnet
+from collections import OrderedDict
+from pythainlp.tokenize import word_tokenize
+from pythainlp.tag import pos_tag
+from typing import List, Tuple
+from nltk.corpus import wordnet as wn
+import itertools
+
+lst20 = {
+ "": "",
+ "AJ": wn.ADJ,
+ "AV": wn.ADV,
+ "AX": "",
+ "CC": "",
+ "CL": wn.NOUN,
+ "FX": wn.NOUN,
+ "IJ": "",
+ "NN": wn.NOUN,
+ "NU": "",
+ "PA": "",
+ "PR": "",
+ "PS": "",
+ "PU": "",
+ "VV": wn.VERB,
+ "XX": "",
+}
+
+orchid = {
+ "": "",
+ # NOUN
+ "NOUN": wn.NOUN,
+ "NCMN": wn.NOUN,
+ "NTTL": wn.NOUN,
+ "CNIT": wn.NOUN,
+ "CLTV": wn.NOUN,
+ "CMTR": wn.NOUN,
+ "CFQC": wn.NOUN,
+ "CVBL": wn.NOUN,
+ # VERB
+ "VACT": wn.VERB,
+ "VSTA": wn.VERB,
+ # PROPN
+ "PROPN": "",
+ "NPRP": "",
+ # ADJ
+ "ADJ": wn.ADJ,
+ "NONM": wn.ADJ,
+ "VATT": wn.ADJ,
+ "DONM": wn.ADJ,
+ # ADV
+ "ADV": wn.ADV,
+ "ADVN": wn.ADV,
+ "ADVI": wn.ADV,
+ "ADVP": wn.ADV,
+ "ADVS": wn.ADV,
+ # INT
+ "INT": "",
+ # PRON
+ "PRON": "",
+ "PPRS": "",
+ "PDMN": "",
+ "PNTR": "",
+ # DET
+ "DET": "",
+ "DDAN": "",
+ "DDAC": "",
+ "DDBQ": "",
+ "DDAQ": "",
+ "DIAC": "",
+ "DIBQ": "",
+ "DIAQ": "",
+ # NUM
+ "NUM": "",
+ "NCNM": "",
+ "NLBL": "",
+ "DCNM": "",
+ # AUX
+ "AUX": "",
+ "XVBM": "",
+ "XVAM": "",
+ "XVMM": "",
+ "XVBB": "",
+ "XVAE": "",
+ # ADP
+ "ADP": "",
+ "RPRE": "",
+ # CCONJ
+ "CCONJ": "",
+ "JCRG": "",
+ # SCONJ
+ "SCONJ": "",
+ "PREL": "",
+ "JSBR": "",
+ "JCMP": "",
+ # PART
+ "PART": "",
+ "FIXN": "",
+ "FIXV": "",
+ "EAFF": "",
+ "EITT": "",
+ "AITT": "",
+ "NEG": "",
+ # PUNCT
+ "PUNCT": "",
+ "PUNC": "",
+}
+
+
+def postype2wordnet(pos: str, corpus: str):
+ """
+ convert part-of-speech type to wordnet type
+
+ :param str pos: pos type
+ :param str corpus: part-of-speech corpus
+
+ **Options for corpus**
+ * *lst20* - LST20 Corpus
+ * *orchid* - Orchid Corpus
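+
+    :Example:
+
+    For example, the LST20 tag ``NN`` maps to the WordNet noun type,
+    while an unsupported corpus name returns ``None``:
+    ::
+
+        from pythainlp.augment.wordnet import postype2wordnet
+        from nltk.corpus import wordnet as wn
+
+        postype2wordnet("NN", "lst20") == wn.NOUN  # True
+        postype2wordnet("n", "abc") is None  # unsupported corpus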
+ """
+ if corpus not in ['lst20', 'orchid']:
+ return None
+ if corpus == 'lst20':
+ return lst20[pos]
+ else:
+ return orchid[pos]
+
+
+class WordNetAug:
+    """
+    Text Augment using Thai WordNet
+    """
+    def __init__(self):
+        pass
+
+ def find_synonyms(
+ self,
+ word: str,
+ pos: str = None,
+ postag_corpus: str = "lst20"
+ ) -> List[str]:
+ """
+ Find synonyms from wordnet
+
+ :param str word: word
+ :param str pos: part-of-speech type
+ :param str postag_corpus: postag corpus name
+ :return: list of synonyms
+ :rtype: List[str]
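+
+        :Example:
+
+        A minimal sketch; the synonyms come from the Open Multilingual
+        WordNet data used by :mod:`pythainlp.corpus.wordnet`, so results
+        depend on the installed data:
+        ::
+
+            from pythainlp.augment import WordNetAug
+
+            aug = WordNetAug()
+            aug.find_synonyms("ผม", pos=None)
+            # -> list of Thai lemmas sharing a synset with "ผม"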
+ """
+ self.synonyms = []
+ if pos is None:
+ self.list_synsets = wordnet.synsets(word)
+ else:
+ self.p2w_pos = postype2wordnet(pos, postag_corpus)
+ if self.p2w_pos != '':
+ self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
+ else:
+ self.list_synsets = wordnet.synsets(word)
+
+        for self.synset in self.list_synsets:
+ for self.syn in self.synset.lemma_names(lang='tha'):
+ self.synonyms.append(self.syn)
+
+ self.synonyms_without_duplicates = list(
+ OrderedDict.fromkeys(self.synonyms)
+ )
+ return self.synonyms_without_duplicates
+
+ def augment(
+ self,
+ sentence: str,
+ tokenize: object = word_tokenize,
+ max_syn_sent: int = 6,
+ postag: bool = True,
+ postag_corpus: str = "lst20"
+    ) -> List[Tuple[str]]:
+ """
+ Text Augment using wordnet
+
+ :param str sentence: thai sentence
+        :param object tokenize: tokenizer function
+        :param int max_syn_sent: maximum number of synonym sentences
+        :param bool postag: use part-of-speech tags to filter synonyms
+        :param str postag_corpus: name of the POS tag corpus
+
+ :return: list of synonyms
+ :rtype: List[Tuple[str]]
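+
+        :Example:
+
+        A minimal usage sketch; the output depends on the installed
+        Thai WordNet data and is returned as tuples of words:
+        ::
+
+            from pythainlp.augment import WordNetAug
+
+            aug = WordNetAug()
+            aug.augment("เรารักคุณมากที่สุดในโลก")
+            # -> up to ``max_syn_sent`` augmented sentences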
+ """
+ new_sentences = []
+        self.list_words = tokenize(sentence)
+ self.list_synonym = []
+ self.p_all = 1
+ if postag:
+ self.list_pos = pos_tag(self.list_words, corpus=postag_corpus)
+ for word, pos in self.list_pos:
+ self.temp = self.find_synonyms(word, pos, postag_corpus)
+ if self.temp == []:
+ self.list_synonym.append([word])
+ else:
+ self.list_synonym.append(self.temp)
+ self.p_all *= len(self.temp)
+ else:
+ for word in self.list_words:
+ self.temp = self.find_synonyms(word)
+ if self.temp == []:
+ self.list_synonym.append([word])
+ else:
+ self.list_synonym.append(self.temp)
+ self.p_all *= len(self.temp)
+ if max_syn_sent > self.p_all:
+ max_syn_sent = self.p_all
+ for x in list(itertools.product(*self.list_synonym))[0:max_syn_sent]:
+ new_sentences.append(x)
+ return new_sentences
diff --git a/setup.py b/setup.py
index 2d2e98302..2f172be68 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@
"ipa": ["epitran>=1.1"],
"ml": ["numpy>=1.16", "torch>=1.0.0"],
"ssg": ["ssg>=0.0.6"],
- "thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16.1"],
+ "thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", "numpy>=1.16.1"],
"thai2rom": ["numpy>=1.16.1", "torch>=1.0.0"],
"translate": [
"fairseq>=0.10.0",
@@ -56,6 +56,10 @@
"torch>=1.0.0",
"transformers>=4.6.0",
],
+ "textaugment": [
+ "bpemb",
+ "gensim>=4.0.0"
+ ],
"wangchanberta": ["transformers", "sentencepiece"],
"mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
"wordnet": ["nltk>=3.3.*"],
@@ -66,7 +70,7 @@
"emoji>=0.5.1",
"epitran>=1.1",
"fairseq>=0.10.0",
- "gensim>=3.2.0",
+ "gensim>=4.0.0",
"nltk>=3.3.*",
"numpy>=1.16.1",
"pandas>=0.24",
@@ -75,6 +79,7 @@
"sentencepiece>=0.1.91",
"ssg>=0.0.6",
"torch>=1.0.0",
+ "bpemb",
"transformers>=4.6.0",
"sefr_cut"
],
diff --git a/tests/test_augment.py b/tests/test_augment.py
new file mode 100644
index 000000000..4048f6d46
--- /dev/null
+++ b/tests/test_augment.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+from pythainlp.augment import WordNetAug
+from pythainlp.augment.wordnet import postype2wordnet
+from pythainlp.augment.lm import Thai2transformersAug
+from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug
+from pythainlp.augment.word2vec import (
+ Thai2fitAug,
+ LTW2VAug
+)
+
+
+class TestTextaugmentPackage(unittest.TestCase):
+ def setUp(self):
+ self.text = "เรารักคุณมากที่สุดในโลก"
+ self.text2 = "เราอยู่ที่มหาวิทยาลัยขอนแก่น"
+
+ def test_WordNetAug(self):
+ wordnetaug = WordNetAug()
+ self.assertIsNotNone(wordnetaug.augment(self.text))
+ self.assertIsNotNone(wordnetaug.find_synonyms("ผม", pos=None))
+ self.assertIsNotNone(wordnetaug.augment(self.text, postag=False))
+ self.assertIsNone(postype2wordnet('n', 'abc'))
+ self.assertIsNotNone(postype2wordnet('NOUN', 'orchid'))
+
+ def test_Thai2fitAug(self):
+ _aug = Thai2fitAug()
+ self.assertIsNotNone(_aug.tokenizer(self.text))
+ self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p=0.5))
+
+ def test_BPEmbAug(self):
+ _aug = BPEmbAug()
+ self.assertIsNotNone(_aug.tokenizer(self.text))
+ self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p=0.5))
+
+ def test_LTW2VAug(self):
+ _aug = LTW2VAug()
+ self.assertIsNotNone(_aug.tokenizer(self.text))
+ self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p=0.5))
+
+ def test_Thai2transformersAug(self):
+ _aug = Thai2transformersAug()
+ self.assertIsNotNone(_aug.augment(self.text2, num_replace_tokens=1))