diff --git a/docs/api/augment.rst b/docs/api/augment.rst
new file mode 100644
index 000000000..220cc21c8
--- /dev/null
+++ b/docs/api/augment.rst
@@ -0,0 +1,25 @@
+.. currentmodule:: pythainlp.augment
+
+pythainlp.augment
+=================
+
+The :mod:`pythainlp.augment` module provides functions and classes for Thai text augmentation.
+
+Modules
+-------
+
+.. autoclass:: WordNetAug
+    :members:
+.. autofunction:: postype2wordnet
+.. autoclass:: pythainlp.augment.word2vec.Word2VecAug
+    :members:
+.. autoclass:: pythainlp.augment.word2vec.Thai2fitAug
+    :members:
+.. autoclass:: pythainlp.augment.word2vec.LTW2VAug
+    :members:
+.. autoclass:: pythainlp.augment.lm.FastTextAug
+    :members:
+.. autoclass:: pythainlp.augment.lm.Thai2transformersAug
+    :members:
+.. autoclass:: pythainlp.augment.word2vec.bpemb_wv.BPEmbAug
+    :members:
\ No newline at end of file
diff --git a/pythainlp/augment/__init__.py b/pythainlp/augment/__init__.py
new file mode 100644
index 000000000..039935081
--- /dev/null
+++ b/pythainlp/augment/__init__.py
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Thai text augmentation
+"""
+
+__all__ = ["WordNetAug"]
+
+from pythainlp.augment.wordnet import WordNetAug
diff --git a/pythainlp/augment/lm/__init__.py b/pythainlp/augment/lm/__init__.py
new file mode 100644
index 000000000..c90a81a32
--- /dev/null
+++ b/pythainlp/augment/lm/__init__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+Language model based text augmentation
+"""
+
+__all__ = [
+    "FastTextAug",
+    "Thai2transformersAug",
+]
+
+from pythainlp.augment.lm.fasttext import FastTextAug
+from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
diff --git a/pythainlp/augment/lm/fasttext.py b/pythainlp/augment/lm/fasttext.py
new file mode 100644
index 000000000..cc1379e57
--- /dev/null
+++ b/pythainlp/augment/lm/fasttext.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+from typing import List, Tuple
+from gensim.models.fasttext import FastText as FastText_gensim, load_facebook_vectors
+from pythainlp.tokenize import word_tokenize
+from gensim.models.keyedvectors import KeyedVectors
+import itertools
+
+
+class FastTextAug:
+    """
+    Text augmentation using fastText word vectors
+    """
+    def __init__(self, model_path: str):
+        """
+        :param str model_path: path of model file
+        """
+        if model_path.endswith('.bin'):
+            self.model = load_facebook_vectors(model_path)
+        elif model_path.endswith('.vec'):
+            self.model = KeyedVectors.load_word2vec_format(model_path)
+        else:
+            self.model = FastText_gensim.load(model_path).wv  # use the model's KeyedVectors
+        self.dict_wv = list(self.model.key_to_index.keys())
+
+    def tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize Thai text for fastText
+
+        :param str text: thai text
+
+        :return: list of words
+        :rtype: List[str]
+        """
+        return word_tokenize(text, engine='icu')
+
+    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
+        """
+        :param List[str] sent: list of words
+        :param float p: minimum similarity score for replacement words
+        :rtype: List[List[str]]
+        """
+        list_sent_new = []
+        for i in sent:
+            if i in self.dict_wv:
+                w = [
+                    j for j, v in self.model.most_similar(i) if v >= p
+                ]
+                if w == []:
+                    list_sent_new.append([i])
+                else:
+                    list_sent_new.append(w)
+            else:
+                list_sent_new.append([i])
+        return list_sent_new
+
+    def augment(
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
+    ) -> List[Tuple[str]]:
+        """
+        Text augmentation using fastText
+
+        You should download a Thai fastText model
+        from https://fasttext.cc/docs/en/crawl-vectors.html first.
+
+        :param str sentence: thai sentence
+        :param int n_sent: number of sentences to generate
+        :param float p: minimum similarity score for replacement words
+
+        :return: list of augmented sentences
+        :rtype: List[Tuple[str]]
+        """
+        self.sentence = self.tokenize(sentence)
+        self.list_synonym = self.modify_sent(self.sentence, p=p)
+        new_sentences = []
+        for x in list(itertools.product(*self.list_synonym))[0:n_sent]:
+            new_sentences.append(x)
+        return new_sentences
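A minimal usage sketch for the FastTextAug class above, assuming a Thai fastText model has already been downloaded from fasttext.cc (the file name below is a placeholder):

    # Sketch only: "cc.th.300.bin" is a placeholder for a downloaded Thai fastText model.
    from pythainlp.augment.lm import FastTextAug

    aug = FastTextAug("cc.th.300.bin")
    # Keep up to 3 variants, replacing words only with neighbours whose similarity >= 0.7.
    print(aug.augment("เรารักคุณมากที่สุดในโลก", n_sent=3, p=0.7))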
diff --git a/pythainlp/augment/lm/wangchanberta.py b/pythainlp/augment/lm/wangchanberta.py
new file mode 100644
index 000000000..0322f4d16
--- /dev/null
+++ b/pythainlp/augment/lm/wangchanberta.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+
+# transformers
+from transformers import (
+    CamembertTokenizer,
+    pipeline,
+)
+import random
+from typing import List
+
+model_name = "airesearch/wangchanberta-base-att-spm-uncased"
+
+
+class Thai2transformersAug:
+    def __init__(self):
+        self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
+        self.target_tokenizer = CamembertTokenizer
+        self.tokenizer = CamembertTokenizer.from_pretrained(
+            self.model_name,
+            revision='main')
+        self.tokenizer.additional_special_tokens = [
+            '<s>NOTUSED',
+            '</s>NOTUSED',
+            '<_>'
+        ]
+        self.fill_mask = pipeline(
+            task='fill-mask',
+            tokenizer=self.tokenizer,
+            model=f'{self.model_name}',
+            revision='main'
+        )
+
+    def generate(self, sentence: str, num_replace_tokens: int = 3):
+        self.sent2 = []
+        self.input_text = sentence
+        sent = [
+            i for i in self.tokenizer.tokenize(self.input_text) if i != '▁'
+        ]
+        if len(sent) < num_replace_tokens:
+            num_replace_tokens = len(sent)
+        masked_text = self.input_text
+        for i in range(num_replace_tokens):
+            replace_token = [
+                sent.pop(random.randrange(len(sent))) for _ in range(1)
+            ][0]
+            masked_text = masked_text.replace(
+                replace_token, f"{self.fill_mask.tokenizer.mask_token}", 1
+            )
+            self.sent2 += [
+                str(j['sequence']).replace('<s> ', '').replace('</s>', '')
+                for j in self.fill_mask(masked_text + '<mask>')
+                if j['sequence'] not in self.sent2
+            ]
+            masked_text = self.input_text
+        return self.sent2
+
+    def augment(
+        self, sentence: str, num_replace_tokens: int = 3
+    ) -> List[str]:
+        """
+        Text augmentation from WangchanBERTa
+
+        :param str sentence: thai sentence
+        :param int num_replace_tokens: number of tokens to replace
+
+        :return: list of augmented texts
+        :rtype: List[str]
+        """
+        self.sent2 = []
+        try:
+            self.sent2 = self.generate(sentence, num_replace_tokens)
+            if self.sent2 == []:
+                self.sent2 = self.generate(sentence, num_replace_tokens)
+            return self.sent2
+        except Exception:
+            if len(self.sent2) > 0:
+                return self.sent2
+            else:
+                return self.sent2
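A usage sketch for Thai2transformersAug; instantiating it downloads the WangchanBERTa checkpoint from the Hugging Face hub, so this only illustrates the call shape:

    from pythainlp.augment.lm import Thai2transformersAug

    aug = Thai2transformersAug()  # loads airesearch/wangchanberta-base-att-spm-uncased
    # Mask one token of the sentence and let the fill-mask pipeline propose alternatives.
    print(aug.augment("เราอยู่ที่มหาวิทยาลัยขอนแก่น", num_replace_tokens=1))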
diff --git a/pythainlp/augment/word2vec/__init__.py b/pythainlp/augment/word2vec/__init__.py
new file mode 100644
index 000000000..2d94435b3
--- /dev/null
+++ b/pythainlp/augment/word2vec/__init__.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+"""
+Word2Vec based text augmentation
+"""
+
+__all__ = [
+    "Word2VecAug",
+    "Thai2fitAug",
+    "LTW2VAug"
+]
+
+from pythainlp.augment.word2vec.core import Word2VecAug
+from pythainlp.augment.word2vec.thai2fit import Thai2fitAug
+from pythainlp.augment.word2vec.ltw2v import LTW2VAug
diff --git a/pythainlp/augment/word2vec/bpemb_wv.py b/pythainlp/augment/word2vec/bpemb_wv.py
new file mode 100644
index 000000000..6c5d44db6
--- /dev/null
+++ b/pythainlp/augment/word2vec/bpemb_wv.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+from pythainlp.augment.word2vec.core import Word2VecAug
+from bpemb import BPEmb
+from typing import List
+
+
+class BPEmbAug:
+    """
+    Thai text augmentation using word2vec from BPEmb
+
+    BPEmb:
+    `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
+    """
+    def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
+        self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
+        self.model = self.bpemb_temp.emb
+        self.load_w2v()
+
+    def tokenizer(self, text: str) -> List[str]:
+        """
+        :param str text: thai text
+        :rtype: List[str]
+        """
+        return self.bpemb_temp.encode(text)
+
+    def load_w2v(self):
+        """
+        Load BPEmb model
+        """
+        self.aug = Word2VecAug(
+            self.model, tokenize=self.tokenizer, type="model"
+        )
+
+    def augment(
+        self, sentence: str, n_sent: int = 1, p: float = 0.7
+    ) -> List[str]:
+        """
+        Text augmentation using word2vec from BPEmb
+
+        :param str sentence: thai sentence
+        :param int n_sent: number of sentences to generate
+        :param float p: minimum similarity score for replacement subwords
+
+        :return: list of augmented sentences
+        :rtype: List[str]
+        """
+        self.sentence = sentence.replace(" ", "▁")
+        self.temp = self.aug.augment(self.sentence, n_sent, p=p)
+        self.temp_new = []
+        for i in self.temp:
+            self.t = ""
+            for j in i:
+                self.t += j.replace('▁', '')
+            self.temp_new.append(self.t)
+        return self.temp_new
diff --git a/pythainlp/augment/word2vec/core.py b/pythainlp/augment/word2vec/core.py
new file mode 100644
index 000000000..c98bcd39b
--- /dev/null
+++ b/pythainlp/augment/word2vec/core.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+from typing import List, Tuple
+import gensim.models.keyedvectors as word2vec
+import itertools
+
+
+class Word2VecAug:
+    def __init__(
+        self, model: str, tokenize: object, type: str = "file"
+    ) -> None:
+        """
+        :param str model: path to the model file, or a loaded model when type="model"
+        :param object tokenize: tokenizer function
+        :param str type: model type (file, binary, model)
+        """
+        self.tokenizer = tokenize
+        if type == "file":
+            self.model = word2vec.KeyedVectors.load_word2vec_format(model)
+        elif type == "binary":
+            self.model = word2vec.KeyedVectors.load_word2vec_format(
+                model, binary=True, unicode_errors='ignore'
+            )
+        else:
+            self.model = model
+        self.dict_wv = list(self.model.key_to_index.keys())
+
+    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
+        """
+        :param List[str] sent: list of words
+        :param float p: minimum similarity score for replacement words
+        :rtype: List[List[str]]
+        """
+        list_sent_new = []
+        for i in sent:
+            if i in self.dict_wv:
+                w = [
+                    j for j, v in self.model.most_similar(i) if v >= p
+                ]
+                if w == []:
+                    list_sent_new.append([i])
+                else:
+                    list_sent_new.append(w)
+            else:
+                list_sent_new.append([i])
+        return list_sent_new
+
+    def augment(
+        self,
+        sentence: str,
+        n_sent: int = 1,
+        p: float = 0.7
+    ) -> List[Tuple[str]]:
+        """
+        :param str sentence: text sentence
+        :param int n_sent: maximum number of augmented sentences
+        :param float p: minimum similarity score for replacement words
+
+        :return: list of augmented sentences
+        :rtype: List[Tuple[str]]
+        """
+        self.sentence = self.tokenizer(sentence)
+        self.list_synonym = self.modify_sent(self.sentence, p=p)
+        new_sentences = []
+        for x in list(itertools.product(*self.list_synonym))[0:n_sent]:
+            new_sentences.append(x)
+        return new_sentences
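Word2VecAug is usually reached through one of its wrappers; a sketch with the BPEmb wrapper above, which fetches the Thai BPEmb subword embeddings the first time it runs:

    from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug

    aug = BPEmbAug(lang="th", vs=100000, dim=300)
    # Each returned item is one augmented sentence rebuilt from subword neighbours.
    print(aug.augment("เรารักคุณมากที่สุดในโลก", n_sent=3, p=0.5))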
diff --git a/pythainlp/augment/word2vec/ltw2v.py b/pythainlp/augment/word2vec/ltw2v.py
new file mode 100644
index 000000000..61b2c83db
--- /dev/null
+++ b/pythainlp/augment/word2vec/ltw2v.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+from pythainlp.augment.word2vec.core import Word2VecAug
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tokenize import word_tokenize
+from typing import List, Tuple
+
+
+class LTW2VAug:
+    """
+    Text augmentation using word2vec from LTW2V
+
+    LTW2V:
+    `github.com/PyThaiNLP/large-thaiword2vec <https://github.com/PyThaiNLP/large-thaiword2vec>`_
+    """
+    def __init__(self):
+        self.ltw2v_wv = get_corpus_path('ltw2v')
+        self.load_w2v()
+
+    def tokenizer(self, text: str) -> List[str]:
+        """
+        :param str text: thai text
+        :rtype: List[str]
+        """
+        return word_tokenize(text, engine='newmm')
+
+    def load_w2v(self):
+        """
+        Load LTW2V word2vec model
+        """
+        self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary")
+
+    def augment(
+        self,
+        sentence: str,
+        n_sent: int = 1,
+        p: float = 0.7
+    ) -> List[Tuple[str]]:
+        """
+        Text augmentation using word2vec from LTW2V
+
+        :param str sentence: thai sentence
+        :param int n_sent: number of sentences to generate
+        :param float p: minimum similarity score for replacement words
+
+        :return: list of augmented sentences
+        :rtype: List[Tuple[str]]
+        """
+        return self.aug.augment(sentence, n_sent, p)
diff --git a/pythainlp/augment/word2vec/thai2fit.py b/pythainlp/augment/word2vec/thai2fit.py
new file mode 100644
index 000000000..8a553f0f2
--- /dev/null
+++ b/pythainlp/augment/word2vec/thai2fit.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+from pythainlp.augment.word2vec.core import Word2VecAug
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tokenize import THAI2FIT_TOKENIZER
+from typing import List, Tuple
+
+
+class Thai2fitAug:
+    """
+    Text augmentation using word2vec from Thai2Fit
+
+    Thai2Fit:
+    `github.com/cstorm125/thai2fit <https://github.com/cstorm125/thai2fit>`_
+    """
+    def __init__(self):
+        self.thai2fit_wv = get_corpus_path('thai2fit_wv')
+        self.load_w2v()
+
+    def tokenizer(self, text: str) -> List[str]:
+        """
+        :param str text: thai text
+        :rtype: List[str]
+        """
+        return THAI2FIT_TOKENIZER.word_tokenize(text)
+
+    def load_w2v(self):
+        """
+        Load thai2fit word2vec model
+        """
+        self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")
+
+    def augment(
+        self,
+        sentence: str,
+        n_sent: int = 1,
+        p: float = 0.7
+    ) -> List[Tuple[str]]:
+        """
+        Text augmentation using word2vec from Thai2Fit
+
+        :param str sentence: thai sentence
+        :param int n_sent: number of sentences to generate
+        :param float p: minimum similarity score for replacement words
+
+        :return: list of augmented sentences
+        :rtype: List[Tuple[str]]
+        """
+        return self.aug.augment(sentence, n_sent, p)
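The two corpus-backed wrappers above share one interface; a sketch with Thai2fitAug (LTW2VAug is used the same way). Both download their word2vec binaries through pythainlp.corpus.get_corpus_path on first use:

    from pythainlp.augment.word2vec import Thai2fitAug, LTW2VAug

    aug = Thai2fitAug()   # or: aug = LTW2VAug()
    print(aug.tokenizer("เรารักคุณมากที่สุดในโลก"))   # model-specific tokenization
    print(aug.augment("เรารักคุณมากที่สุดในโลก", n_sent=3, p=0.5))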
"DIAQ": "", + # NUM + "NUM": "", + "NCNM": "", + "NLBL": "", + "DCNM": "", + # AUX + "AUX": "", + "XVBM": "", + "XVAM": "", + "XVMM": "", + "XVBB": "", + "XVAE": "", + # ADP + "ADP": "", + "RPRE": "", + # CCONJ + "CCONJ": "", + "JCRG": "", + # SCONJ + "SCONJ": "", + "PREL": "", + "JSBR": "", + "JCMP": "", + # PART + "PART": "", + "FIXN": "", + "FIXV": "", + "EAFF": "", + "EITT": "", + "AITT": "", + "NEG": "", + # PUNCT + "PUNCT": "", + "PUNC": "", +} + + +def postype2wordnet(pos: str, corpus: str): + """ + convert part-of-speech type to wordnet type + + :param str pos: pos type + :param str corpus: part-of-speech corpus + + **Options for corpus** + * *lst20* - LST20 Corpus + * *orchid* - Orchid Corpus + """ + if corpus not in ['lst20', 'orchid']: + return None + if corpus == 'lst20': + return lst20[pos] + else: + return orchid[pos] + + +class WordNetAug: + def __init__(self): + pass + + def find_synonyms( + self, + word: str, + pos: str = None, + postag_corpus: str = "lst20" + ) -> List[str]: + """ + Find synonyms from wordnet + + :param str word: word + :param str pos: part-of-speech type + :param str postag_corpus: postag corpus name + :return: list of synonyms + :rtype: List[str] + """ + self.synonyms = [] + if pos is None: + self.list_synsets = wordnet.synsets(word) + else: + self.p2w_pos = postype2wordnet(pos, postag_corpus) + if self.p2w_pos != '': + self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos) + else: + self.list_synsets = wordnet.synsets(word) + + for self.synset in wordnet.synsets(word): + for self.syn in self.synset.lemma_names(lang='tha'): + self.synonyms.append(self.syn) + + self.synonyms_without_duplicates = list( + OrderedDict.fromkeys(self.synonyms) + ) + return self.synonyms_without_duplicates + + def augment( + self, + sentence: str, + tokenize: object = word_tokenize, + max_syn_sent: int = 6, + postag: bool = True, + postag_corpus: str = "lst20" + ) -> List[List[str]]: + """ + Text Augment using wordnet + + :param str sentence: thai sentence + :param object tokenize: function for tokenize word + :param int max_syn_sent: max number for synonyms sentence + :param bool postag: on part-of-speech + :param str postag_corpus: postag corpus name + + :return: list of synonyms + :rtype: List[Tuple[str]] + """ + new_sentences = [] + self.list_words = word_tokenize(sentence) + self.list_synonym = [] + self.p_all = 1 + if postag: + self.list_pos = pos_tag(self.list_words, corpus=postag_corpus) + for word, pos in self.list_pos: + self.temp = self.find_synonyms(word, pos, postag_corpus) + if self.temp == []: + self.list_synonym.append([word]) + else: + self.list_synonym.append(self.temp) + self.p_all *= len(self.temp) + else: + for word in self.list_words: + self.temp = self.find_synonyms(word) + if self.temp == []: + self.list_synonym.append([word]) + else: + self.list_synonym.append(self.temp) + self.p_all *= len(self.temp) + if max_syn_sent > self.p_all: + max_syn_sent = self.p_all + for x in list(itertools.product(*self.list_synonym))[0:max_syn_sent]: + new_sentences.append(x) + return new_sentences diff --git a/setup.py b/setup.py index 2d2e98302..2f172be68 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ "ipa": ["epitran>=1.1"], "ml": ["numpy>=1.16", "torch>=1.0.0"], "ssg": ["ssg>=0.0.6"], - "thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16.1"], + "thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", "numpy>=1.16.1"], "thai2rom": ["numpy>=1.16.1", "torch>=1.0.0"], "translate": [ "fairseq>=0.10.0", @@ -56,6 +56,10 @@ "torch>=1.0.0", "transformers>=4.6.0", ], + 
"textaugment": [ + "bpemb", + "gensim>=4.0.0" + ], "wangchanberta": ["transformers", "sentencepiece"], "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], @@ -66,7 +70,7 @@ "emoji>=0.5.1", "epitran>=1.1", "fairseq>=0.10.0", - "gensim>=3.2.0", + "gensim>=4.0.0", "nltk>=3.3.*", "numpy>=1.16.1", "pandas>=0.24", @@ -75,6 +79,7 @@ "sentencepiece>=0.1.91", "ssg>=0.0.6", "torch>=1.0.0", + "bpemb", "transformers>=4.6.0", "sefr_cut" ], diff --git a/tests/test_augment.py b/tests/test_augment.py new file mode 100644 index 000000000..4048f6d46 --- /dev/null +++ b/tests/test_augment.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +import unittest +from pythainlp.augment import WordNetAug +from pythainlp.augment.wordnet import postype2wordnet +from pythainlp.augment.lm import Thai2transformersAug +from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug +from pythainlp.augment.word2vec import ( + Thai2fitAug, + LTW2VAug +) + + +class TestTextaugmentPackage(unittest.TestCase): + def setUp(self): + self.text = "เรารักคุณมากที่สุดในโลก" + self.text2 = "เราอยู่ที่มหาวิทยาลัยขอนแก่น" + + def test_WordNetAug(self): + wordnetaug = WordNetAug() + self.assertIsNotNone(wordnetaug.augment(self.text)) + self.assertIsNotNone(wordnetaug.find_synonyms("ผม", pos=None)) + self.assertIsNotNone(wordnetaug.augment(self.text, postag=False)) + self.assertIsNone(postype2wordnet('n', 'abc')) + self.assertIsNotNone(postype2wordnet('NOUN', 'orchid')) + + def test_Thai2fitAug(self): + _aug = Thai2fitAug() + self.assertIsNotNone(_aug.tokenizer(self.text)) + self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p=0.5)) + + def test_BPEmbAug(self): + _aug = BPEmbAug() + self.assertIsNotNone(_aug.tokenizer(self.text)) + self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p=0.5)) + + def test_LTW2VAug(self): + _aug = LTW2VAug() + self.assertIsNotNone(_aug.tokenizer(self.text)) + self.assertIsNotNone(_aug.augment(self.text, n_sent=3, p=0.5)) + + def test_Thai2transformersAug(self): + _aug = Thai2transformersAug() + self.assertIsNotNone(_aug.augment(self.text2, num_replace_tokens=1))