25 changes: 25 additions & 0 deletions docs/api/augment.rst
@@ -0,0 +1,25 @@
.. currentmodule:: pythainlp.augment

pythainlp.augment
=================

The :mod:`pythainlp.augment` module provides Thai text augmentation: classes that generate new variants of a Thai sentence by replacing words with similar ones.

Modules
-------

.. autoclass:: WordNetAug
:members:
.. autofunction:: postype2wordnet
.. autoclass:: pythainlp.augment.word2vec.Word2VecAug
:members:
.. autoclass:: pythainlp.augment.word2vec.Thai2fitAug
:members:
.. autoclass:: pythainlp.augment.word2vec.LTW2VAug
:members:
.. autoclass:: pythainlp.augment.lm.FastTextAug
:members:
.. autoclass:: pythainlp.augment.lm.Thai2transformersAug
:members:
.. autoclass:: pythainlp.augment.word2vec.bpemb_wv.BPEmbAug
:members:
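
Example
-------

A minimal usage sketch, assuming BPEmb downloads its Thai subword
embeddings on first use (the sample sentence is arbitrary)::

    from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug

    aug = BPEmbAug(lang="th")
    aug.augment("ผมเรียนภาษาไทย", n_sent=2, p=0.5)
    # up to two augmented sentences, returned as strings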
8 changes: 8 additions & 0 deletions pythainlp/augment/__init__.py
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
"""
Thai text augmentation
"""

__all__ = ["WordNetAug"]

from pythainlp.augment.wordnet import WordNetAug
12 changes: 12 additions & 0 deletions pythainlp/augment/lm/__init__.py
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
"""
Language model-based text augmentation
"""

__all__ = [
"FastTextAug",
"Thai2transformersAug",
]

from pythainlp.augment.lm.fasttext import FastTextAug
from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
77 changes: 77 additions & 0 deletions pythainlp/augment/lm/fasttext.py
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple
from gensim.models.fasttext import FastText as FastText_gensim
from gensim.models.fasttext import load_facebook_vectors
from gensim.models.keyedvectors import KeyedVectors
from pythainlp.tokenize import word_tokenize
import itertools


class FastTextAug:
    """
    Text augmentation using FastText word vectors
    """
    def __init__(self, model_path: str):
        """
        :param str model_path: path of the model file
        """
        if model_path.endswith('.bin'):
            # Facebook-format binary; load_facebook_vectors is a
            # module-level function in gensim.models.fasttext.
            self.model = load_facebook_vectors(model_path)
        elif model_path.endswith('.vec'):
            self.model = KeyedVectors.load_word2vec_format(model_path)
        else:
            # A model saved with gensim's FastText.save(); keep only
            # its word vectors for lookup and similarity queries.
            self.model = FastText_gensim.load(model_path).wv
        self.dict_wv = list(self.model.key_to_index.keys())

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize Thai text for FastText

        :param str text: Thai text

        :return: list of words
        :rtype: List[str]
        """
        return word_tokenize(text, engine='icu')

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        :param List[str] sent: list of words in the sentence
        :param float p: minimum similarity score for a replacement

        :rtype: List[List[str]]
        """
        list_sent_new = []
        for word in sent:
            if word in self.dict_wv:
                # Keep only neighbors whose similarity is at least p;
                # fall back to the original word if none qualify.
                w = [
                    w_ for w_, v in self.model.most_similar(word) if v >= p
                ]
                list_sent_new.append(w if w else [word])
            else:
                list_sent_new.append([word])
        return list_sent_new

    def augment(
        self, sentence: str, n_sent: int = 1, p: float = 0.7
    ) -> List[Tuple[str, ...]]:
        """
        Text augmentation from FastText

        You can download a Thai model from
        https://fasttext.cc/docs/en/crawl-vectors.html.

        :param str sentence: Thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: minimum similarity score for a replacement

        :return: list of augmented sentences, as tuples of words
        :rtype: List[Tuple[str, ...]]
        """
        self.sentence = self.tokenize(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # Take the first n_sent combinations of candidate replacements.
        return list(
            itertools.islice(itertools.product(*self.list_synonym), n_sent)
        )
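A minimal usage sketch for FastTextAug, assuming a Thai FastText model has already been downloaded to a local path (the path below is hypothetical; cc.th.300.bin is the filename Facebook distributes):

from pythainlp.augment.lm import FastTextAug

# Hypothetical path to a downloaded Facebook FastText binary.
aug = FastTextAug("/path/to/cc.th.300.bin")
# Replace words only with neighbors whose similarity is >= 0.7
# and keep at most two of the generated combinations.
aug.augment("ผมเรียนภาษาไทย", n_sent=2, p=0.7)
# returns a list of token tuples, one tuple per augmented sentence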
79 changes: 79 additions & 0 deletions pythainlp/augment/lm/wangchanberta.py
@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-

# transformers
from transformers import (
CamembertTokenizer,
pipeline,
)
import random
from typing import List

model_name = "airesearch/wangchanberta-base-att-spm-uncased"


class Thai2transformersAug:
def __init__(self):
self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
self.target_tokenizer = CamembertTokenizer
self.tokenizer = CamembertTokenizer.from_pretrained(
self.model_name,
revision='main')
self.tokenizer.additional_special_tokens = [
'<s>NOTUSED',
'</s>NOTUSED',
'<_>'
]
self.fill_mask = pipeline(
task='fill-mask',
tokenizer=self.tokenizer,
model=f'{self.model_name}',
revision='main'
)

    def generate(self, sentence: str, num_replace_tokens: int = 3):
        """
        Mask random tokens in the sentence, one at a time, and collect
        the fill-mask pipeline's predictions.
        """
        self.sent2 = []
        self.input_text = sentence
        sent = [
            i for i in self.tokenizer.tokenize(self.input_text) if i != '▁'
        ]
        if len(sent) < num_replace_tokens:
            num_replace_tokens = len(sent)
        masked_text = self.input_text
        for i in range(num_replace_tokens):
            # Pick a random token and mask its first occurrence.
            replace_token = sent.pop(random.randrange(len(sent)))
            masked_text = masked_text.replace(
                replace_token, f"{self.fill_mask.tokenizer.mask_token}", 1
            )
            self.sent2 += [
                str(j['sequence']).replace('<s> ', '').replace('</s>', '')
                for j in self.fill_mask(masked_text + '<pad>')
                if j['sequence'] not in self.sent2
            ]
            masked_text = self.input_text
        return self.sent2

    def augment(
        self, sentence: str, num_replace_tokens: int = 3
    ) -> List[str]:
        """
        Text augmentation from WangchanBERTa

        :param str sentence: Thai sentence
        :param int num_replace_tokens: number of tokens to replace

        :return: list of augmented sentences
        :rtype: List[str]
        """
        self.sent2 = []
        try:
            self.sent2 = self.generate(sentence, num_replace_tokens)
            if self.sent2 == []:
                # Retry once if the first pass produced nothing.
                self.sent2 = self.generate(sentence, num_replace_tokens)
            return self.sent2
        except Exception:
            # Return whatever was generated before the failure.
            return self.sent2
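A minimal usage sketch for Thai2transformersAug; constructing it pulls the airesearch/wangchanberta-base-att-spm-uncased checkpoint from the Hugging Face hub, so the first run needs network access:

from pythainlp.augment.lm import Thai2transformersAug

aug = Thai2transformersAug()  # downloads the checkpoint on first use
# Mask up to three random tokens, one at a time, and collect the
# fill-mask pipeline's predictions as augmented sentences.
aug.augment("ผมเรียนภาษาไทย", num_replace_tokens=3)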
14 changes: 14 additions & 0 deletions pythainlp/augment/word2vec/__init__.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
"""
Word2vec-based text augmentation
"""

__all__ = [
"Word2VecAug",
"Thai2fitAug",
"LTW2VAug"
]

from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.augment.word2vec.thai2fit import Thai2fitAug
from pythainlp.augment.word2vec.ltw2v import LTW2VAug
55 changes: 55 additions & 0 deletions pythainlp/augment/word2vec/bpemb_wv.py
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
from pythainlp.augment.word2vec.core import Word2VecAug
from bpemb import BPEmb
from typing import List


class BPEmbAug:
"""
Thai Text Augment using word2vec from BPEmb

BPEmb:
`github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
"""
def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
self.model = self.bpemb_temp.emb
self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        :param str text: Thai text

        :return: list of subword tokens
        :rtype: List[str]
        """
        return self.bpemb_temp.encode(text)

def load_w2v(self):
"""
Load BPEmb model
"""
self.aug = Word2VecAug(
self.model, tokenize=self.tokenizer, type="model"
)

    def augment(
        self, sentence: str, n_sent: int = 1, p: float = 0.7
    ) -> List[str]:
        """
        Text augmentation using word2vec from BPEmb

        :param str sentence: Thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: minimum similarity score for a replacement

        :return: list of augmented sentences
        :rtype: List[str]
        """
        self.sentence = sentence.replace(" ", "▁")
        self.temp = self.aug.augment(self.sentence, n_sent, p=p)
        self.temp_new = []
        for i in self.temp:
            # Join the subword tokens back into a plain string.
            self.t = ""
            for j in i:
                self.t += j.replace('▁', '')
            self.temp_new.append(self.t)
        return self.temp_new
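A short sketch of the subword round-trip this class performs; BPEmb fetches its Thai model on first use, and the sentence is an arbitrary example:

from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug

aug = BPEmbAug(lang="th", vs=100000, dim=300)
# Text is split into BPE subword pieces for the word2vec lookup...
tokens = aug.tokenizer("ผมเรียนภาษาไทย")
# ...and augment() strips the "▁" markers when it joins the pieces
# of each generated variant back into a plain string.
aug.augment("ผมเรียนภาษาไทย", n_sent=1, p=0.5)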
66 changes: 66 additions & 0 deletions pythainlp/augment/word2vec/core.py
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple
import gensim.models.keyedvectors as word2vec
import itertools


class Word2VecAug:
    def __init__(
        self, model: str, tokenize: object, type: str = "file"
    ) -> None:
        """
        :param str model: path to the model file, or a loaded
            KeyedVectors object if type is "model"
        :param object tokenize: tokenize function
        :param str type: model type ("file", "binary", or "model")
        """
        self.tokenizer = tokenize
        if type == "file":
            self.model = word2vec.KeyedVectors.load_word2vec_format(model)
        elif type == "binary":
            self.model = word2vec.KeyedVectors.load_word2vec_format(
                model, binary=True, unicode_errors='ignore'
            )
        else:
            self.model = model
        self.dict_wv = list(self.model.key_to_index.keys())

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        :param List[str] sent: list of words in the sentence
        :param float p: minimum similarity score for a replacement

        :rtype: List[List[str]]
        """
        list_sent_new = []
        for word in sent:
            if word in self.dict_wv:
                # Keep only neighbors whose similarity is at least p;
                # fall back to the original word if none qualify.
                w = [
                    w_ for w_, v in self.model.most_similar(word) if v >= p
                ]
                list_sent_new.append(w if w else [word])
            else:
                list_sent_new.append([word])
        return list_sent_new

    def augment(
        self,
        sentence: str,
        n_sent: int = 1,
        p: float = 0.7
    ) -> List[Tuple[str, ...]]:
        """
        :param str sentence: text sentence
        :param int n_sent: maximum number of augmented sentences
        :param float p: minimum similarity score for a replacement

        :return: list of augmented sentences, as tuples of words
        :rtype: List[Tuple[str, ...]]
        """
        self.sentence = self.tokenizer(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # Take the first n_sent combinations of candidate replacements.
        return list(
            itertools.islice(itertools.product(*self.list_synonym), n_sent)
        )
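A sketch of how the wrapper classes in this PR wire up Word2VecAug: pass a tokenize function plus either a model path or an already-loaded KeyedVectors object (the path below is hypothetical):

from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text, engine="newmm")

# type="file" expects word2vec text format, type="binary" the binary
# format, and type="model" a preloaded KeyedVectors object.
aug = Word2VecAug("/path/to/vectors.vec", tokenize=tokenize, type="file")
aug.augment("ผมเรียนภาษาไทย", n_sent=2, p=0.7)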
48 changes: 48 additions & 0 deletions pythainlp/augment/word2vec/ltw2v.py
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.corpus import get_corpus_path
from pythainlp.tokenize import word_tokenize
from typing import List, Tuple


class LTW2VAug:
"""
Text Augment using word2vec from LTW2V

LTW2V:
`github.com/PyThaiNLP/large-thaiword2vec <https://github.com/PyThaiNLP/large-thaiword2vec>`_
"""
def __init__(self):
self.ltw2v_wv = get_corpus_path('ltw2v')
self.load_w2v()

    def tokenizer(self, text: str) -> List[str]:
        """
        :param str text: Thai text

        :return: list of words
        :rtype: List[str]
        """
        return word_tokenize(text, engine='newmm')

    def load_w2v(self):
        """
        Load the LTW2V word2vec model
        """
        self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary")

    def augment(
        self,
        sentence: str,
        n_sent: int = 1,
        p: float = 0.7
    ) -> List[Tuple[str, ...]]:
        """
        Text augmentation using word2vec from LTW2V

        :param str sentence: Thai sentence
        :param int n_sent: number of sentences to generate
        :param float p: minimum similarity score for a replacement

        :return: list of augmented sentences
        :rtype: List[Tuple[str, ...]]
        """
        return self.aug.augment(sentence, n_sent, p)
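A minimal usage sketch for LTW2VAug; the constructor resolves the 'ltw2v' corpus through PyThaiNLP's corpus downloader, so the model may be fetched on first use:

from pythainlp.augment.word2vec import LTW2VAug

aug = LTW2VAug()  # loads the LTW2V binary via get_corpus_path('ltw2v')
aug.augment("ผมเรียนภาษาไทย", n_sent=2, p=0.7)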