From b4cb35c9e068e7224ddc16bfa286af0672adebea Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 20:09:17 +0700 Subject: [PATCH 01/26] Add pythainlp.generator --- docs/api/corpus.rst | 10 ++ docs/api/generator.rst | 16 +++ pythainlp/corpus/oscar.py | 53 ++++++++++ pythainlp/corpus/tnc.py | 54 +++++++++- pythainlp/corpus/ttc.py | 20 +++- pythainlp/generator/__init__.py | 12 +++ pythainlp/generator/core.py | 171 ++++++++++++++++++++++++++++++++ pythainlp/generator/thai2fit.py | 65 ++++++++++++ setup.py | 2 + tests/test_corpus.py | 9 ++ tests/test_generator.py | 23 +++++ 11 files changed, 433 insertions(+), 2 deletions(-) create mode 100644 docs/api/generator.rst create mode 100644 pythainlp/corpus/oscar.py create mode 100644 pythainlp/generator/__init__.py create mode 100644 pythainlp/generator/core.py create mode 100644 pythainlp/generator/thai2fit.py create mode 100644 tests/test_generator.py diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index dbf0c95fa..eb8d1bf16 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -35,11 +35,21 @@ TNC --- .. autofunction:: pythainlp.corpus.tnc.word_freqs +.. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs +.. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs +.. autofunction:: pythainlp.corpus.tnc.tigram_word_freqs TTC --- .. autofunction:: pythainlp.corpus.ttc.word_freqs +.. autofunction:: pythainlp.corpus.ttc.unigram_word_freqs + +OSCAR +----- + +.. autofunction:: pythainlp.corpus.oscar.word_freqs +.. autofunction:: pythainlp.corpus.oscar.unigram_word_freqs Util ---- diff --git a/docs/api/generator.rst b/docs/api/generator.rst new file mode 100644 index 000000000..cd8252579 --- /dev/null +++ b/docs/api/generator.rst @@ -0,0 +1,16 @@ +.. currentmodule:: pythainlp.generator + +pythainlp.generator +=================== +The :class:`pythainlp.generator` is Thai text generator with PyThaiNLP. + +Modules +------- + +.. autoclass:: Unigram + :members: +.. 
autoclass:: Bigram + :members: +.. autoclass:: Tigram + :members: +.. autofunction:: pythainlp.generator.thai2fit.gen_sentence \ No newline at end of file diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py new file mode 100644 index 000000000..e23dbb609 --- /dev/null +++ b/pythainlp/corpus/oscar.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +""" +Thai unigram word frequency from OSCAR Corpus (icu word tokenize) + +Credit: Korakot Chaovavanich +https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/ +""" + +__all__ = [ + "word_freqs", + "unigram_word_freqs" +] + +from collections import defaultdict +from typing import List, Tuple + +from pythainlp.corpus import get_corpus_path + +_FILENAME = "oscar_icu" + + +def word_freqs() -> List[Tuple[str, int]]: + """ + Get word frequency from OSCAR Corpus (icu word tokenize) + """ + word_freqs = [] + _path = get_corpus_path(_FILENAME) + with open(_path,"r",encoding="utf-8") as f: + for line in f.readlines(): + word_freq = line.strip().split(",") + if len(word_freq) >= 2: + word_freqs.append((word_freq[0], int(word_freq[1]))) + + return word_freqs + + +def unigram_word_freqs() -> defaultdict: + """ + Get unigram word frequency from OSCAR Corpus (icu word tokenize) + """ + _path = get_corpus_path(_FILENAME) + _word_freqs = defaultdict(int) + with open(_path, "r", encoding="utf-8-sig") as fh: + _data = [i for i in fh.readlines()] + del _data[0] + for i in _data: + _temp = i.strip().split(",") + if _temp[0]!=" " and '"' not in _temp[0]: + _word_freqs[_temp[0]] = int(_temp[-1]) + elif _temp[0]==" ": + _word_freqs[""] = int(_temp[-1]) + + return _word_freqs diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index db836ea17..9345cea93 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -6,13 +6,23 @@ https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1 """ -__all__ = ["word_freqs"] +__all__ = [ + "word_freqs", + 
"unigram_word_freqs", + "bigram_word_freqs", + "tigram_word_freqs" +] +from collections import defaultdict from typing import List, Tuple from pythainlp.corpus import get_corpus +from pythainlp.corpus import get_corpus_path + _FILENAME = "tnc_freq.txt" +_BIGRAM = "tnc_bigram_word_freqs" +_TIGRAM = "tnc_tigram_word_freqs" def word_freqs() -> List[Tuple[str, int]]: @@ -29,3 +39,45 @@ def word_freqs() -> List[Tuple[str, int]]: word_freqs.append((word_freq[0], int(word_freq[1]))) return word_freqs + + +def unigram_word_freqs() -> defaultdict: + """ + Get unigram word frequency from Thai National Corpus (TNC) + """ + lines = list(get_corpus(_FILENAME)) + _word_freqs = defaultdict(int) + for i in lines: + _temp = i.strip().split(" ") + if len(_temp) >= 2: + _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1]) + + return _word_freqs + + +def bigram_word_freqs() -> defaultdict: + """ + Get bigram word frequency from Thai National Corpus (TNC) + """ + _path = get_corpus_path(_BIGRAM) + _word_freqs = defaultdict(int) + with open(_path, "r", encoding="utf-8-sig") as fh: + for i in fh.readlines(): + _temp = i.strip().split(" ") + _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1]) + + return _word_freqs + + +def tigram_word_freqs() -> defaultdict: + """ + Get tigram word frequency from Thai National Corpus (TNC) + """ + _path = get_corpus_path(_TIGRAM) + _word_freqs = defaultdict(int) + with open(_path, "r", encoding="utf-8-sig") as fh: + for i in fh.readlines(): + _temp = i.strip().split(" ") + _word_freqs[(_temp[0],_temp[1],_temp[2])] = int(_temp[-1]) + + return _word_freqs \ No newline at end of file diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py index 0de0069c7..a42fa4c05 100644 --- a/pythainlp/corpus/ttc.py +++ b/pythainlp/corpus/ttc.py @@ -6,8 +6,12 @@ https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1 """ -__all__ = ["word_freqs"] +__all__ = [ + "word_freqs", + "unigram_word_freqs" +] +from collections import 
defaultdict from typing import List, Tuple from pythainlp.corpus import get_corpus @@ -29,3 +33,17 @@ def word_freqs() -> List[Tuple[str, int]]: word_freqs.append((word_freq[0], int(word_freq[1]))) return word_freqs + + +def unigram_word_freqs() -> defaultdict: + """ + Get unigram word frequency from Thai Textbook Corpus (TTC) + """ + lines = list(get_corpus(_FILENAME)) + _word_freqs = defaultdict(int) + for i in lines: + _temp = i.strip().split(" ") + if len(_temp) >= 2: + _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1]) + + return _word_freqs \ No newline at end of file diff --git a/pythainlp/generator/__init__.py b/pythainlp/generator/__init__.py new file mode 100644 index 000000000..637497b45 --- /dev/null +++ b/pythainlp/generator/__init__.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +""" +Thai Text generator +""" + +__all__ = [ + "Unigram", + "Bigram", + "Tigram" +] + +from pythainlp.generator.core import Unigram, Bigram, Tigram \ No newline at end of file diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py new file mode 100644 index 000000000..9ca7a1395 --- /dev/null +++ b/pythainlp/generator/core.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- +import random +from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram +from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram +from pythainlp.corpus.tnc import tigram_word_freqs as tnc_word_freqs_tigram +from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram +from pythainlp.corpus.oscar import unigram_word_freqs as oscar_word_freqs_unigram + + +class Unigram: + def __init__(self, name:str="tnc"): + """ + :param str name: corpus name + :rtype: None + """ + if name == "tnc": + self.counts = tnc_word_freqs_unigram() + elif name == "ttc": + self.counts = ttc_word_freqs_unigram() + elif name == "oscar": + self.counts = oscar_word_freqs_unigram() + self.word = list(self.counts.keys()) + self.n = 0 + for i in self.word: + self.n += 
self.counts[i] + self.prob = {i:self.counts[i]/self.n for i in self.word} + self._word_prob = {} + + def gen_sentence(self,N:int=3,prob:float=0.001, start_seq:str=None, output_str:bool = True, duplicate:bool=False): + """ + :param int N: number of word. + :param str start_seq: word for begin word. + :param bool output_str: output is str + :param bool duplicate: duplicate word in sent + + :return: list words or str words + :rtype: str,list + """ + if start_seq is None: start_seq = random.choice(self.word) + rand_text = start_seq.lower() + self._word_prob = {i:self.counts[i]/self.n for i in self.word if self.counts[i]/self.n>=prob} + return self.next_word(rand_text, N, output_str,prob=prob, duplicate=duplicate) + + def next_word(self,text:str, N:int, output_str:str,prob, duplicate:bool=False): + self.l = [] + self.l.append(text) + self._word_list = list(self._word_prob.keys()) + if N>len(self._word_list): + N=len(self._word_list) + for i in range(N): + self._word = random.choice(self._word_list) + if duplicate == False: + while self._word in self.l: + self._word = random.choice(self._word_list) + self.l.append(self._word) + + if output_str: + return "".join(self.l) + return self.l + + +class Bigram: + def __init__(self,name:str="tnc"): + """ + :param str name: corpus name + :rtype: None + """ + if name == "tnc": + self.uni = tnc_word_freqs_unigram() + self.bi = tnc_word_freqs_bigram() + self.uni_keys = list(self.uni.keys()) + self.bi_keys = list(self.bi.keys()) + self.words = [i[-1] for i in self.bi_keys] + + def prob(self, t1:str, t2:str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 + """ + probability word + + :param int t1: text 1 + :param int t2: text 2 + + :return: probability value + :rtype: float + """ + try: + v=self.bi[(t1,t2)]/self.uni[t1] + except: + v=0.0 + return v + + def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True, 
duplicate:bool=False): + if start_seq is None: start_seq = random.choice(self.words) + self.late_word = start_seq + self.list_word = [] + self.list_word.append(start_seq) + + for i in range(N): + if duplicate: + self._temp = [j for j in self.bi_keys if j[0]==self.late_word] + else: + self._temp = [j for j in self.bi_keys if j[0]==self.late_word and j[1] not in self.list_word] + self._probs = [self.prob(self.late_word,l[-1]) for l in self._temp] + self._p2 = [j for j in self._probs if j>=prob] + if len(self._p2)==0: + break + self.items = self._temp[self._probs.index(random.choice(self._p2))] + self.late_word = self.items[-1] + self.list_word.append(self.late_word) + if output_str: + return ''.join(self.list_word) + return self.list_word + + +class Tigram: + def __init__(self,name:str="tnc"): + """ + :param str name: corpus name + :rtype: None + """ + if name == "tnc": + self.uni = tnc_word_freqs_unigram() + self.bi = tnc_word_freqs_bigram() + self.ti = tnc_word_freqs_tigram() + self.uni_keys = list(self.uni.keys()) + self.bi_keys = list(self.bi.keys()) + self.ti_keys = list(self.ti.keys()) + self.words = [i[-1] for i in self.bi_keys] + + def prob(self, t1:str, t2:str, t3:str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 + """ + probability word + + :param int t1: text 1 + :param int t2: text 2 + :param int t3: text 3 + + :return: probability value + :rtype: float + """ + try: + v=self.ti[(t1, t2, t3)]/self.bi[(t1, t2)] + except: + v=0.0 + return v + + def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:tuple=None, output_str:bool = True, duplicate:bool=False): + if start_seq is None: start_seq = random.choice(self.bi_keys) + self.late_word = start_seq + self.list_word = [] + self.list_word.append(start_seq) + + for i in range(N): + if duplicate: + self._temp = [j for j in self.ti_keys if j[:2]==self.late_word] + else: + self._temp = [j for j in self.ti_keys if 
j[:2]==self.late_word and j[1:] not in self.list_word] + self._probs = [self.prob(l[0],l[1],l[2]) for l in self._temp] + self._p2 = [j for j in self._probs if j>=prob] + if len(self._p2)==0: + break + self.items = self._temp[self._probs.index(random.choice(self._p2))] + self.late_word = self.items[1:] + self.list_word.append(self.late_word) + self.listdata = [] + for i in self.list_word: + for j in i: + if j not in self.listdata: + self.listdata.append(j) + if output_str: + return ''.join(self.listdata) + return self.listdata \ No newline at end of file diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py new file mode 100644 index 000000000..d19d0f648 --- /dev/null +++ b/pythainlp/generator/thai2fit.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +""" +Thai2fit : Thai Wiki Language Model for Text Generation + +Code from https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb +""" +__all__ = [ + "gen_sentence" +] + +import pandas as pd +import random +from ast import literal_eval +from collections import Counter +import re + +#fastai +import fastai +from fastai.text import * +from fastai.callbacks import CSVLogger + +#pythainlp +from pythainlp.ulmfit import * + +#get dummy data +imdb = untar_data(URLs.IMDB_SAMPLE) +dummy_df = pd.read_csv(imdb/'texts.csv') + +#get vocab +thwiki = "" +try: + thwiki =_THWIKI_LSTM +except: + thwiki = THWIKI_LSTM + +thwiki_itos = pickle.load(open(thwiki['itos_fname'],'rb')) +thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos) + +#dummy databunch +tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th) +processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False), + NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)] +data_lm = (TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor) + .split_by_rand_pct(0.2) + .label_for_lm() + .databunch(bs=64)) + + +data_lm.sanity_check() + 
+config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True, + output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15) +trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1) + +learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args) + +#load pretrained models +learn.load_pretrained(**thwiki) + +def gen_sentence(N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True): + if start_seq is None: start_seq = random.choice(list(thwiki_itos)) + list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-') + if output_str: + return ''.join(list_word) + return list_word \ No newline at end of file diff --git a/setup.py b/setup.py index 1dabf3bcb..9b1d12379 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ "wangchanberta": ["transformers", "sentencepiece"], "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], + "text_generator": ["fastai"], "full": [ "PyYAML>=5.3.1", "attacut>=1.0.4", @@ -74,6 +75,7 @@ "ssg>=0.0.6", "torch>=1.0.0", "transformers>=4.1.1", + "fastai" ], } diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 69fa22dc0..acc5812af 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -10,6 +10,7 @@ get_corpus_db, get_corpus_db_detail, get_corpus_path, + oscar, provinces, remove, thai_family_names, @@ -98,12 +99,20 @@ def test_corpus(self): ) self.assertIsNotNone(download(name="test", version="0.1")) self.assertIsNotNone(remove("test")) + + def test_oscar(self): + self.assertIsNotNone(oscar.word_freqs()) + self.assertIsNotNone(oscar.unigram_word_freqs()) def test_tnc(self): self.assertIsNotNone(tnc.word_freqs()) + self.assertIsNotNone(tnc.unigram_word_freqs()) + self.assertIsNotNone(tnc.bigram_word_freqs()) + self.assertIsNotNone(tnc.tigram_word_freqs()) def test_ttc(self): self.assertIsNotNone(ttc.word_freqs()) + 
self.assertIsNotNone(ttc.unigram_word_freqs()) def test_wordnet(self): self.assertIsInstance(wordnet.langs(), list) diff --git a/tests/test_generator.py b/tests/test_generator.py new file mode 100644 index 000000000..4e03cabab --- /dev/null +++ b/tests/test_generator.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +import unittest + +from pythainlp.generator import Unigram, Bigram, Tigram +from pythainlp.generator.thai2fit import gen_sentence + +class TestGeneratorPackage(unittest.TestCase): + def test_unigram(self): + _tnc_unigram = Unigram("tnc") + self.assertIsNotNone(_tnc_unigram.gen_sentence("ผมชอบไปโรงเรียน")) + _ttc_unigram = Unigram("ttc") + self.assertIsNotNone(_ttc_unigram.gen_sentence("ผมชอบไปโรงเรียน")) + _oscar_unigram = Unigram("oscar") + self.assertIsNotNone(_oscar_unigram.gen_sentence("ผมชอบไปโรงเรียน")) + def test_bigram(self): + _bigram = Bigram() + self.assertIsNotNone(_bigram.gen_sentence("ผมชอบไปโรงเรียน")) + def test_tigram(self): + _tigram = Tigram() + self.assertIsNotNone(_tigram.gen_sentence("ผมชอบไปโรงเรียน")) + def test_thai2fit(self): + self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน")) \ No newline at end of file From 5375ee051b29f9e62c110a34704b3367fb8dbf85 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 20:17:13 +0700 Subject: [PATCH 02/26] fixed pep8 --- pythainlp/corpus/oscar.py | 6 +++--- pythainlp/corpus/tnc.py | 8 ++++---- pythainlp/corpus/ttc.py | 4 ++-- pythainlp/generator/__init__.py | 2 +- pythainlp/generator/thai2fit.py | 13 +++++++------ tests/test_corpus.py | 2 +- tests/test_generator.py | 6 +++++- 7 files changed, 23 insertions(+), 18 deletions(-) diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py index e23dbb609..7e19cdf99 100644 --- a/pythainlp/corpus/oscar.py +++ b/pythainlp/corpus/oscar.py @@ -25,7 +25,7 @@ def word_freqs() -> List[Tuple[str, int]]: """ word_freqs = [] _path = get_corpus_path(_FILENAME) - with open(_path,"r",encoding="utf-8") as f: + with open(_path, 
"r", encoding="utf-8") as f: for line in f.readlines(): word_freq = line.strip().split(",") if len(word_freq) >= 2: @@ -45,9 +45,9 @@ def unigram_word_freqs() -> defaultdict: del _data[0] for i in _data: _temp = i.strip().split(",") - if _temp[0]!=" " and '"' not in _temp[0]: + if _temp[0] != " " and '"' not in _temp[0]: _word_freqs[_temp[0]] = int(_temp[-1]) - elif _temp[0]==" ": + elif _temp[0] == " ": _word_freqs[""] = int(_temp[-1]) return _word_freqs diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 9345cea93..0a147d70a 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -50,7 +50,7 @@ def unigram_word_freqs() -> defaultdict: for i in lines: _temp = i.strip().split(" ") if len(_temp) >= 2: - _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1]) + _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1]) return _word_freqs @@ -64,7 +64,7 @@ def bigram_word_freqs() -> defaultdict: with open(_path, "r", encoding="utf-8-sig") as fh: for i in fh.readlines(): _temp = i.strip().split(" ") - _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1]) + _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1]) return _word_freqs @@ -78,6 +78,6 @@ def tigram_word_freqs() -> defaultdict: with open(_path, "r", encoding="utf-8-sig") as fh: for i in fh.readlines(): _temp = i.strip().split(" ") - _word_freqs[(_temp[0],_temp[1],_temp[2])] = int(_temp[-1]) + _word_freqs[(_temp[0], _temp[1], _temp[2])] = int(_temp[-1]) - return _word_freqs \ No newline at end of file + return _word_freqs diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py index a42fa4c05..000a7f484 100644 --- a/pythainlp/corpus/ttc.py +++ b/pythainlp/corpus/ttc.py @@ -44,6 +44,6 @@ def unigram_word_freqs() -> defaultdict: for i in lines: _temp = i.strip().split(" ") if len(_temp) >= 2: - _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1]) + _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1]) - return _word_freqs \ No newline at end of file + return _word_freqs diff --git 
a/pythainlp/generator/__init__.py b/pythainlp/generator/__init__.py index 637497b45..cb18dd716 100644 --- a/pythainlp/generator/__init__.py +++ b/pythainlp/generator/__init__.py @@ -9,4 +9,4 @@ "Tigram" ] -from pythainlp.generator.core import Unigram, Bigram, Tigram \ No newline at end of file +from pythainlp.generator.core import Unigram, Bigram, Tigram diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index d19d0f648..d71eff595 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -14,19 +14,19 @@ from collections import Counter import re -#fastai +# fastai import fastai from fastai.text import * from fastai.callbacks import CSVLogger -#pythainlp +# pythainlp from pythainlp.ulmfit import * -#get dummy data +# get dummy data imdb = untar_data(URLs.IMDB_SAMPLE) dummy_df = pd.read_csv(imdb/'texts.csv') -#get vocab +# get vocab thwiki = "" try: thwiki =_THWIKI_LSTM @@ -36,7 +36,7 @@ thwiki_itos = pickle.load(open(thwiki['itos_fname'],'rb')) thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos) -#dummy databunch +# dummy databunch tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th) processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False), NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)] @@ -57,9 +57,10 @@ #load pretrained models learn.load_pretrained(**thwiki) + def gen_sentence(N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True): if start_seq is None: start_seq = random.choice(list(thwiki_itos)) list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-') if output_str: return ''.join(list_word) - return list_word \ No newline at end of file + return list_word diff --git a/tests/test_corpus.py b/tests/test_corpus.py index acc5812af..26196129e 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -99,7 +99,7 @@ def test_corpus(self): ) 
self.assertIsNotNone(download(name="test", version="0.1")) self.assertIsNotNone(remove("test")) - + def test_oscar(self): self.assertIsNotNone(oscar.word_freqs()) self.assertIsNotNone(oscar.unigram_word_freqs()) diff --git a/tests/test_generator.py b/tests/test_generator.py index 4e03cabab..eb611ae38 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -5,6 +5,7 @@ from pythainlp.generator import Unigram, Bigram, Tigram from pythainlp.generator.thai2fit import gen_sentence + class TestGeneratorPackage(unittest.TestCase): def test_unigram(self): _tnc_unigram = Unigram("tnc") @@ -13,11 +14,14 @@ def test_unigram(self): self.assertIsNotNone(_ttc_unigram.gen_sentence("ผมชอบไปโรงเรียน")) _oscar_unigram = Unigram("oscar") self.assertIsNotNone(_oscar_unigram.gen_sentence("ผมชอบไปโรงเรียน")) + def test_bigram(self): _bigram = Bigram() self.assertIsNotNone(_bigram.gen_sentence("ผมชอบไปโรงเรียน")) + def test_tigram(self): _tigram = Tigram() self.assertIsNotNone(_tigram.gen_sentence("ผมชอบไปโรงเรียน")) + def test_thai2fit(self): - self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน")) \ No newline at end of file + self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน")) From 652f67ee9eaf5389574634defa7a29c5911de7ca Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 20:26:07 +0700 Subject: [PATCH 03/26] fixed oscar bug --- pythainlp/corpus/oscar.py | 9 ++++--- pythainlp/generator/core.py | 6 +++-- pythainlp/generator/thai2fit.py | 43 ++++++++++++++++++++++++--------- 3 files changed, 42 insertions(+), 16 deletions(-) diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py index 7e19cdf99..187a8c3dc 100644 --- a/pythainlp/corpus/oscar.py +++ b/pythainlp/corpus/oscar.py @@ -27,9 +27,12 @@ def word_freqs() -> List[Tuple[str, int]]: _path = get_corpus_path(_FILENAME) with open(_path, "r", encoding="utf-8") as f: for line in f.readlines(): - word_freq = line.strip().split(",") - if len(word_freq) >= 2: - 
word_freqs.append((word_freq[0], int(word_freq[1]))) + _temp = line.strip().split(",") + if len(_temp) >= 2: + if _temp[0] != " " and '"' not in _temp[0]: + word_freqs.append((_temp[0], int(_temp[1]))) + elif _temp[0] == " ": + word_freqs.append(("", int(_temp[1]))) return word_freqs diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index 9ca7a1395..349e18171 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -4,11 +4,13 @@ from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram from pythainlp.corpus.tnc import tigram_word_freqs as tnc_word_freqs_tigram from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram -from pythainlp.corpus.oscar import unigram_word_freqs as oscar_word_freqs_unigram +from pythainlp.corpus.oscar import ( + unigram_word_freqs as oscar_word_freqs_unigram +) class Unigram: - def __init__(self, name:str="tnc"): + def __init__(self, name: str = "tnc"): """ :param str name: corpus name :rtype: None diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index d71eff595..2fa78a25b 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -33,23 +33,39 @@ except: thwiki = THWIKI_LSTM -thwiki_itos = pickle.load(open(thwiki['itos_fname'],'rb')) +thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb')) thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos) # dummy databunch -tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th) -processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False), - NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)] -data_lm = (TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor) - .split_by_rand_pct(0.2) - .label_for_lm() - .databunch(bs=64)) +tt = Tokenizer(tok_func=ThaiTokenizer, lang='th', pre_rules=pre_rules_th, post_rules=post_rules_th) +processor = [ + 
TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False), + NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3) +] +data_lm = ( + TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor) + .split_by_rand_pct(0.2) + .label_for_lm() + .databunch(bs=64) +) data_lm.sanity_check() -config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True, - output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15) +config = dict( + emb_sz=400, + n_hid=1550, + n_layers=4, + pad_token=1, + qrnn=False, + tie_weights=True, + out_bias=True, + output_p=0.25, + hidden_p=0.1, + input_p=0.2, + embed_p=0.02, + weight_p=0.15 +) trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1) learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args) @@ -58,7 +74,12 @@ learn.load_pretrained(**thwiki) -def gen_sentence(N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True): +def gen_sentence( + N:int=4, + prob:float=0.001, + start_seq:str=None, + output_str:bool = True +): if start_seq is None: start_seq = random.choice(list(thwiki_itos)) list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-') if output_str: From 22f0e1e6cd85224add966694ac1ad1ce248c4ab8 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 20:39:37 +0700 Subject: [PATCH 04/26] Update thai2fit.py --- pythainlp/generator/thai2fit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index 2fa78a25b..66ca2dffb 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -17,7 +17,6 @@ # fastai import fastai from fastai.text import * -from fastai.callbacks import CSVLogger # pythainlp from pythainlp.ulmfit import * From 772395a30297a1143a0e7926e262b33d4bfeae7c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 
2021 20:51:21 +0700 Subject: [PATCH 05/26] Update thai2fit.py --- pythainlp/generator/thai2fit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index 66ca2dffb..d00a9c23e 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -12,11 +12,12 @@ import random from ast import literal_eval from collections import Counter -import re +import pickle # fastai import fastai from fastai.text import * +from fastai.data.external import * # pythainlp from pythainlp.ulmfit import * From 69c4dbff104e9466d8bbe1764d66ffe0b2e45d6e Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 21:38:51 +0700 Subject: [PATCH 06/26] Update fastai --- pythainlp/generator/thai2fit.py | 1 - setup.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index d00a9c23e..be2cafe62 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -17,7 +17,6 @@ # fastai import fastai from fastai.text import * -from fastai.data.external import * # pythainlp from pythainlp.ulmfit import * diff --git a/setup.py b/setup.py index 9b1d12379..6421b5638 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ "wangchanberta": ["transformers", "sentencepiece"], "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], - "text_generator": ["fastai"], + "text_generator": ["fastai<2.0"], "full": [ "PyYAML>=5.3.1", "attacut>=1.0.4", @@ -75,7 +75,7 @@ "ssg>=0.0.6", "torch>=1.0.0", "transformers>=4.1.1", - "fastai" + "fastai<2.0" ], } From ae2eb795844c71e2252903f4ca803d0db16e98cd Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 21:54:21 +0700 Subject: [PATCH 07/26] Update core.py --- pythainlp/generator/core.py | 73 ++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git 
a/pythainlp/generator/core.py b/pythainlp/generator/core.py index 349e18171..49f451d30 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -25,10 +25,10 @@ def __init__(self, name: str = "tnc"): self.n = 0 for i in self.word: self.n += self.counts[i] - self.prob = {i:self.counts[i]/self.n for i in self.word} + self.prob = {i:self.counts[i] / self.n for i in self.word} self._word_prob = {} - def gen_sentence(self,N:int=3,prob:float=0.001, start_seq:str=None, output_str:bool = True, duplicate:bool=False): + def gen_sentence(self, N: int = 3,prob: float = 0.001, start_seq: str = None, output_str: bool = True, duplicate: bool = False): """ :param int N: number of word. :param str start_seq: word for begin word. @@ -38,20 +38,21 @@ def gen_sentence(self,N:int=3,prob:float=0.001, start_seq:str=None, output_str:b :return: list words or str words :rtype: str,list """ - if start_seq is None: start_seq = random.choice(self.word) + if start_seq is None: + start_seq = random.choice(self.word) rand_text = start_seq.lower() - self._word_prob = {i:self.counts[i]/self.n for i in self.word if self.counts[i]/self.n>=prob} - return self.next_word(rand_text, N, output_str,prob=prob, duplicate=duplicate) + self._word_prob = {i:self.counts[i] / self.n for i in self.word if self.counts[i] / self.n >= prob} + return self.next_word(rand_text, N, output_str, prob = prob, duplicate = duplicate) - def next_word(self,text:str, N:int, output_str:str,prob, duplicate:bool=False): + def next_word(self, text: str, N: int, output_str: str, prob: float, duplicate: bool = False): self.l = [] self.l.append(text) self._word_list = list(self._word_prob.keys()) - if N>len(self._word_list): - N=len(self._word_list) + if N > len(self._word_list): + N =len(self._word_list) for i in range(N): self._word = random.choice(self._word_list) - if duplicate == False: + if duplicate is False: while self._word in self.l: self._word = random.choice(self._word_list) self.l.append(self._word) @@ 
-62,7 +63,7 @@ def next_word(self,text:str, N:int, output_str:str,prob, duplicate:bool=False): class Bigram: - def __init__(self,name:str="tnc"): + def __init__(self, name: str = "tnc"): """ :param str name: corpus name :rtype: None @@ -74,10 +75,10 @@ def __init__(self,name:str="tnc"): self.bi_keys = list(self.bi.keys()) self.words = [i[-1] for i in self.bi_keys] - def prob(self, t1:str, t2:str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 + def prob(self, t1: str, t2: str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 """ probability word - + :param int t1: text 1 :param int t2: text 2 @@ -85,12 +86,12 @@ def prob(self, t1:str, t2:str): # from https://towardsdatascience.com/understand :rtype: float """ try: - v=self.bi[(t1,t2)]/self.uni[t1] + v = self.bi[(t1, t2)] / self.uni[t1] except: - v=0.0 + v = 0.0 return v - def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True, duplicate:bool=False): + def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: str = None, output_str: bool = True, duplicate: bool = False): if start_seq is None: start_seq = random.choice(self.words) self.late_word = start_seq self.list_word = [] @@ -98,12 +99,17 @@ def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:str=None, output_str:b for i in range(N): if duplicate: - self._temp = [j for j in self.bi_keys if j[0]==self.late_word] + self._temp = [ + j for j in self.bi_keys if j[0] == self.late_word + ] else: - self._temp = [j for j in self.bi_keys if j[0]==self.late_word and j[1] not in self.list_word] - self._probs = [self.prob(self.late_word,l[-1]) for l in self._temp] - self._p2 = [j for j in self._probs if j>=prob] - if len(self._p2)==0: + self._temp = [ + j for j in self.bi_keys + if j[0]==self.late_word and j[1] not in self.list_word + ] + self._probs = 
[self.prob(self.late_word, l[-1]) for l in self._temp] + self._p2 = [j for j in self._probs if j >= prob] + if len(self._p2) == 0: break self.items = self._temp[self._probs.index(random.choice(self._p2))] self.late_word = self.items[-1] @@ -114,7 +120,7 @@ def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:str=None, output_str:b class Tigram: - def __init__(self,name:str="tnc"): + def __init__(self, name: str = "tnc"): """ :param str name: corpus name :rtype: None @@ -126,9 +132,9 @@ def __init__(self,name:str="tnc"): self.uni_keys = list(self.uni.keys()) self.bi_keys = list(self.bi.keys()) self.ti_keys = list(self.ti.keys()) - self.words = [i[-1] for i in self.bi_keys] + self.words = [i[-1] for i in self.bi_keys] - def prob(self, t1:str, t2:str, t3:str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 + def prob(self, t1: str, t2: str, t3: str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 """ probability word @@ -140,25 +146,26 @@ def prob(self, t1:str, t2:str, t3:str): # from https://towardsdatascience.com/un :rtype: float """ try: - v=self.ti[(t1, t2, t3)]/self.bi[(t1, t2)] + v = self.ti[(t1, t2, t3)] / self.bi[(t1, t2)] except: - v=0.0 + v = 0.0 return v - def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:tuple=None, output_str:bool = True, duplicate:bool=False): - if start_seq is None: start_seq = random.choice(self.bi_keys) + def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: tuple = None, output_str: bool = True, duplicate: bool = กFalse): + if start_seq is None: + start_seq = random.choice(self.bi_keys) self.late_word = start_seq self.list_word = [] self.list_word.append(start_seq) for i in range(N): if duplicate: - self._temp = [j for j in self.ti_keys if j[:2]==self.late_word] + self._temp = [j for j in self.ti_keys if j[:2] == self.late_word] else: - 
self._temp = [j for j in self.ti_keys if j[:2]==self.late_word and j[1:] not in self.list_word] - self._probs = [self.prob(l[0],l[1],l[2]) for l in self._temp] - self._p2 = [j for j in self._probs if j>=prob] - if len(self._p2)==0: + self._temp = [j for j in self.ti_keys if j[:2] == self.late_word and j[1:] not in self.list_word] + self._probs = [self.prob(l[0], l[1], l[2]) for l in self._temp] + self._p2 = [j for j in self._probs if j >= prob] + if len(self._p2) == 0: break self.items = self._temp[self._probs.index(random.choice(self._p2))] self.late_word = self.items[1:] @@ -170,4 +177,4 @@ def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:tuple=None, output_str self.listdata.append(j) if output_str: return ''.join(self.listdata) - return self.listdata \ No newline at end of file + return self.listdata From 7e88b39ea7e496a2c328c456d722b7c62e8644e6 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 21:56:00 +0700 Subject: [PATCH 08/26] Update core.py --- pythainlp/generator/core.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index 49f451d30..acd9006c0 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -1,4 +1,9 @@ # -*- coding: utf-8 -*- +""" +Text generator using Unigram, Bigram and Tigram + +code from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 +""" import random from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram @@ -75,7 +80,7 @@ def __init__(self, name: str = "tnc"): self.bi_keys = list(self.bi.keys()) self.words = [i[-1] for i in self.bi_keys] - def prob(self, t1: str, t2: str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 + def prob(self, 
t1: str, t2: str): """ probability word @@ -134,7 +139,7 @@ def __init__(self, name: str = "tnc"): self.ti_keys = list(self.ti.keys()) self.words = [i[-1] for i in self.bi_keys] - def prob(self, t1: str, t2: str, t3: str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 + def prob(self, t1: str, t2: str, t3: str): """ probability word From 57c7a0a72a3321865f7d392b60ab9c36ddc64ffd Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 22:13:27 +0700 Subject: [PATCH 09/26] Update code --- pythainlp/corpus/oscar.py | 4 +++- pythainlp/generator/core.py | 2 +- pythainlp/generator/thai2fit.py | 5 ++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py index 187a8c3dc..085f5bc41 100644 --- a/pythainlp/corpus/oscar.py +++ b/pythainlp/corpus/oscar.py @@ -26,7 +26,9 @@ def word_freqs() -> List[Tuple[str, int]]: word_freqs = [] _path = get_corpus_path(_FILENAME) with open(_path, "r", encoding="utf-8") as f: - for line in f.readlines(): + _data = [i for i in f.readlines()] + del _data[0] + for line in _data: _temp = line.strip().split(",") if len(_temp) >= 2: if _temp[0] != " " and '"' not in _temp[0]: diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index acd9006c0..27f77aa26 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -156,7 +156,7 @@ def prob(self, t1: str, t2: str, t3: str): v = 0.0 return v - def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: tuple = None, output_str: bool = True, duplicate: bool = กFalse): + def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: tuple = None, output_str: bool = True, duplicate: bool = False): if start_seq is None: start_seq = random.choice(self.bi_keys) self.late_word = start_seq diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index 
be2cafe62..1a6570bcb 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -10,8 +10,6 @@ import pandas as pd import random -from ast import literal_eval -from collections import Counter import pickle # fastai @@ -79,7 +77,8 @@ def gen_sentence( start_seq:str=None, output_str:bool = True ): - if start_seq is None: start_seq = random.choice(list(thwiki_itos)) + if start_seq is None: + start_seq = random.choice(list(thwiki_itos)) list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-') if output_str: return ''.join(list_word) From ec3e66291c2301c83e99e8721438d7b8778d7199 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 11 Jun 2021 22:38:34 +0700 Subject: [PATCH 10/26] Update code --- pythainlp/generator/core.py | 6 +++--- tests/test_generator.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index 27f77aa26..3be47e7d3 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -33,7 +33,7 @@ def __init__(self, name: str = "tnc"): self.prob = {i:self.counts[i] / self.n for i in self.word} self._word_prob = {} - def gen_sentence(self, N: int = 3,prob: float = 0.001, start_seq: str = None, output_str: bool = True, duplicate: bool = False): + def gen_sentence(self, start_seq: str = None, N: int = 3,prob: float = 0.001, output_str: bool = True, duplicate: bool = False): """ :param int N: number of word. :param str start_seq: word for begin word. 
@@ -96,7 +96,7 @@ def prob(self, t1: str, t2: str): v = 0.0 return v - def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: str = None, output_str: bool = True, duplicate: bool = False): + def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False): if start_seq is None: start_seq = random.choice(self.words) self.late_word = start_seq self.list_word = [] @@ -156,7 +156,7 @@ def prob(self, t1: str, t2: str, t3: str): v = 0.0 return v - def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: tuple = None, output_str: bool = True, duplicate: bool = False): + def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False): if start_seq is None: start_seq = random.choice(self.bi_keys) self.late_word = start_seq diff --git a/tests/test_generator.py b/tests/test_generator.py index eb611ae38..132b5f737 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -9,19 +9,19 @@ class TestGeneratorPackage(unittest.TestCase): def test_unigram(self): _tnc_unigram = Unigram("tnc") - self.assertIsNotNone(_tnc_unigram.gen_sentence("ผมชอบไปโรงเรียน")) + self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม")) _ttc_unigram = Unigram("ttc") - self.assertIsNotNone(_ttc_unigram.gen_sentence("ผมชอบไปโรงเรียน")) + self.assertIsNotNone(_ttc_unigram.gen_sentence("ผม")) _oscar_unigram = Unigram("oscar") - self.assertIsNotNone(_oscar_unigram.gen_sentence("ผมชอบไปโรงเรียน")) + self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม")) def test_bigram(self): _bigram = Bigram() - self.assertIsNotNone(_bigram.gen_sentence("ผมชอบไปโรงเรียน")) + self.assertIsNotNone(_bigram.gen_sentence("ผม")) def test_tigram(self): _tigram = Tigram() - self.assertIsNotNone(_tigram.gen_sentence("ผมชอบไปโรงเรียน")) + self.assertIsNotNone(_tigram.gen_sentence("ผม")) def test_thai2fit(self): self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน")) From 
466f0537cbd14bc7f544cb67c6f791bd9d253aa5 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 12 Jun 2021 00:01:16 +0700 Subject: [PATCH 11/26] Update test_generator.py --- tests/test_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index 132b5f737..06bfe4631 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -24,4 +24,4 @@ def test_tigram(self): self.assertIsNotNone(_tigram.gen_sentence("ผม")) def test_thai2fit(self): - self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน")) + self.assertIsNotNone(gen_sentence("กาลครั้งหนึ่งนานมาแล้ว")) From fd3215e2eb5713268f703b150d8d1cc49e40d791 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 12 Jun 2021 12:27:35 +0700 Subject: [PATCH 12/26] Update thai2fit.py --- pythainlp/generator/thai2fit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index 1a6570bcb..f157af03b 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -72,9 +72,9 @@ def gen_sentence( + start_seq:str=None, N:int=4, prob:float=0.001, - start_seq:str=None, output_str:bool = True ): if start_seq is None: From 84aadbca65fbc8fc26c254185c334c2fbbf26114 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 12 Jun 2021 12:53:37 +0700 Subject: [PATCH 13/26] fixed bug --- pythainlp/corpus/tnc.py | 2 +- pythainlp/corpus/ttc.py | 2 +- pythainlp/generator/core.py | 15 +++++++---- pythainlp/generator/thai2fit.py | 47 ++++++++++++++++++++++----------- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 0a147d70a..2dfeb8690 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -50,7 +50,7 @@ def unigram_word_freqs() -> defaultdict: for i in lines: _temp = i.strip().split(" ") if len(_temp) >= 2: - _word_freqs[(_temp[0], 
_temp[1])] = int(_temp[-1]) + _word_freqs[_temp[0]] = int(_temp[-1]) return _word_freqs diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py index 000a7f484..c3ffa0c0d 100644 --- a/pythainlp/corpus/ttc.py +++ b/pythainlp/corpus/ttc.py @@ -44,6 +44,6 @@ def unigram_word_freqs() -> defaultdict: for i in lines: _temp = i.strip().split(" ") if len(_temp) >= 2: - _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1]) + _word_freqs[_temp[0]] = int(_temp[-1]) return _word_freqs diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index 3be47e7d3..acdd40cfe 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -54,14 +54,14 @@ def next_word(self, text: str, N: int, output_str: str, prob: float, duplicate: self.l.append(text) self._word_list = list(self._word_prob.keys()) if N > len(self._word_list): - N =len(self._word_list) + N = len(self._word_list) for i in range(N): self._word = random.choice(self._word_list) if duplicate is False: while self._word in self.l: self._word = random.choice(self._word_list) self.l.append(self._word) - + if output_str: return "".join(self.l) return self.l @@ -142,7 +142,7 @@ def __init__(self, name: str = "tnc"): def prob(self, t1: str, t2: str, t3: str): """ probability word - + :param int t1: text 1 :param int t2: text 2 :param int t3: text 3 @@ -165,9 +165,14 @@ def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, o for i in range(N): if duplicate: - self._temp = [j for j in self.ti_keys if j[:2] == self.late_word] + self._temp = [ + j for j in self.ti_keys if j[:2] == self.late_word + ] else: - self._temp = [j for j in self.ti_keys if j[:2] == self.late_word and j[1:] not in self.list_word] + self._temp = [ + j for j in self.ti_keys + if j[:2] == self.late_word and j[1:] not in self.list_word + ] self._probs = [self.prob(l[0], l[1], l[2]) for l in self._temp] self._p2 = [j for j in self._probs if j >= prob] if len(self._p2) == 0: diff --git 
a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index f157af03b..17fb5a094 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -26,15 +26,20 @@ # get vocab thwiki = "" try: - thwiki =_THWIKI_LSTM + thwiki =_THWIKI_LSTM except: - thwiki = THWIKI_LSTM + thwiki = THWIKI_LSTM thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb')) thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos) # dummy databunch -tt = Tokenizer(tok_func=ThaiTokenizer, lang='th', pre_rules=pre_rules_th, post_rules=post_rules_th) +tt = Tokenizer( + tok_func = ThaiTokenizer, + lang = 'th', + pre_rules = pre_rules_th, + post_rules = post_rules_th +) processor = [ TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False), NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3) @@ -65,21 +70,33 @@ ) trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1) -learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args) +learn = language_model_learner( + data_lm, + AWD_LSTM, + config=config, + pretrained=False, + **trn_args +) -#load pretrained models +# load pretrained models learn.load_pretrained(**thwiki) def gen_sentence( - start_seq:str=None, - N:int=4, - prob:float=0.001, - output_str:bool = True + start_seq: str = None, + N: int = 4, + prob: float = 0.001, + output_str: bool = True ): - if start_seq is None: - start_seq = random.choice(list(thwiki_itos)) - list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-') - if output_str: - return ''.join(list_word) - return list_word + if start_seq is None: + start_seq = random.choice(list(thwiki_itos)) + list_word = learn.predict( + start_seq, + N, + temperature=0.8, + min_p=prob, + sep = '-*-' + ).split('-*-') + if output_str: + return ''.join(list_word) + return list_word From 99774e250c87f62d68d45c96498227372a98a0ca Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 12 Jun 2021 
13:17:33 +0700 Subject: [PATCH 14/26] Add test --- tests/test_generator.py | 6 ++++ tests/test_ulmfit.py | 61 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/tests/test_generator.py b/tests/test_generator.py index 06bfe4631..af73d8ef8 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -10,18 +10,24 @@ class TestGeneratorPackage(unittest.TestCase): def test_unigram(self): _tnc_unigram = Unigram("tnc") self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม")) + self.assertIsNotNone(_tnc_unigram.gen_sentence()) _ttc_unigram = Unigram("ttc") self.assertIsNotNone(_ttc_unigram.gen_sentence("ผม")) + self.assertIsNotNone(_ttc_unigram.gen_sentence()) _oscar_unigram = Unigram("oscar") self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม")) + self.assertIsNotNone(_oscar_unigram.gen_sentence()) def test_bigram(self): _bigram = Bigram() self.assertIsNotNone(_bigram.gen_sentence("ผม")) + self.assertIsNotNone(_bigram.gen_sentence()) def test_tigram(self): _tigram = Tigram() self.assertIsNotNone(_tigram.gen_sentence("ผม")) + self.assertIsNotNone(_tigram.gen_sentence()) def test_thai2fit(self): self.assertIsNotNone(gen_sentence("กาลครั้งหนึ่งนานมาแล้ว")) + self.assertIsNotNone(gen_sentence()) diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index a713bd8b6..828fa9b7d 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -30,6 +30,15 @@ ungroup_emoji, ) from pythainlp.ulmfit.tokenizer import BaseTokenizer +import pandas as pd +import random +import pickle +# fastai +import fastai +from fastai.text import * + +# pythainlp +from pythainlp.ulmfit import * class TestUlmfitPackage(unittest.TestCase): @@ -198,3 +207,55 @@ def test_process_thai_dense(self): ] self.assertEqual(actual, expect) + + def test_document_vector(self): + imdb = untar_data(URLs.IMDB_SAMPLE) + dummy_df = pd.read_csv(imdb/'texts.csv') + thwiki = "" + try: + thwiki =_THWIKI_LSTM + except: + thwiki = THWIKI_LSTM + thwiki_itos = 
pickle.load(open(thwiki['itos_fname'], 'rb')) + thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos) + tt = Tokenizer( + tok_func = ThaiTokenizer, + lang = 'th', + pre_rules = pre_rules_th, + post_rules = post_rules_th + ) + processor = [ + TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False), + NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3) + ] + data_lm = ( + TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor) + .split_by_rand_pct(0.2) + .label_for_lm() + .databunch(bs=64) + ) + data_lm.sanity_check() + config = dict( + emb_sz=400, + n_hid=1550, + n_layers=4, + pad_token=1, + qrnn=False, + tie_weights=True, + out_bias=True, + output_p=0.25, + hidden_p=0.1, + input_p=0.2, + embed_p=0.02, + weight_p=0.15 + ) + trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1) + learn = language_model_learner( + data_lm, + AWD_LSTM, + config=config, + pretrained=False, + **trn_args + ) + learn.load_pretrained(**thwiki) + self.assertIsNotNone(document_vector('วันนี้วันดีปีใหม่', learn, data_lm)) From 62739790c02591959036068b7f61e15feec1dd20 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 12 Jun 2021 13:49:57 +0700 Subject: [PATCH 15/26] Update test --- pythainlp/generator/core.py | 59 ++++++++++++++++++++++++--------- pythainlp/generator/thai2fit.py | 16 ++++----- tests/test_ulmfit.py | 34 ++++++++++++++----- 3 files changed, 77 insertions(+), 32 deletions(-) diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index acdd40cfe..3900bfaae 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -30,10 +30,19 @@ def __init__(self, name: str = "tnc"): self.n = 0 for i in self.word: self.n += self.counts[i] - self.prob = {i:self.counts[i] / self.n for i in self.word} + self.prob = { + i:self.counts[i] / self.n for i in self.word + } self._word_prob = {} - def gen_sentence(self, start_seq: str = None, N: int = 3,prob: float = 0.001, output_str: bool = True, 
duplicate: bool = False): + def gen_sentence( + self, + start_seq: str = None, + N: int = 3, + prob: float = 0.001, + output_str: bool = True, + duplicate: bool = False + ): """ :param int N: number of word. :param str start_seq: word for begin word. @@ -46,25 +55,35 @@ def gen_sentence(self, start_seq: str = None, N: int = 3,prob: float = 0.001, ou if start_seq is None: start_seq = random.choice(self.word) rand_text = start_seq.lower() - self._word_prob = {i:self.counts[i] / self.n for i in self.word if self.counts[i] / self.n >= prob} - return self.next_word(rand_text, N, output_str, prob = prob, duplicate = duplicate) - - def next_word(self, text: str, N: int, output_str: str, prob: float, duplicate: bool = False): - self.l = [] - self.l.append(text) + self._word_prob = { + i:self.counts[i] / self.n for i in self.word + if self.counts[i] / self.n >= prob + } + return self.next_word(rand_text, N, output_str, prob=prob, duplicate=duplicate) + + def next_word( + self, + text: str, + N: int, + output_str: str, + prob: float, + duplicate: bool = False + ): + self.words = [] + self.words.append(text) self._word_list = list(self._word_prob.keys()) if N > len(self._word_list): N = len(self._word_list) for i in range(N): self._word = random.choice(self._word_list) if duplicate is False: - while self._word in self.l: + while self._word in self.words: self._word = random.choice(self._word_list) - self.l.append(self._word) + self.words.append(self._word) if output_str: - return "".join(self.l) - return self.l + return "".join(self.words) + return self.words class Bigram: @@ -78,7 +97,7 @@ def __init__(self, name: str = "tnc"): self.bi = tnc_word_freqs_bigram() self.uni_keys = list(self.uni.keys()) self.bi_keys = list(self.bi.keys()) - self.words = [i[-1] for i in self.bi_keys] + self.words = [i[-1] for i in self.bi_keys] def prob(self, t1: str, t2: str): """ @@ -154,9 +173,17 @@ def prob(self, t1: str, t2: str, t3: str): v = self.ti[(t1, t2, t3)] / self.bi[(t1, t2)] except: 
v = 0.0 + return v - def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False): + def gen_sentence( + self, + start_seq: str = None, + N: int = 4, + prob: float = 0.001, + output_str: bool = True, + duplicate: bool = False + ): if start_seq is None: start_seq = random.choice(self.bi_keys) self.late_word = start_seq @@ -173,7 +200,9 @@ def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, o j for j in self.ti_keys if j[:2] == self.late_word and j[1:] not in self.list_word ] - self._probs = [self.prob(l[0], l[1], l[2]) for l in self._temp] + self._probs = [ + self.prob(word[0], word[1], word[2]) for word in self._temp + ] self._p2 = [j for j in self._probs if j >= prob] if len(self._p2) == 0: break diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index 17fb5a094..a6b27274c 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -26,7 +26,7 @@ # get vocab thwiki = "" try: - thwiki =_THWIKI_LSTM + thwiki = _THWIKI_LSTM except: thwiki = THWIKI_LSTM @@ -35,10 +35,10 @@ # dummy databunch tt = Tokenizer( - tok_func = ThaiTokenizer, - lang = 'th', - pre_rules = pre_rules_th, - post_rules = post_rules_th + tok_func=ThaiTokenizer, + lang='th', + pre_rules=pre_rules_th, + post_rules=post_rules_th ) processor = [ TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False), @@ -89,14 +89,14 @@ def gen_sentence( output_str: bool = True ): if start_seq is None: - start_seq = random.choice(list(thwiki_itos)) + start_seq = random.choice(list(thwiki_itos)) list_word = learn.predict( start_seq, N, temperature=0.8, min_p=prob, - sep = '-*-' + sep='-*-' ).split('-*-') if output_str: - return ''.join(list_word) + return ''.join(list_word) return list_word diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index 828fa9b7d..3f66681c2 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -213,23 +213,32 @@ def 
test_document_vector(self): dummy_df = pd.read_csv(imdb/'texts.csv') thwiki = "" try: - thwiki =_THWIKI_LSTM + thwiki = _THWIKI_LSTM except: thwiki = THWIKI_LSTM thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb')) thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos) tt = Tokenizer( - tok_func = ThaiTokenizer, - lang = 'th', - pre_rules = pre_rules_th, - post_rules = post_rules_th + tok_func=ThaiTokenizer, + lang='th', + pre_rules=pre_rules_th, + post_rules=post_rules_th ) processor = [ - TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False), - NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3) + TokenizeProcessor( + tokenizer=tt, chunksize=10000, mark_fields=False + ), + NumericalizeProcessor( + vocab=thwiki_vocab, max_vocab=60000, min_freq=3 + ) ] data_lm = ( - TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor) + TextList.from_df( + dummy_df, + imdb, + cols=['text'], + processor=processor + ) .split_by_rand_pct(0.2) .label_for_lm() .databunch(bs=64) @@ -258,4 +267,11 @@ def test_document_vector(self): **trn_args ) learn.load_pretrained(**thwiki) - self.assertIsNotNone(document_vector('วันนี้วันดีปีใหม่', learn, data_lm)) + self.assertIsNotNone( + document_vector('วันนี้วันดีปีใหม่', learn, data_lm) + ) + self.assertIsNotNone( + document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg="sum") + ) + with self.assertRaises(ValueError): + document_vector('วันนี้วันดีปีใหม่', learn, data_lm,agg='abc') From b449f01241388c1fba7b7e8f76b42865489195cd Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 12 Jun 2021 13:54:26 +0700 Subject: [PATCH 16/26] Update pep8 --- pythainlp/generator/core.py | 30 +++++++++++++++++++++++------- tests/test_ulmfit.py | 2 +- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index 3900bfaae..1af669325 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -31,7 +31,7 @@ 
def __init__(self, name: str = "tnc"): for i in self.word: self.n += self.counts[i] self.prob = { - i:self.counts[i] / self.n for i in self.word + i: self.counts[i] / self.n for i in self.word } self._word_prob = {} @@ -56,10 +56,16 @@ def gen_sentence( start_seq = random.choice(self.word) rand_text = start_seq.lower() self._word_prob = { - i:self.counts[i] / self.n for i in self.word + i: self.counts[i] / self.n for i in self.word if self.counts[i] / self.n >= prob } - return self.next_word(rand_text, N, output_str, prob=prob, duplicate=duplicate) + return self.next_word( + rand_text, + N, + output_str, + prob=prob, + duplicate=duplicate + ) def next_word( self, @@ -115,8 +121,16 @@ def prob(self, t1: str, t2: str): v = 0.0 return v - def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False): - if start_seq is None: start_seq = random.choice(self.words) + def gen_sentence( + self, + start_seq: str = None, + N: int = 4, + prob: float = 0.001, + output_str: bool = True, + duplicate: bool = False + ): + if start_seq is None: + start_seq = random.choice(self.words) self.late_word = start_seq self.list_word = [] self.list_word.append(start_seq) @@ -129,9 +143,11 @@ def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, o else: self._temp = [ j for j in self.bi_keys - if j[0]==self.late_word and j[1] not in self.list_word + if j[0] == self.late_word and j[1] not in self.list_word ] - self._probs = [self.prob(self.late_word, l[-1]) for l in self._temp] + self._probs = [ + self.prob(self.late_word, next_word[-1]) for next_word in self._temp + ] self._p2 = [j for j in self._probs if j >= prob] if len(self._p2) == 0: break diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index 3f66681c2..3aa807704 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -274,4 +274,4 @@ def test_document_vector(self): document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg="sum") ) 
with self.assertRaises(ValueError): - document_vector('วันนี้วันดีปีใหม่', learn, data_lm,agg='abc') + document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg='abc') From 83c54a84ad2be46718ab817946c1632efb804311 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 12 Jun 2021 14:05:17 +0700 Subject: [PATCH 17/26] Add docs --- pythainlp/generator/core.py | 48 ++++++++++++++++++++++++++------- pythainlp/generator/thai2fit.py | 14 +++++++++- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index 1af669325..2df135a10 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -12,12 +12,18 @@ from pythainlp.corpus.oscar import ( unigram_word_freqs as oscar_word_freqs_unigram ) +from typing import List, Union class Unigram: def __init__(self, name: str = "tnc"): """ + Text generator using Unigram + :param str name: corpus name + * *tnc* - Thai National Corpus (default) + * *ttc* - Thai Textbook Corpus (TTC) + * *oscar* - OSCAR Corpus :rtype: None """ if name == "tnc": @@ -42,15 +48,15 @@ def gen_sentence( prob: float = 0.001, output_str: bool = True, duplicate: bool = False - ): + ) -> Union[List[str], str]: """ - :param int N: number of word. :param str start_seq: word for begin word. + :param int N: number of word. 
:param bool output_str: output is str :param bool duplicate: duplicate word in sent :return: list words or str words - :rtype: str,list + :rtype: List[str], str """ if start_seq is None: start_seq = random.choice(self.word) @@ -59,7 +65,7 @@ def gen_sentence( i: self.counts[i] / self.n for i in self.word if self.counts[i] / self.n >= prob } - return self.next_word( + return self._next_word( rand_text, N, output_str, @@ -67,7 +73,7 @@ def gen_sentence( duplicate=duplicate ) - def next_word( + def _next_word( self, text: str, N: int, @@ -95,7 +101,10 @@ def next_word( class Bigram: def __init__(self, name: str = "tnc"): """ + Text generator using Bigram + :param str name: corpus name + * *tnc* - Thai National Corpus (default) :rtype: None """ if name == "tnc": @@ -105,7 +114,7 @@ def __init__(self, name: str = "tnc"): self.bi_keys = list(self.bi.keys()) self.words = [i[-1] for i in self.bi_keys] - def prob(self, t1: str, t2: str): + def prob(self, t1: str, t2: str) -> float: """ probability word @@ -128,7 +137,16 @@ def gen_sentence( prob: float = 0.001, output_str: bool = True, duplicate: bool = False - ): + ) -> Union[List[str], str]: + """ + :param str start_seq: word for begin word. + :param int N: number of word. 
+ :param bool output_str: output is str + :param bool duplicate: duplicate word in sent + + :return: list words or str words + :rtype: List[str], str + """ if start_seq is None: start_seq = random.choice(self.words) self.late_word = start_seq @@ -162,7 +180,10 @@ def gen_sentence( class Tigram: def __init__(self, name: str = "tnc"): """ + Text generator using Tigram + :param str name: corpus name + * *tnc* - Thai National Corpus (default) :rtype: None """ if name == "tnc": @@ -174,7 +195,7 @@ def __init__(self, name: str = "tnc"): self.ti_keys = list(self.ti.keys()) self.words = [i[-1] for i in self.bi_keys] - def prob(self, t1: str, t2: str, t3: str): + def prob(self, t1: str, t2: str, t3: str) -> float: """ probability word @@ -199,7 +220,16 @@ def gen_sentence( prob: float = 0.001, output_str: bool = True, duplicate: bool = False - ): + ) -> Union[List[str], str]: + """ + :param str start_seq: word for begin word. + :param int N: number of word. + :param bool output_str: output is str + :param bool duplicate: duplicate word in sent + + :return: list words or str words + :rtype: List[str], str + """ if start_seq is None: start_seq = random.choice(self.bi_keys) self.late_word = start_seq diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py index a6b27274c..ee45ae22c 100644 --- a/pythainlp/generator/thai2fit.py +++ b/pythainlp/generator/thai2fit.py @@ -11,6 +11,7 @@ import pandas as pd import random import pickle +from typing import List, Union # fastai import fastai @@ -87,7 +88,18 @@ def gen_sentence( N: int = 4, prob: float = 0.001, output_str: bool = True -): +) -> Union[List[str], str]: + """ + Text generator using Thai2fit + + :param str start_seq: word for begin word. + :param int N: number of word. 
+ :param bool output_str: output is str + :param bool duplicate: duplicate word in sent + + :return: list words or str words + :rtype: List[str], str + """ if start_seq is None: start_seq = random.choice(list(thwiki_itos)) list_word = learn.predict( From 6c64531f00f1ff79363c0c2cbe7f5dbfdab490bc Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 12 Jun 2021 14:15:15 +0700 Subject: [PATCH 18/26] Add test --- tests/test_generator.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_generator.py b/tests/test_generator.py index af73d8ef8..adc781cf1 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -10,23 +10,33 @@ class TestGeneratorPackage(unittest.TestCase): def test_unigram(self): _tnc_unigram = Unigram("tnc") self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม")) + self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม", output_str=False)) self.assertIsNotNone(_tnc_unigram.gen_sentence()) + self.assertIsNotNone(_tnc_unigram.gen_sentence(duplicate=True)) _ttc_unigram = Unigram("ttc") self.assertIsNotNone(_ttc_unigram.gen_sentence("ผม")) + self.assertIsNotNone(_ttc_unigram.gen_sentence("ผม", output_str=False)) self.assertIsNotNone(_ttc_unigram.gen_sentence()) + self.assertIsNotNone(_ttc_unigram.gen_sentence(duplicate=True)) _oscar_unigram = Unigram("oscar") self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม")) + self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม", output_str=False)) self.assertIsNotNone(_oscar_unigram.gen_sentence()) + self.assertIsNotNone(_oscar_unigram.gen_sentence(duplicate=True)) def test_bigram(self): _bigram = Bigram() self.assertIsNotNone(_bigram.gen_sentence("ผม")) + self.assertIsNotNone(_bigram.gen_sentence("ผม", output_str=False)) self.assertIsNotNone(_bigram.gen_sentence()) + self.assertIsNotNone(_bigram.gen_sentence(duplicate=True)) def test_tigram(self): _tigram = Tigram() self.assertIsNotNone(_tigram.gen_sentence("ผม")) + self.assertIsNotNone(_tigram.gen_sentence("ผม", 
output_str=False)) self.assertIsNotNone(_tigram.gen_sentence()) + self.assertIsNotNone(_tigram.gen_sentence(duplicate=True)) def test_thai2fit(self): self.assertIsNotNone(gen_sentence("กาลครั้งหนึ่งนานมาแล้ว")) From 0420bf6516a595e5df0468b22eebd58e7afd54ce Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 13 Jun 2021 12:10:24 +0700 Subject: [PATCH 19/26] fixed pep8 --- pythainlp/generator/core.py | 4 +++- tests/test_generator.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py index 2df135a10..9afb98218 100644 --- a/pythainlp/generator/core.py +++ b/pythainlp/generator/core.py @@ -164,7 +164,9 @@ def gen_sentence( if j[0] == self.late_word and j[1] not in self.list_word ] self._probs = [ - self.prob(self.late_word, next_word[-1]) for next_word in self._temp + self.prob( + self.late_word, next_word[-1] + ) for next_word in self._temp ] self._p2 = [j for j in self._probs if j >= prob] if len(self._p2) == 0: diff --git a/tests/test_generator.py b/tests/test_generator.py index adc781cf1..ea57a66b0 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -20,7 +20,9 @@ def test_unigram(self): self.assertIsNotNone(_ttc_unigram.gen_sentence(duplicate=True)) _oscar_unigram = Unigram("oscar") self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม")) - self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม", output_str=False)) + self.assertIsNotNone( + _oscar_unigram.gen_sentence("ผม", output_str=False) + ) self.assertIsNotNone(_oscar_unigram.gen_sentence()) self.assertIsNotNone(_oscar_unigram.gen_sentence(duplicate=True)) From b46e986b0dfd085255a53663922954449e99e72c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 14 Jun 2021 18:17:00 +0700 Subject: [PATCH 20/26] Change generator to generate --- docs/api/generate.rst | 16 ++++++++++++++++ docs/api/generator.rst | 16 ---------------- pythainlp/{generator => generate}/__init__.py | 4 ++-- 
pythainlp/{generator => generate}/core.py | 0 pythainlp/{generator => generate}/thai2fit.py | 0 setup.py | 2 +- tests/{test_generator.py => test_generate.py} | 6 +++--- 7 files changed, 22 insertions(+), 22 deletions(-) create mode 100644 docs/api/generate.rst delete mode 100644 docs/api/generator.rst rename pythainlp/{generator => generate}/__init__.py (52%) rename pythainlp/{generator => generate}/core.py (100%) rename pythainlp/{generator => generate}/thai2fit.py (100%) rename tests/{test_generator.py => test_generate.py} (92%) diff --git a/docs/api/generate.rst b/docs/api/generate.rst new file mode 100644 index 000000000..9d450862d --- /dev/null +++ b/docs/api/generate.rst @@ -0,0 +1,16 @@ +.. currentmodule:: pythainlp.generate + +pythainlp.generate +================== +The :class:`pythainlp.generate` is Thai text generate with PyThaiNLP. + +Modules +------- + +.. autoclass:: Unigram + :members: +.. autoclass:: Bigram + :members: +.. autoclass:: Tigram + :members: +.. autofunction:: pythainlp.generate.thai2fit.gen_sentence \ No newline at end of file diff --git a/docs/api/generator.rst b/docs/api/generator.rst deleted file mode 100644 index cd8252579..000000000 --- a/docs/api/generator.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. currentmodule:: pythainlp.generator - -pythainlp.generator -=================== -The :class:`pythainlp.generator` is Thai text generator with PyThaiNLP. - -Modules -------- - -.. autoclass:: Unigram - :members: -.. autoclass:: Bigram - :members: -.. autoclass:: Tigram - :members: -.. 
autofunction:: pythainlp.generator.thai2fit.gen_sentence \ No newline at end of file diff --git a/pythainlp/generator/__init__.py b/pythainlp/generate/__init__.py similarity index 52% rename from pythainlp/generator/__init__.py rename to pythainlp/generate/__init__.py index cb18dd716..9cd578864 100644 --- a/pythainlp/generator/__init__.py +++ b/pythainlp/generate/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Thai Text generator +Thai Text generate """ __all__ = [ @@ -9,4 +9,4 @@ "Tigram" ] -from pythainlp.generator.core import Unigram, Bigram, Tigram +from pythainlp.generate.core import Unigram, Bigram, Tigram diff --git a/pythainlp/generator/core.py b/pythainlp/generate/core.py similarity index 100% rename from pythainlp/generator/core.py rename to pythainlp/generate/core.py diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generate/thai2fit.py similarity index 100% rename from pythainlp/generator/thai2fit.py rename to pythainlp/generate/thai2fit.py diff --git a/setup.py b/setup.py index 6421b5638..a0c8ac39d 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ "wangchanberta": ["transformers", "sentencepiece"], "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], - "text_generator": ["fastai<2.0"], + "generate": ["fastai<2.0"], "full": [ "PyYAML>=5.3.1", "attacut>=1.0.4", diff --git a/tests/test_generator.py b/tests/test_generate.py similarity index 92% rename from tests/test_generator.py rename to tests/test_generate.py index ea57a66b0..f097bbb30 100644 --- a/tests/test_generator.py +++ b/tests/test_generate.py @@ -2,11 +2,11 @@ import unittest -from pythainlp.generator import Unigram, Bigram, Tigram -from pythainlp.generator.thai2fit import gen_sentence +from pythainlp.generate import Unigram, Bigram, Tigram +from pythainlp.generate.thai2fit import gen_sentence -class TestGeneratorPackage(unittest.TestCase): +class TestGeneratePackage(unittest.TestCase): def test_unigram(self): _tnc_unigram = Unigram("tnc") 
self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม")) From e8edeaa83abb83c8cbb669d36fc114d2d3209d19 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 21 Jun 2021 06:57:07 +0100 Subject: [PATCH 21/26] Update tnc.py --- pythainlp/corpus/tnc.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 2dfeb8690..77251ae64 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -2,7 +2,7 @@ """ Thai National Corpus word frequency -Credit: Korakot Chaovavanich‎ +Credit: Korakot Chaovavanich https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1 """ @@ -10,7 +10,7 @@ "word_freqs", "unigram_word_freqs", "bigram_word_freqs", - "tigram_word_freqs" + "trigram_word_freqs" ] from collections import defaultdict @@ -22,7 +22,7 @@ _FILENAME = "tnc_freq.txt" _BIGRAM = "tnc_bigram_word_freqs" -_TIGRAM = "tnc_tigram_word_freqs" +_TRIGRAM = "tnc_trigram_word_freqs" def word_freqs() -> List[Tuple[str, int]]: @@ -69,11 +69,11 @@ def bigram_word_freqs() -> defaultdict: return _word_freqs -def tigram_word_freqs() -> defaultdict: +def trigram_word_freqs() -> defaultdict: """ - Get tigram word frequency from Thai National Corpus (TNC) + Get trigram word frequency from Thai National Corpus (TNC) """ - _path = get_corpus_path(_TIGRAM) + _path = get_corpus_path(_TRIGRAM) _word_freqs = defaultdict(int) with open(_path, "r", encoding="utf-8-sig") as fh: for i in fh.readlines(): From 26065c6c2e0b6fbd6b4d5b917a608a69f49707f4 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 21 Jun 2021 06:58:34 +0100 Subject: [PATCH 22/26] Update core.py --- pythainlp/generate/core.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pythainlp/generate/core.py b/pythainlp/generate/core.py index 9afb98218..17cb57e5d 100644 --- a/pythainlp/generate/core.py +++ b/pythainlp/generate/core.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- 
""" -Text generator using Unigram, Bigram and Tigram +Text generator using n-gram language model -code from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 +code from +https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 """ import random from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram -from pythainlp.corpus.tnc import tigram_word_freqs as tnc_word_freqs_tigram +from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram from pythainlp.corpus.oscar import ( unigram_word_freqs as oscar_word_freqs_unigram @@ -179,10 +180,10 @@ def gen_sentence( return self.list_word -class Tigram: +class Trigram: def __init__(self, name: str = "tnc"): """ - Text generator using Tigram + Text generator using Trigram :param str name: corpus name * *tnc* - Thai National Corpus (default) @@ -191,7 +192,7 @@ def __init__(self, name: str = "tnc"): if name == "tnc": self.uni = tnc_word_freqs_unigram() self.bi = tnc_word_freqs_bigram() - self.ti = tnc_word_freqs_tigram() + self.ti = tnc_word_freqs_trigram() self.uni_keys = list(self.uni.keys()) self.bi_keys = list(self.bi.keys()) self.ti_keys = list(self.ti.keys()) From 892475f7ebcaa296d20714c9f71b9ec9758f0564 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 21 Jun 2021 07:00:14 +0100 Subject: [PATCH 23/26] Update thai2fit.py --- pythainlp/generate/thai2fit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pythainlp/generate/thai2fit.py b/pythainlp/generate/thai2fit.py index ee45ae22c..b07368360 100644 --- a/pythainlp/generate/thai2fit.py +++ b/pythainlp/generate/thai2fit.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- """ -Thai2fit : Thai Wiki Language Model for 
Text Generation +Thai2fit: Thai Wikipedia Language Model for Text Generation -Code from https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb +Code from +https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb """ __all__ = [ "gen_sentence" From 5c56376312b644de452d0af5ab7c94bd32a45eab Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 21 Jun 2021 14:07:04 +0700 Subject: [PATCH 24/26] Update test --- docs/api/corpus.rst | 2 +- docs/api/generate.rst | 2 +- pythainlp/generate/__init__.py | 4 ++-- tests/test_corpus.py | 2 +- tests/test_generate.py | 14 +++++++------- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index eb8d1bf16..686d5e5e3 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -37,7 +37,7 @@ TNC .. autofunction:: pythainlp.corpus.tnc.word_freqs .. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs .. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs -.. autofunction:: pythainlp.corpus.tnc.tigram_word_freqs +.. autofunction:: pythainlp.corpus.tnc.trigram_word_freqs TTC --- diff --git a/docs/api/generate.rst b/docs/api/generate.rst index 9d450862d..02459dfc3 100644 --- a/docs/api/generate.rst +++ b/docs/api/generate.rst @@ -11,6 +11,6 @@ Modules :members: .. autoclass:: Bigram :members: -.. autoclass:: Tigram +.. autoclass:: Trigram :members: .. 
autofunction:: pythainlp.generate.thai2fit.gen_sentence \ No newline at end of file diff --git a/pythainlp/generate/__init__.py b/pythainlp/generate/__init__.py index 9cd578864..fffac652c 100644 --- a/pythainlp/generate/__init__.py +++ b/pythainlp/generate/__init__.py @@ -6,7 +6,7 @@ __all__ = [ "Unigram", "Bigram", - "Tigram" + "Trigram" ] -from pythainlp.generate.core import Unigram, Bigram, Tigram +from pythainlp.generate.core import Unigram, Bigram, Trigram diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 26196129e..a5bdb108a 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -108,7 +108,7 @@ def test_tnc(self): self.assertIsNotNone(tnc.word_freqs()) self.assertIsNotNone(tnc.unigram_word_freqs()) self.assertIsNotNone(tnc.bigram_word_freqs()) - self.assertIsNotNone(tnc.tigram_word_freqs()) + self.assertIsNotNone(tnc.trigram_word_freqs()) def test_ttc(self): self.assertIsNotNone(ttc.word_freqs()) diff --git a/tests/test_generate.py b/tests/test_generate.py index f097bbb30..6405c679e 100644 --- a/tests/test_generate.py +++ b/tests/test_generate.py @@ -2,7 +2,7 @@ import unittest -from pythainlp.generate import Unigram, Bigram, Tigram +from pythainlp.generate import Unigram, Bigram, Trigram from pythainlp.generate.thai2fit import gen_sentence @@ -33,12 +33,12 @@ def test_bigram(self): self.assertIsNotNone(_bigram.gen_sentence()) self.assertIsNotNone(_bigram.gen_sentence(duplicate=True)) - def test_tigram(self): - _tigram = Tigram() - self.assertIsNotNone(_tigram.gen_sentence("ผม")) - self.assertIsNotNone(_tigram.gen_sentence("ผม", output_str=False)) - self.assertIsNotNone(_tigram.gen_sentence()) - self.assertIsNotNone(_tigram.gen_sentence(duplicate=True)) + def test_trigram(self): + _trigram = Trigram() + self.assertIsNotNone(_trigram.gen_sentence("ผม")) + self.assertIsNotNone(_trigram.gen_sentence("ผม", output_str=False)) + self.assertIsNotNone(_trigram.gen_sentence()) + self.assertIsNotNone(_trigram.gen_sentence(duplicate=True)) def 
test_thai2fit(self): self.assertIsNotNone(gen_sentence("กาลครั้งหนึ่งนานมาแล้ว")) From 82fea2e96c9482fcca7cadb1cdf37de39d814eaf Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 24 Jun 2021 13:10:13 +0700 Subject: [PATCH 25/26] Update tnc.py --- pythainlp/corpus/tnc.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 77251ae64..5f80ab972 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -1,9 +1,6 @@ # -*- coding: utf-8 -*- """ Thai National Corpus word frequency - -Credit: Korakot Chaovavanich -https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1 """ __all__ = [ @@ -30,6 +27,8 @@ def word_freqs() -> List[Tuple[str, int]]: Get word frequency from Thai National Corpus (TNC) \n(See: `dev/pythainlp/corpus/tnc_freq.txt\ `_) + + Credit: Korakot Chaovavanich https://bit.ly/3wSkZsF """ lines = list(get_corpus(_FILENAME)) word_freqs = [] From fb0d5d236a95b7912223a332e684c1cc0ebeae9f Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 17 Jul 2021 23:56:54 +0700 Subject: [PATCH 26/26] Add docs --- pythainlp/generate/core.py | 73 +++++++++++++++++++++++----------- pythainlp/generate/thai2fit.py | 11 +++++ 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/pythainlp/generate/core.py b/pythainlp/generate/core.py index 17cb57e5d..7e513c287 100644 --- a/pythainlp/generate/core.py +++ b/pythainlp/generate/core.py @@ -17,16 +17,15 @@ class Unigram: - def __init__(self, name: str = "tnc"): - """ - Text generator using Unigram + """ + Text generator using Unigram - :param str name: corpus name - * *tnc* - Thai National Corpus (default) - * *ttc* - Thai Textbook Corpus (TTC) - * *oscar* - OSCAR Corpus - :rtype: None - """ + :param str name: corpus name + * *tnc* - Thai National Corpus (default) + * *ttc* - Thai Textbook Corpus (TTC) + * *oscar* - OSCAR Corpus + """ + def __init__(self, name: str 
= "tnc"): if name == "tnc": self.counts = tnc_word_freqs_unigram() elif name == "ttc": @@ -58,6 +57,16 @@ def gen_sentence( :return: list words or str words :rtype: List[str], str + + :Example: + :: + + from pythainlp.generate import Unigram + + gen = Unigram() + + gen.gen_sentence("แมว") + # output: 'แมวเวลานะนั้น' """ if start_seq is None: start_seq = random.choice(self.word) @@ -100,14 +109,13 @@ def _next_word( class Bigram: - def __init__(self, name: str = "tnc"): - """ - Text generator using Bigram + """ + Text generator using Bigram - :param str name: corpus name - * *tnc* - Thai National Corpus (default) - :rtype: None - """ + :param str name: corpus name + * *tnc* - Thai National Corpus (default) + """ + def __init__(self, name: str = "tnc"): if name == "tnc": self.uni = tnc_word_freqs_unigram() self.bi = tnc_word_freqs_bigram() @@ -147,6 +155,16 @@ def gen_sentence( :return: list words or str words :rtype: List[str], str + + :Example: + :: + + from pythainlp.generate import Bigram + + gen = Bigram() + + gen.gen_sentence("แมว") + # output: 'แมวไม่ได้รับเชื้อมัน' """ if start_seq is None: start_seq = random.choice(self.words) @@ -181,14 +199,13 @@ def gen_sentence( class Trigram: - def __init__(self, name: str = "tnc"): - """ - Text generator using Trigram + """ + Text generator using Trigram - :param str name: corpus name - * *tnc* - Thai National Corpus (default) - :rtype: None - """ + :param str name: corpus name + * *tnc* - Thai National Corpus (default) + """ + def __init__(self, name: str = "tnc"): if name == "tnc": self.uni = tnc_word_freqs_unigram() self.bi = tnc_word_freqs_bigram() @@ -232,6 +249,16 @@ def gen_sentence( :return: list words or str words :rtype: List[str], str + + :Example: + :: + + from pythainlp.generate import Trigram + + gen = Trigram() + + gen.gen_sentence() + # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ' """ if start_seq is None: start_seq = random.choice(self.bi_keys) diff --git a/pythainlp/generate/thai2fit.py 
b/pythainlp/generate/thai2fit.py index b07368360..f299c6648 100644 --- a/pythainlp/generate/thai2fit.py +++ b/pythainlp/generate/thai2fit.py @@ -100,6 +100,17 @@ def gen_sentence( :return: list words or str words :rtype: List[str], str + + :Example: + :: + + from pythainlp.generate.thai2fit import gen_sentence + + gen_sentence() + # output: 'แคทรียา อิงลิช (นักแสดง' + + gen_sentence("แมว") + # output: 'แมว คุณหลวง ' """ if start_seq is None: start_seq = random.choice(list(thwiki_itos))