From b4cb35c9e068e7224ddc16bfa286af0672adebea Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 20:09:17 +0700
Subject: [PATCH 01/26] Add pythainlp.generator

---
 docs/api/corpus.rst             |  10 ++
 docs/api/generator.rst          |  16 +++
 pythainlp/corpus/oscar.py       |  53 ++++++++++
 pythainlp/corpus/tnc.py         |  54 +++++++++-
 pythainlp/corpus/ttc.py         |  20 +++-
 pythainlp/generator/__init__.py |  12 +++
 pythainlp/generator/core.py     | 171 ++++++++++++++++++++++++++++++++
 pythainlp/generator/thai2fit.py |  65 ++++++++++++
 setup.py                        |   2 +
 tests/test_corpus.py            |   9 ++
 tests/test_generator.py         |  23 +++++
 11 files changed, 433 insertions(+), 2 deletions(-)
 create mode 100644 docs/api/generator.rst
 create mode 100644 pythainlp/corpus/oscar.py
 create mode 100644 pythainlp/generator/__init__.py
 create mode 100644 pythainlp/generator/core.py
 create mode 100644 pythainlp/generator/thai2fit.py
 create mode 100644 tests/test_generator.py
diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst
index dbf0c95fa..eb8d1bf16 100644
--- a/docs/api/corpus.rst
+++ b/docs/api/corpus.rst
@@ -35,11 +35,21 @@ TNC
 ---
 
 .. autofunction:: pythainlp.corpus.tnc.word_freqs
+.. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs
+.. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs
+.. autofunction:: pythainlp.corpus.tnc.tigram_word_freqs
 
 TTC
 ---
 
 .. autofunction:: pythainlp.corpus.ttc.word_freqs
+.. autofunction:: pythainlp.corpus.ttc.unigram_word_freqs
+
+OSCAR
+-----
+
+.. autofunction:: pythainlp.corpus.oscar.word_freqs
+.. autofunction:: pythainlp.corpus.oscar.unigram_word_freqs
 
 Util
 ----
diff --git a/docs/api/generator.rst b/docs/api/generator.rst
new file mode 100644
index 000000000..cd8252579
--- /dev/null
+++ b/docs/api/generator.rst
@@ -0,0 +1,16 @@
+.. currentmodule:: pythainlp.generator
+
+pythainlp.generator
+===================
+The :class:`pythainlp.generator` is a Thai text generator for PyThaiNLP.
+
+Modules
+-------
+
+.. autoclass:: Unigram
+    :members:
+.. autoclass:: Bigram
+    :members:
+.. autoclass:: Tigram
+    :members:
+.. autofunction:: pythainlp.generator.thai2fit.gen_sentence
\ No newline at end of file
diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py
new file mode 100644
index 000000000..e23dbb609
--- /dev/null
+++ b/pythainlp/corpus/oscar.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+"""
+Thai unigram word frequencies from the OSCAR corpus (tokenized with ICU)
+
+Credit: Korakot Chaovavanich
+https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/
+"""
+
+__all__ = [
+    "word_freqs",
+    "unigram_word_freqs"
+]
+
+from collections import defaultdict
+from typing import List, Tuple
+
+from pythainlp.corpus import get_corpus_path
+
+_FILENAME = "oscar_icu"
+
+
+def word_freqs() -> List[Tuple[str, int]]:
+    """
+    Get word frequencies from the OSCAR corpus (tokenized with ICU)
+    """
+    word_freqs = []
+    _path = get_corpus_path(_FILENAME)
+    with open(_path,"r",encoding="utf-8") as f:
+        for line in f.readlines():
+            word_freq = line.strip().split(",")
+            if len(word_freq) >= 2:
+                word_freqs.append((word_freq[0], int(word_freq[1])))
+
+    return word_freqs
+
+
+def unigram_word_freqs() -> defaultdict:
+    """
+    Get unigram word frequencies from the OSCAR corpus (tokenized with ICU)
+    """
+    _path = get_corpus_path(_FILENAME)
+    _word_freqs = defaultdict(int)
+    with open(_path, "r", encoding="utf-8-sig") as fh:
+        _data = [i for i in fh.readlines()]
+        del _data[0]
+        for i in _data:
+            _temp = i.strip().split(",")
+            if _temp[0]!=" " and '"' not in _temp[0]:
+                _word_freqs[_temp[0]] = int(_temp[-1])
+            elif _temp[0]==" ":
+                _word_freqs["<s/>"] = int(_temp[-1])
+
+    return _word_freqs
diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
index db836ea17..9345cea93 100644
--- a/pythainlp/corpus/tnc.py
+++ b/pythainlp/corpus/tnc.py
@@ -6,13 +6,23 @@
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
 
-__all__ = ["word_freqs"]
+__all__ = [
+    "word_freqs",
+    "unigram_word_freqs",
+    "bigram_word_freqs",
+    "tigram_word_freqs"
+]
 
+from collections import defaultdict
 from typing import List, Tuple
 
 from pythainlp.corpus import get_corpus
+from pythainlp.corpus import get_corpus_path
+
 
 _FILENAME = "tnc_freq.txt"
+_BIGRAM = "tnc_bigram_word_freqs"
+_TIGRAM = "tnc_tigram_word_freqs"
 
 
 def word_freqs() -> List[Tuple[str, int]]:
@@ -29,3 +39,45 @@ def word_freqs() -> List[Tuple[str, int]]:
             word_freqs.append((word_freq[0], int(word_freq[1])))
 
     return word_freqs
+
+
+def unigram_word_freqs() -> defaultdict:
+    """
+    Get unigram word frequency from Thai National Corpus (TNC)
+    """
+    lines = list(get_corpus(_FILENAME))
+    _word_freqs = defaultdict(int)
+    for i in lines:
+        _temp = i.strip().split("	")
+        if len(_temp) >= 2:
+            _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1])
+
+    return _word_freqs
+
+
+def bigram_word_freqs() -> defaultdict:
+    """
+    Get bigram word frequency from Thai National Corpus (TNC)
+    """
+    _path = get_corpus_path(_BIGRAM)
+    _word_freqs = defaultdict(int)
+    with open(_path, "r", encoding="utf-8-sig") as fh:
+        for i in fh.readlines():
+            _temp = i.strip().split("	")
+            _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1])
+
+    return _word_freqs
+
+
+def tigram_word_freqs() -> defaultdict:
+    """
+    Get trigram word frequency from Thai National Corpus (TNC)
+    """
+    _path = get_corpus_path(_TIGRAM)
+    _word_freqs = defaultdict(int)
+    with open(_path, "r", encoding="utf-8-sig") as fh:
+        for i in fh.readlines():
+            _temp = i.strip().split("	")
+            _word_freqs[(_temp[0],_temp[1],_temp[2])] = int(_temp[-1])
+
+    return _word_freqs
\ No newline at end of file
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
index 0de0069c7..a42fa4c05 100644
--- a/pythainlp/corpus/ttc.py
+++ b/pythainlp/corpus/ttc.py
@@ -6,8 +6,12 @@
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
 
-__all__ = ["word_freqs"]
+__all__ = [
+    "word_freqs",
+    "unigram_word_freqs"
+]
 
+from collections import defaultdict
 from typing import List, Tuple
 
 from pythainlp.corpus import get_corpus
@@ -29,3 +33,17 @@ def word_freqs() -> List[Tuple[str, int]]:
             word_freqs.append((word_freq[0], int(word_freq[1])))
 
     return word_freqs
+
+
+def unigram_word_freqs() -> defaultdict:
+    """
+    Get unigram word frequency from Thai Textbook Corpus (TTC)
+    """
+    lines = list(get_corpus(_FILENAME))
+    _word_freqs = defaultdict(int)
+    for i in lines:
+        _temp = i.strip().split("	")
+        if len(_temp) >= 2:
+            _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1])
+
+    return _word_freqs
\ No newline at end of file
diff --git a/pythainlp/generator/__init__.py b/pythainlp/generator/__init__.py
new file mode 100644
index 000000000..637497b45
--- /dev/null
+++ b/pythainlp/generator/__init__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+Thai Text generator
+"""
+
+__all__ = [
+    "Unigram",
+    "Bigram",
+    "Tigram"
+]
+
+from pythainlp.generator.core import Unigram, Bigram, Tigram
\ No newline at end of file
diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
new file mode 100644
index 000000000..9ca7a1395
--- /dev/null
+++ b/pythainlp/generator/core.py
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+import random
+from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
+from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
+from pythainlp.corpus.tnc import tigram_word_freqs as tnc_word_freqs_tigram
+from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram
+from pythainlp.corpus.oscar import unigram_word_freqs as oscar_word_freqs_unigram
+
+
+class Unigram:
+    def __init__(self, name:str="tnc"):
+        """
+        :param str name: corpus name
+        :rtype: None
+        """
+        if name == "tnc":
+            self.counts = tnc_word_freqs_unigram()
+        elif name == "ttc":
+            self.counts = ttc_word_freqs_unigram()
+        elif name == "oscar":
+            self.counts = oscar_word_freqs_unigram()
+        self.word = list(self.counts.keys())
+        self.n = 0
+        for i in self.word:
+            self.n += self.counts[i]
+        self.prob = {i:self.counts[i]/self.n for i in self.word}
+        self._word_prob = {}
+
+    def gen_sentence(self,N:int=3,prob:float=0.001, start_seq:str=None, output_str:bool = True, duplicate:bool=False):
+        """
+        :param int N: number of word.
+        :param str start_seq: word for begin word.
+        :param bool output_str: output is str
+        :param bool duplicate: duplicate word in sent
+
+        :return: list words or str words
+        :rtype: str,list
+        """
+        if start_seq is None: start_seq = random.choice(self.word)
+        rand_text = start_seq.lower()
+        self._word_prob = {i:self.counts[i]/self.n for i in self.word if self.counts[i]/self.n>=prob}
+        return self.next_word(rand_text, N, output_str,prob=prob, duplicate=duplicate)
+
+    def next_word(self,text:str, N:int, output_str:str,prob, duplicate:bool=False):
+        self.l = []
+        self.l.append(text)
+        self._word_list = list(self._word_prob.keys())
+        if N>len(self._word_list):
+            N=len(self._word_list)
+        for i in range(N):
+            self._word = random.choice(self._word_list)
+            if duplicate == False:
+                while self._word in self.l:
+                    self._word = random.choice(self._word_list)
+            self.l.append(self._word)
+            
+        if output_str:
+            return "".join(self.l)
+        return self.l
+
+
+class Bigram:
+    def __init__(self,name:str="tnc"):
+        """
+        :param str name: corpus name
+        :rtype: None
+        """
+        if name == "tnc":
+            self.uni = tnc_word_freqs_unigram()
+            self.bi = tnc_word_freqs_bigram()
+        self.uni_keys = list(self.uni.keys())
+        self.bi_keys = list(self.bi.keys())
+        self.words = [i[-1]  for i in self.bi_keys]
+
+    def prob(self, t1:str, t2:str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+        """
+        probability word
+        
+        :param int t1: text 1
+        :param int t2: text 2
+
+        :return: probability value
+        :rtype: float
+        """
+        try:
+            v=self.bi[(t1,t2)]/self.uni[t1]
+        except:
+            v=0.0
+        return v
+
+    def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True, duplicate:bool=False):
+        if start_seq is None: start_seq = random.choice(self.words)
+        self.late_word = start_seq
+        self.list_word = []
+        self.list_word.append(start_seq)
+
+        for i in range(N):
+            if duplicate:
+                self._temp = [j for j in self.bi_keys if j[0]==self.late_word]
+            else:
+                self._temp = [j for j in self.bi_keys if j[0]==self.late_word and j[1] not in self.list_word]
+            self._probs = [self.prob(self.late_word,l[-1]) for l in self._temp]
+            self._p2 = [j for j in self._probs if j>=prob]
+            if len(self._p2)==0:
+                break
+            self.items = self._temp[self._probs.index(random.choice(self._p2))]
+            self.late_word = self.items[-1]
+            self.list_word.append(self.late_word)
+        if output_str:
+            return ''.join(self.list_word)
+        return self.list_word
+
+
+class Tigram:
+    def __init__(self,name:str="tnc"):
+        """
+        :param str name: corpus name
+        :rtype: None
+        """
+        if name == "tnc":
+            self.uni = tnc_word_freqs_unigram()
+            self.bi = tnc_word_freqs_bigram()
+            self.ti = tnc_word_freqs_tigram()
+        self.uni_keys = list(self.uni.keys())
+        self.bi_keys = list(self.bi.keys())
+        self.ti_keys = list(self.ti.keys())
+        self.words = [i[-1]  for i in self.bi_keys]
+
+    def prob(self, t1:str, t2:str, t3:str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+        """
+        probability word
+        
+        :param int t1: text 1
+        :param int t2: text 2
+        :param int t3: text 3
+
+        :return: probability value
+        :rtype: float
+        """
+        try:
+            v=self.ti[(t1, t2, t3)]/self.bi[(t1, t2)]
+        except:
+            v=0.0
+        return v
+
+    def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:tuple=None, output_str:bool = True, duplicate:bool=False):
+        if start_seq is None: start_seq = random.choice(self.bi_keys)
+        self.late_word = start_seq
+        self.list_word = []
+        self.list_word.append(start_seq)
+
+        for i in range(N):
+            if duplicate:
+                self._temp = [j for j in self.ti_keys if j[:2]==self.late_word]
+            else:
+                self._temp = [j for j in self.ti_keys if j[:2]==self.late_word and j[1:] not in self.list_word]
+            self._probs = [self.prob(l[0],l[1],l[2]) for l in self._temp]
+            self._p2 = [j for j in self._probs if j>=prob]
+            if len(self._p2)==0:
+                break
+            self.items = self._temp[self._probs.index(random.choice(self._p2))]
+            self.late_word = self.items[1:]
+            self.list_word.append(self.late_word)
+        self.listdata = []
+        for i in self.list_word:
+            for j in i:
+                if j not in self.listdata:
+                    self.listdata.append(j)
+        if output_str:
+            return ''.join(self.listdata)
+        return self.listdata
\ No newline at end of file
diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
new file mode 100644
index 000000000..d19d0f648
--- /dev/null
+++ b/pythainlp/generator/thai2fit.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+Thai2fit : Thai Wiki Language Model for Text Generation
+
+Code from https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb
+"""
+__all__ = [
+    "gen_sentence"
+]
+
+import pandas as pd
+import random
+from ast import literal_eval
+from collections import Counter
+import re
+
+#fastai
+import fastai
+from fastai.text import *
+from fastai.callbacks import CSVLogger
+
+#pythainlp
+from pythainlp.ulmfit import *
+
+#get dummy data
+imdb = untar_data(URLs.IMDB_SAMPLE)
+dummy_df = pd.read_csv(imdb/'texts.csv')
+
+#get vocab
+thwiki = ""
+try:
+  thwiki =_THWIKI_LSTM
+except:
+  thwiki = THWIKI_LSTM
+
+thwiki_itos = pickle.load(open(thwiki['itos_fname'],'rb'))
+thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
+
+#dummy databunch
+tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)
+processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
+            NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)]
+data_lm = (TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)
+    .split_by_rand_pct(0.2)
+    .label_for_lm()
+    .databunch(bs=64))
+
+
+data_lm.sanity_check()
+
+config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,
+             output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)
+trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
+
+learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)
+
+#load pretrained models
+learn.load_pretrained(**thwiki)
+
+def gen_sentence(N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True):
+  if start_seq is None: start_seq = random.choice(list(thwiki_itos))
+  list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-')
+  if output_str:
+    return ''.join(list_word)
+  return list_word
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1dabf3bcb..9b1d12379 100644
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@
     "wangchanberta": ["transformers", "sentencepiece"],
     "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"],
     "wordnet": ["nltk>=3.3.*"],
+    "text_generator": ["fastai"],
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -74,6 +75,7 @@
         "ssg>=0.0.6",
         "torch>=1.0.0",
         "transformers>=4.1.1",
+        "fastai"
     ],
 }
 
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index 69fa22dc0..acc5812af 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -10,6 +10,7 @@
     get_corpus_db,
     get_corpus_db_detail,
     get_corpus_path,
+    oscar,
     provinces,
     remove,
     thai_family_names,
@@ -98,12 +99,20 @@ def test_corpus(self):
         )
         self.assertIsNotNone(download(name="test", version="0.1"))
         self.assertIsNotNone(remove("test"))
+    
+    def test_oscar(self):
+        self.assertIsNotNone(oscar.word_freqs())
+        self.assertIsNotNone(oscar.unigram_word_freqs())
 
     def test_tnc(self):
         self.assertIsNotNone(tnc.word_freqs())
+        self.assertIsNotNone(tnc.unigram_word_freqs())
+        self.assertIsNotNone(tnc.bigram_word_freqs())
+        self.assertIsNotNone(tnc.tigram_word_freqs())
 
     def test_ttc(self):
         self.assertIsNotNone(ttc.word_freqs())
+        self.assertIsNotNone(ttc.unigram_word_freqs())
 
     def test_wordnet(self):
         self.assertIsInstance(wordnet.langs(), list)
diff --git a/tests/test_generator.py b/tests/test_generator.py
new file mode 100644
index 000000000..4e03cabab
--- /dev/null
+++ b/tests/test_generator.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+
+from pythainlp.generator import Unigram, Bigram, Tigram
+from pythainlp.generator.thai2fit import gen_sentence
+
+class TestGeneratorPackage(unittest.TestCase):
+    def test_unigram(self):
+        _tnc_unigram = Unigram("tnc")
+        self.assertIsNotNone(_tnc_unigram.gen_sentence("ผมชอบไปโรงเรียน"))
+        _ttc_unigram = Unigram("ttc")
+        self.assertIsNotNone(_ttc_unigram.gen_sentence("ผมชอบไปโรงเรียน"))
+        _oscar_unigram = Unigram("oscar")
+        self.assertIsNotNone(_oscar_unigram.gen_sentence("ผมชอบไปโรงเรียน"))
+    def test_bigram(self):
+        _bigram = Bigram()
+        self.assertIsNotNone(_bigram.gen_sentence("ผมชอบไปโรงเรียน"))
+    def test_tigram(self):
+        _tigram = Tigram()
+        self.assertIsNotNone(_tigram.gen_sentence("ผมชอบไปโรงเรียน"))
+    def test_thai2fit(self):
+        self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน"))
\ No newline at end of file

From 5375ee051b29f9e62c110a34704b3367fb8dbf85 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 20:17:13 +0700
Subject: [PATCH 02/26] fixed pep8

---
 pythainlp/corpus/oscar.py       |  6 +++---
 pythainlp/corpus/tnc.py         |  8 ++++----
 pythainlp/corpus/ttc.py         |  4 ++--
 pythainlp/generator/__init__.py |  2 +-
 pythainlp/generator/thai2fit.py | 13 +++++++------
 tests/test_corpus.py            |  2 +-
 tests/test_generator.py         |  6 +++++-
 7 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py
index e23dbb609..7e19cdf99 100644
--- a/pythainlp/corpus/oscar.py
+++ b/pythainlp/corpus/oscar.py
@@ -25,7 +25,7 @@ def word_freqs() -> List[Tuple[str, int]]:
     """
     word_freqs = []
     _path = get_corpus_path(_FILENAME)
-    with open(_path,"r",encoding="utf-8") as f:
+    with open(_path, "r", encoding="utf-8") as f:
         for line in f.readlines():
             word_freq = line.strip().split(",")
             if len(word_freq) >= 2:
@@ -45,9 +45,9 @@ def unigram_word_freqs() -> defaultdict:
         del _data[0]
         for i in _data:
             _temp = i.strip().split(",")
-            if _temp[0]!=" " and '"' not in _temp[0]:
+            if _temp[0] != " " and '"' not in _temp[0]:
                 _word_freqs[_temp[0]] = int(_temp[-1])
-            elif _temp[0]==" ":
+            elif _temp[0] == " ":
                 _word_freqs["<s/>"] = int(_temp[-1])
 
     return _word_freqs
diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
index 9345cea93..0a147d70a 100644
--- a/pythainlp/corpus/tnc.py
+++ b/pythainlp/corpus/tnc.py
@@ -50,7 +50,7 @@ def unigram_word_freqs() -> defaultdict:
     for i in lines:
         _temp = i.strip().split("	")
         if len(_temp) >= 2:
-            _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1])
+            _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])
 
     return _word_freqs
 
@@ -64,7 +64,7 @@ def bigram_word_freqs() -> defaultdict:
     with open(_path, "r", encoding="utf-8-sig") as fh:
         for i in fh.readlines():
             _temp = i.strip().split("	")
-            _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1])
+            _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])
 
     return _word_freqs
 
@@ -78,6 +78,6 @@ def tigram_word_freqs() -> defaultdict:
     with open(_path, "r", encoding="utf-8-sig") as fh:
         for i in fh.readlines():
             _temp = i.strip().split("	")
-            _word_freqs[(_temp[0],_temp[1],_temp[2])] = int(_temp[-1])
+            _word_freqs[(_temp[0], _temp[1], _temp[2])] = int(_temp[-1])
 
-    return _word_freqs
\ No newline at end of file
+    return _word_freqs
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
index a42fa4c05..000a7f484 100644
--- a/pythainlp/corpus/ttc.py
+++ b/pythainlp/corpus/ttc.py
@@ -44,6 +44,6 @@ def unigram_word_freqs() -> defaultdict:
     for i in lines:
         _temp = i.strip().split("	")
         if len(_temp) >= 2:
-            _word_freqs[(_temp[0],_temp[1])] = int(_temp[-1])
+            _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])
 
-    return _word_freqs
\ No newline at end of file
+    return _word_freqs
diff --git a/pythainlp/generator/__init__.py b/pythainlp/generator/__init__.py
index 637497b45..cb18dd716 100644
--- a/pythainlp/generator/__init__.py
+++ b/pythainlp/generator/__init__.py
@@ -9,4 +9,4 @@
     "Tigram"
 ]
 
-from pythainlp.generator.core import Unigram, Bigram, Tigram
\ No newline at end of file
+from pythainlp.generator.core import Unigram, Bigram, Tigram
diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index d19d0f648..d71eff595 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -14,19 +14,19 @@
 from collections import Counter
 import re
 
-#fastai
+# fastai
 import fastai
 from fastai.text import *
 from fastai.callbacks import CSVLogger
 
-#pythainlp
+# pythainlp
 from pythainlp.ulmfit import *
 
-#get dummy data
+# get dummy data
 imdb = untar_data(URLs.IMDB_SAMPLE)
 dummy_df = pd.read_csv(imdb/'texts.csv')
 
-#get vocab
+# get vocab
 thwiki = ""
 try:
   thwiki =_THWIKI_LSTM
@@ -36,7 +36,7 @@
 thwiki_itos = pickle.load(open(thwiki['itos_fname'],'rb'))
 thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
 
-#dummy databunch
+# dummy databunch
 tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)
 processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
             NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)]
@@ -57,9 +57,10 @@
 #load pretrained models
 learn.load_pretrained(**thwiki)
 
+
 def gen_sentence(N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True):
   if start_seq is None: start_seq = random.choice(list(thwiki_itos))
   list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-')
   if output_str:
     return ''.join(list_word)
-  return list_word
\ No newline at end of file
+  return list_word
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index acc5812af..26196129e 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -99,7 +99,7 @@ def test_corpus(self):
         )
         self.assertIsNotNone(download(name="test", version="0.1"))
         self.assertIsNotNone(remove("test"))
-    
+
     def test_oscar(self):
         self.assertIsNotNone(oscar.word_freqs())
         self.assertIsNotNone(oscar.unigram_word_freqs())
diff --git a/tests/test_generator.py b/tests/test_generator.py
index 4e03cabab..eb611ae38 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -5,6 +5,7 @@
 from pythainlp.generator import Unigram, Bigram, Tigram
 from pythainlp.generator.thai2fit import gen_sentence
 
+
 class TestGeneratorPackage(unittest.TestCase):
     def test_unigram(self):
         _tnc_unigram = Unigram("tnc")
@@ -13,11 +14,14 @@ def test_unigram(self):
         self.assertIsNotNone(_ttc_unigram.gen_sentence("ผมชอบไปโรงเรียน"))
         _oscar_unigram = Unigram("oscar")
         self.assertIsNotNone(_oscar_unigram.gen_sentence("ผมชอบไปโรงเรียน"))
+
     def test_bigram(self):
         _bigram = Bigram()
         self.assertIsNotNone(_bigram.gen_sentence("ผมชอบไปโรงเรียน"))
+
     def test_tigram(self):
         _tigram = Tigram()
         self.assertIsNotNone(_tigram.gen_sentence("ผมชอบไปโรงเรียน"))
+
     def test_thai2fit(self):
-        self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน"))
\ No newline at end of file
+        self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน"))

From 652f67ee9eaf5389574634defa7a29c5911de7ca Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 20:26:07 +0700
Subject: [PATCH 03/26] fixed oscar bug

---
 pythainlp/corpus/oscar.py       |  9 ++++---
 pythainlp/generator/core.py     |  6 +++--
 pythainlp/generator/thai2fit.py | 43 ++++++++++++++++++++++++---------
 3 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py
index 7e19cdf99..187a8c3dc 100644
--- a/pythainlp/corpus/oscar.py
+++ b/pythainlp/corpus/oscar.py
@@ -27,9 +27,12 @@ def word_freqs() -> List[Tuple[str, int]]:
     _path = get_corpus_path(_FILENAME)
     with open(_path, "r", encoding="utf-8") as f:
         for line in f.readlines():
-            word_freq = line.strip().split(",")
-            if len(word_freq) >= 2:
-                word_freqs.append((word_freq[0], int(word_freq[1])))
+            _temp = line.strip().split(",")
+            if len(_temp) >= 2:
+                if _temp[0] != " " and '"' not in _temp[0]:
+                    word_freqs.append((_temp[0], int(_temp[1])))
+                elif _temp[0] == " ":
+                    word_freqs.append(("<s/>", int(_temp[1])))
 
     return word_freqs
 
diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index 9ca7a1395..349e18171 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -4,11 +4,13 @@
 from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
 from pythainlp.corpus.tnc import tigram_word_freqs as tnc_word_freqs_tigram
 from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram
-from pythainlp.corpus.oscar import unigram_word_freqs as oscar_word_freqs_unigram
+from pythainlp.corpus.oscar import (
+    unigram_word_freqs as oscar_word_freqs_unigram
+)
 
 
 class Unigram:
-    def __init__(self, name:str="tnc"):
+    def __init__(self, name: str = "tnc"):
         """
         :param str name: corpus name
         :rtype: None
diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index d71eff595..2fa78a25b 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -33,23 +33,39 @@
 except:
   thwiki = THWIKI_LSTM
 
-thwiki_itos = pickle.load(open(thwiki['itos_fname'],'rb'))
+thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb'))
 thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
 
 # dummy databunch
-tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)
-processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
-            NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)]
-data_lm = (TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)
-    .split_by_rand_pct(0.2)
-    .label_for_lm()
-    .databunch(bs=64))
+tt = Tokenizer(tok_func=ThaiTokenizer, lang='th', pre_rules=pre_rules_th, post_rules=post_rules_th)
+processor = [
+  TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
+  NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)
+]
+data_lm = (
+  TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)
+  .split_by_rand_pct(0.2)
+  .label_for_lm()
+  .databunch(bs=64)
+)
 
 
 data_lm.sanity_check()
 
-config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,
-             output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)
+config = dict(
+  emb_sz=400,
+  n_hid=1550,
+  n_layers=4,
+  pad_token=1,
+  qrnn=False,
+  tie_weights=True,
+  out_bias=True,
+  output_p=0.25,
+  hidden_p=0.1,
+  input_p=0.2,
+  embed_p=0.02,
+  weight_p=0.15
+)
 trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
 
 learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)
@@ -58,7 +74,12 @@
 learn.load_pretrained(**thwiki)
 
 
-def gen_sentence(N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True):
+def gen_sentence(
+  N:int=4,
+  prob:float=0.001,
+  start_seq:str=None,
+  output_str:bool = True
+):
   if start_seq is None: start_seq = random.choice(list(thwiki_itos))
   list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-')
   if output_str:

From 22f0e1e6cd85224add966694ac1ad1ce248c4ab8 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 20:39:37 +0700
Subject: [PATCH 04/26] Update thai2fit.py

---
 pythainlp/generator/thai2fit.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index 2fa78a25b..66ca2dffb 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -17,7 +17,6 @@
 # fastai
 import fastai
 from fastai.text import *
-from fastai.callbacks import CSVLogger
 
 # pythainlp
 from pythainlp.ulmfit import *

From 772395a30297a1143a0e7926e262b33d4bfeae7c Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 20:51:21 +0700
Subject: [PATCH 05/26] Update thai2fit.py

---
 pythainlp/generator/thai2fit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index 66ca2dffb..d00a9c23e 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -12,11 +12,12 @@
 import random
 from ast import literal_eval
 from collections import Counter
-import re
+import pickle
 
 # fastai
 import fastai
 from fastai.text import *
+from fastai.data.external import *
 
 # pythainlp
 from pythainlp.ulmfit import *

From 69c4dbff104e9466d8bbe1764d66ffe0b2e45d6e Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 21:38:51 +0700
Subject: [PATCH 06/26] Update fastai

---
 pythainlp/generator/thai2fit.py | 1 -
 setup.py                        | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index d00a9c23e..be2cafe62 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -17,7 +17,6 @@
 # fastai
 import fastai
 from fastai.text import *
-from fastai.data.external import *
 
 # pythainlp
 from pythainlp.ulmfit import *
diff --git a/setup.py b/setup.py
index 9b1d12379..6421b5638 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@
     "wangchanberta": ["transformers", "sentencepiece"],
     "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"],
     "wordnet": ["nltk>=3.3.*"],
-    "text_generator": ["fastai"],
+    "text_generator": ["fastai<2.0"],
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -75,7 +75,7 @@
         "ssg>=0.0.6",
         "torch>=1.0.0",
         "transformers>=4.1.1",
-        "fastai"
+        "fastai<2.0"
     ],
 }
 

From ae2eb795844c71e2252903f4ca803d0db16e98cd Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 21:54:21 +0700
Subject: [PATCH 07/26] Update core.py

---
 pythainlp/generator/core.py | 73 ++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 33 deletions(-)

diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index 349e18171..49f451d30 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -25,10 +25,10 @@ def __init__(self, name: str = "tnc"):
         self.n = 0
         for i in self.word:
             self.n += self.counts[i]
-        self.prob = {i:self.counts[i]/self.n for i in self.word}
+        self.prob = {i:self.counts[i] / self.n for i in self.word}
         self._word_prob = {}
 
-    def gen_sentence(self,N:int=3,prob:float=0.001, start_seq:str=None, output_str:bool = True, duplicate:bool=False):
+    def gen_sentence(self, N: int = 3,prob: float = 0.001, start_seq: str = None, output_str: bool = True, duplicate: bool = False):
         """
         :param int N: number of word.
         :param str start_seq: word for begin word.
@@ -38,20 +38,21 @@ def gen_sentence(self,N:int=3,prob:float=0.001, start_seq:str=None, output_str:b
         :return: list words or str words
         :rtype: str,list
         """
-        if start_seq is None: start_seq = random.choice(self.word)
+        if start_seq is None:
+            start_seq = random.choice(self.word)
         rand_text = start_seq.lower()
-        self._word_prob = {i:self.counts[i]/self.n for i in self.word if self.counts[i]/self.n>=prob}
-        return self.next_word(rand_text, N, output_str,prob=prob, duplicate=duplicate)
+        self._word_prob = {i:self.counts[i] / self.n for i in self.word if self.counts[i] / self.n >= prob}
+        return self.next_word(rand_text, N, output_str, prob = prob, duplicate = duplicate)
 
-    def next_word(self,text:str, N:int, output_str:str,prob, duplicate:bool=False):
+    def next_word(self, text: str, N: int, output_str: str, prob: float, duplicate: bool = False):
         self.l = []
         self.l.append(text)
         self._word_list = list(self._word_prob.keys())
-        if N>len(self._word_list):
-            N=len(self._word_list)
+        if N > len(self._word_list):
+            N  =len(self._word_list)
         for i in range(N):
             self._word = random.choice(self._word_list)
-            if duplicate == False:
+            if duplicate is False:
                 while self._word in self.l:
                     self._word = random.choice(self._word_list)
             self.l.append(self._word)
@@ -62,7 +63,7 @@ def next_word(self,text:str, N:int, output_str:str,prob, duplicate:bool=False):
 
 
 class Bigram:
-    def __init__(self,name:str="tnc"):
+    def __init__(self, name: str = "tnc"):
         """
         :param str name: corpus name
         :rtype: None
@@ -74,10 +75,10 @@ def __init__(self,name:str="tnc"):
         self.bi_keys = list(self.bi.keys())
         self.words = [i[-1]  for i in self.bi_keys]
 
-    def prob(self, t1:str, t2:str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+    def prob(self, t1: str, t2: str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
         """
         probability word
-        
+
         :param int t1: text 1
         :param int t2: text 2
 
@@ -85,12 +86,12 @@ def prob(self, t1:str, t2:str): # from https://towardsdatascience.com/understand
         :rtype: float
         """
         try:
-            v=self.bi[(t1,t2)]/self.uni[t1]
+            v = self.bi[(t1, t2)] / self.uni[t1]
         except:
-            v=0.0
+            v = 0.0
         return v
 
-    def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:str=None, output_str:bool = True, duplicate:bool=False):
+    def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: str = None, output_str: bool = True, duplicate: bool = False):
         if start_seq is None: start_seq = random.choice(self.words)
         self.late_word = start_seq
         self.list_word = []
@@ -98,12 +99,17 @@ def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:str=None, output_str:b
 
         for i in range(N):
             if duplicate:
-                self._temp = [j for j in self.bi_keys if j[0]==self.late_word]
+                self._temp = [
+                    j for j in self.bi_keys if j[0] == self.late_word
+                ]
             else:
-                self._temp = [j for j in self.bi_keys if j[0]==self.late_word and j[1] not in self.list_word]
-            self._probs = [self.prob(self.late_word,l[-1]) for l in self._temp]
-            self._p2 = [j for j in self._probs if j>=prob]
-            if len(self._p2)==0:
+                self._temp = [
+                    j for j in self.bi_keys
+                    if j[0]==self.late_word and j[1] not in self.list_word
+                ]
+            self._probs = [self.prob(self.late_word, l[-1]) for l in self._temp]
+            self._p2 = [j for j in self._probs if j >= prob]
+            if len(self._p2) == 0:
                 break
             self.items = self._temp[self._probs.index(random.choice(self._p2))]
             self.late_word = self.items[-1]
@@ -114,7 +120,7 @@ def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:str=None, output_str:b
 
 
 class Tigram:
-    def __init__(self,name:str="tnc"):
+    def __init__(self, name: str = "tnc"):
         """
         :param str name: corpus name
         :rtype: None
@@ -126,9 +132,9 @@ def __init__(self,name:str="tnc"):
         self.uni_keys = list(self.uni.keys())
         self.bi_keys = list(self.bi.keys())
         self.ti_keys = list(self.ti.keys())
-        self.words = [i[-1]  for i in self.bi_keys]
+        self.words = [i[-1] for i in self.bi_keys]
 
-    def prob(self, t1:str, t2:str, t3:str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+    def prob(self, t1: str, t2: str, t3: str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
         """
         probability word
         
@@ -140,25 +146,26 @@ def prob(self, t1:str, t2:str, t3:str): # from https://towardsdatascience.com/un
         :rtype: float
         """
         try:
-            v=self.ti[(t1, t2, t3)]/self.bi[(t1, t2)]
+            v = self.ti[(t1, t2, t3)] / self.bi[(t1, t2)]
         except:
-            v=0.0
+            v = 0.0
         return v
 
-    def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:tuple=None, output_str:bool = True, duplicate:bool=False):
-        if start_seq is None: start_seq = random.choice(self.bi_keys)
+    def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: tuple = None, output_str: bool = True, duplicate: bool = กFalse):
+        if start_seq is None:
+            start_seq = random.choice(self.bi_keys)
         self.late_word = start_seq
         self.list_word = []
         self.list_word.append(start_seq)
 
         for i in range(N):
             if duplicate:
-                self._temp = [j for j in self.ti_keys if j[:2]==self.late_word]
+                self._temp = [j for j in self.ti_keys if j[:2] == self.late_word]
             else:
-                self._temp = [j for j in self.ti_keys if j[:2]==self.late_word and j[1:] not in self.list_word]
-            self._probs = [self.prob(l[0],l[1],l[2]) for l in self._temp]
-            self._p2 = [j for j in self._probs if j>=prob]
-            if len(self._p2)==0:
+                self._temp = [j for j in self.ti_keys if j[:2] == self.late_word and j[1:] not in self.list_word]
+            self._probs = [self.prob(l[0], l[1], l[2]) for l in self._temp]
+            self._p2 = [j for j in self._probs if j >= prob]
+            if len(self._p2) == 0:
                 break
             self.items = self._temp[self._probs.index(random.choice(self._p2))]
             self.late_word = self.items[1:]
@@ -170,4 +177,4 @@ def gen_sentence(self,N:int=4,prob:float=0.001, start_seq:tuple=None, output_str
                     self.listdata.append(j)
         if output_str:
             return ''.join(self.listdata)
-        return self.listdata
\ No newline at end of file
+        return self.listdata

From 7e88b39ea7e496a2c328c456d722b7c62e8644e6 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 21:56:00 +0700
Subject: [PATCH 08/26] Update core.py

---
 pythainlp/generator/core.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index 49f451d30..acd9006c0 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -1,4 +1,9 @@
 # -*- coding: utf-8 -*-
+"""
+Text generator using Unigram, Bigram and Tigram
+
+code from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+"""
 import random
 from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
 from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
@@ -75,7 +80,7 @@ def __init__(self, name: str = "tnc"):
         self.bi_keys = list(self.bi.keys())
         self.words = [i[-1]  for i in self.bi_keys]
 
-    def prob(self, t1: str, t2: str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+    def prob(self, t1: str, t2: str):
         """
         probability word
 
@@ -134,7 +139,7 @@ def __init__(self, name: str = "tnc"):
         self.ti_keys = list(self.ti.keys())
         self.words = [i[-1] for i in self.bi_keys]
 
-    def prob(self, t1: str, t2: str, t3: str): # from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+    def prob(self, t1: str, t2: str, t3: str):
         """
         probability word
         

From 57c7a0a72a3321865f7d392b60ab9c36ddc64ffd Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 22:13:27 +0700
Subject: [PATCH 09/26] Fix stray character in Tigram.gen_sentence default; skip first line of OSCAR word list

---
 pythainlp/corpus/oscar.py       | 4 +++-
 pythainlp/generator/core.py     | 2 +-
 pythainlp/generator/thai2fit.py | 5 ++---
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py
index 187a8c3dc..085f5bc41 100644
--- a/pythainlp/corpus/oscar.py
+++ b/pythainlp/corpus/oscar.py
@@ -26,7 +26,9 @@ def word_freqs() -> List[Tuple[str, int]]:
     word_freqs = []
     _path = get_corpus_path(_FILENAME)
     with open(_path, "r", encoding="utf-8") as f:
-        for line in f.readlines():
+        _data = [i for i in f.readlines()]
+        del _data[0]
+        for line in _data:
             _temp = line.strip().split(",")
             if len(_temp) >= 2:
                 if _temp[0] != " " and '"' not in _temp[0]:
diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index acd9006c0..27f77aa26 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -156,7 +156,7 @@ def prob(self, t1: str, t2: str, t3: str):
             v = 0.0
         return v
 
-    def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: tuple = None, output_str: bool = True, duplicate: bool = กFalse):
+    def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: tuple = None, output_str: bool = True, duplicate: bool = False):
         if start_seq is None:
             start_seq = random.choice(self.bi_keys)
         self.late_word = start_seq
diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index be2cafe62..1a6570bcb 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -10,8 +10,6 @@
 
 import pandas as pd
 import random
-from ast import literal_eval
-from collections import Counter
 import pickle
 
 # fastai
@@ -79,7 +77,8 @@ def gen_sentence(
   start_seq:str=None,
   output_str:bool = True
 ):
-  if start_seq is None: start_seq = random.choice(list(thwiki_itos))
+  if start_seq is None:
+    start_seq = random.choice(list(thwiki_itos))
   list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-')
   if output_str:
     return ''.join(list_word)

From ec3e66291c2301c83e99e8721438d7b8778d7199 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 11 Jun 2021 22:38:34 +0700
Subject: [PATCH 10/26] Update code

---
 pythainlp/generator/core.py |  6 +++---
 tests/test_generator.py     | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index 27f77aa26..3be47e7d3 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -33,7 +33,7 @@ def __init__(self, name: str = "tnc"):
         self.prob = {i:self.counts[i] / self.n for i in self.word}
         self._word_prob = {}
 
-    def gen_sentence(self, N: int = 3,prob: float = 0.001, start_seq: str = None, output_str: bool = True, duplicate: bool = False):
+    def gen_sentence(self, start_seq: str = None, N: int = 3,prob: float = 0.001, output_str: bool = True, duplicate: bool = False):
         """
         :param int N: number of word.
         :param str start_seq: word for begin word.
@@ -96,7 +96,7 @@ def prob(self, t1: str, t2: str):
             v = 0.0
         return v
 
-    def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: str = None, output_str: bool = True, duplicate: bool = False):
+    def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False):
         if start_seq is None: start_seq = random.choice(self.words)
         self.late_word = start_seq
         self.list_word = []
@@ -156,7 +156,7 @@ def prob(self, t1: str, t2: str, t3: str):
             v = 0.0
         return v
 
-    def gen_sentence(self, N: int = 4, prob: float = 0.001, start_seq: tuple = None, output_str: bool = True, duplicate: bool = False):
+    def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False):
         if start_seq is None:
             start_seq = random.choice(self.bi_keys)
         self.late_word = start_seq
diff --git a/tests/test_generator.py b/tests/test_generator.py
index eb611ae38..132b5f737 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -9,19 +9,19 @@
 class TestGeneratorPackage(unittest.TestCase):
     def test_unigram(self):
         _tnc_unigram = Unigram("tnc")
-        self.assertIsNotNone(_tnc_unigram.gen_sentence("ผมชอบไปโรงเรียน"))
+        self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม"))
         _ttc_unigram = Unigram("ttc")
-        self.assertIsNotNone(_ttc_unigram.gen_sentence("ผมชอบไปโรงเรียน"))
+        self.assertIsNotNone(_ttc_unigram.gen_sentence("ผม"))
         _oscar_unigram = Unigram("oscar")
-        self.assertIsNotNone(_oscar_unigram.gen_sentence("ผมชอบไปโรงเรียน"))
+        self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม"))
 
     def test_bigram(self):
         _bigram = Bigram()
-        self.assertIsNotNone(_bigram.gen_sentence("ผมชอบไปโรงเรียน"))
+        self.assertIsNotNone(_bigram.gen_sentence("ผม"))
 
     def test_tigram(self):
         _tigram = Tigram()
-        self.assertIsNotNone(_tigram.gen_sentence("ผมชอบไปโรงเรียน"))
+        self.assertIsNotNone(_tigram.gen_sentence("ผม"))
 
     def test_thai2fit(self):
         self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน"))

From 466f0537cbd14bc7f544cb67c6f791bd9d253aa5 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 12 Jun 2021 00:01:16 +0700
Subject: [PATCH 11/26] Update test_generator.py

---
 tests/test_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_generator.py b/tests/test_generator.py
index 132b5f737..06bfe4631 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -24,4 +24,4 @@ def test_tigram(self):
         self.assertIsNotNone(_tigram.gen_sentence("ผม"))
 
     def test_thai2fit(self):
-        self.assertIsNotNone(gen_sentence("ผมชอบไปโรงเรียน"))
+        self.assertIsNotNone(gen_sentence("กาลครั้งหนึ่งนานมาแล้ว"))

From fd3215e2eb5713268f703b150d8d1cc49e40d791 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 12 Jun 2021 12:27:35 +0700
Subject: [PATCH 12/26] Update thai2fit.py

---
 pythainlp/generator/thai2fit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index 1a6570bcb..f157af03b 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -72,9 +72,9 @@
 
 
 def gen_sentence(
+  start_seq:str=None,
   N:int=4,
   prob:float=0.001,
-  start_seq:str=None,
   output_str:bool = True
 ):
   if start_seq is None:

From 84aadbca65fbc8fc26c254185c334c2fbbf26114 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 12 Jun 2021 12:53:37 +0700
Subject: [PATCH 13/26] Fix unigram_word_freqs keys in tnc and ttc; reformat generator code

---
 pythainlp/corpus/tnc.py         |  2 +-
 pythainlp/corpus/ttc.py         |  2 +-
 pythainlp/generator/core.py     | 15 +++++++----
 pythainlp/generator/thai2fit.py | 47 ++++++++++++++++++++++-----------
 4 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
index 0a147d70a..2dfeb8690 100644
--- a/pythainlp/corpus/tnc.py
+++ b/pythainlp/corpus/tnc.py
@@ -50,7 +50,7 @@ def unigram_word_freqs() -> defaultdict:
     for i in lines:
         _temp = i.strip().split("	")
         if len(_temp) >= 2:
-            _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])
+            _word_freqs[_temp[0]] = int(_temp[-1])
 
     return _word_freqs
 
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
index 000a7f484..c3ffa0c0d 100644
--- a/pythainlp/corpus/ttc.py
+++ b/pythainlp/corpus/ttc.py
@@ -44,6 +44,6 @@ def unigram_word_freqs() -> defaultdict:
     for i in lines:
         _temp = i.strip().split("	")
         if len(_temp) >= 2:
-            _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])
+            _word_freqs[_temp[0]] = int(_temp[-1])
 
     return _word_freqs
diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index 3be47e7d3..acdd40cfe 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -54,14 +54,14 @@ def next_word(self, text: str, N: int, output_str: str, prob: float, duplicate:
         self.l.append(text)
         self._word_list = list(self._word_prob.keys())
         if N > len(self._word_list):
-            N  =len(self._word_list)
+            N = len(self._word_list)
         for i in range(N):
             self._word = random.choice(self._word_list)
             if duplicate is False:
                 while self._word in self.l:
                     self._word = random.choice(self._word_list)
             self.l.append(self._word)
-            
+
         if output_str:
             return "".join(self.l)
         return self.l
@@ -142,7 +142,7 @@ def __init__(self, name: str = "tnc"):
     def prob(self, t1: str, t2: str, t3: str):
         """
         probability word
-        
+
         :param int t1: text 1
         :param int t2: text 2
         :param int t3: text 3
@@ -165,9 +165,14 @@ def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, o
 
         for i in range(N):
             if duplicate:
-                self._temp = [j for j in self.ti_keys if j[:2] == self.late_word]
+                self._temp = [
+                    j for j in self.ti_keys if j[:2] == self.late_word
+                ]
             else:
-                self._temp = [j for j in self.ti_keys if j[:2] == self.late_word and j[1:] not in self.list_word]
+                self._temp = [
+                    j for j in self.ti_keys
+                    if j[:2] == self.late_word and j[1:] not in self.list_word
+                ]
             self._probs = [self.prob(l[0], l[1], l[2]) for l in self._temp]
             self._p2 = [j for j in self._probs if j >= prob]
             if len(self._p2) == 0:
diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index f157af03b..17fb5a094 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -26,15 +26,20 @@
 # get vocab
 thwiki = ""
 try:
-  thwiki =_THWIKI_LSTM
+    thwiki =_THWIKI_LSTM
 except:
-  thwiki = THWIKI_LSTM
+    thwiki = THWIKI_LSTM
 
 thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb'))
 thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
 
 # dummy databunch
-tt = Tokenizer(tok_func=ThaiTokenizer, lang='th', pre_rules=pre_rules_th, post_rules=post_rules_th)
+tt = Tokenizer(
+  tok_func = ThaiTokenizer,
+  lang = 'th',
+  pre_rules = pre_rules_th,
+  post_rules = post_rules_th
+)
 processor = [
   TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
   NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)
@@ -65,21 +70,33 @@
 )
 trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
 
-learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)
+learn = language_model_learner(
+  data_lm,
+  AWD_LSTM,
+  config=config,
+  pretrained=False,
+  **trn_args
+)
 
-#load pretrained models
+# load pretrained models
 learn.load_pretrained(**thwiki)
 
 
 def gen_sentence(
-  start_seq:str=None,
-  N:int=4,
-  prob:float=0.001,
-  output_str:bool = True
+  start_seq: str = None,
+  N: int = 4,
+  prob: float = 0.001,
+  output_str: bool = True
 ):
-  if start_seq is None:
-    start_seq = random.choice(list(thwiki_itos))
-  list_word = learn.predict(start_seq, N, temperature=0.8, min_p=prob, sep = '-*-').split('-*-')
-  if output_str:
-    return ''.join(list_word)
-  return list_word
+    if start_seq is None:
+      start_seq = random.choice(list(thwiki_itos))
+    list_word = learn.predict(
+      start_seq,
+      N,
+      temperature=0.8,
+      min_p=prob,
+      sep = '-*-'
+    ).split('-*-')
+    if output_str:
+      return ''.join(list_word)
+    return list_word

From 99774e250c87f62d68d45c96498227372a98a0ca Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 12 Jun 2021 13:17:33 +0700
Subject: [PATCH 14/26] Add test

---
 tests/test_generator.py |  6 ++++
 tests/test_ulmfit.py    | 61 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/tests/test_generator.py b/tests/test_generator.py
index 06bfe4631..af73d8ef8 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -10,18 +10,24 @@ class TestGeneratorPackage(unittest.TestCase):
     def test_unigram(self):
         _tnc_unigram = Unigram("tnc")
         self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_tnc_unigram.gen_sentence())
         _ttc_unigram = Unigram("ttc")
         self.assertIsNotNone(_ttc_unigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_ttc_unigram.gen_sentence())
         _oscar_unigram = Unigram("oscar")
         self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_oscar_unigram.gen_sentence())
 
     def test_bigram(self):
         _bigram = Bigram()
         self.assertIsNotNone(_bigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_bigram.gen_sentence())
 
     def test_tigram(self):
         _tigram = Tigram()
         self.assertIsNotNone(_tigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_tigram.gen_sentence())
 
     def test_thai2fit(self):
         self.assertIsNotNone(gen_sentence("กาลครั้งหนึ่งนานมาแล้ว"))
+        self.assertIsNotNone(gen_sentence())
diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py
index a713bd8b6..828fa9b7d 100644
--- a/tests/test_ulmfit.py
+++ b/tests/test_ulmfit.py
@@ -30,6 +30,15 @@
     ungroup_emoji,
 )
 from pythainlp.ulmfit.tokenizer import BaseTokenizer
+import pandas as pd
+import random
+import pickle
+# fastai
+import fastai
+from fastai.text import *
+
+# pythainlp
+from pythainlp.ulmfit import *
 
 
 class TestUlmfitPackage(unittest.TestCase):
@@ -198,3 +207,55 @@ def test_process_thai_dense(self):
         ]
 
         self.assertEqual(actual, expect)
+
+    def test_document_vector(self):
+        imdb = untar_data(URLs.IMDB_SAMPLE)
+        dummy_df = pd.read_csv(imdb/'texts.csv')
+        thwiki = ""
+        try:
+            thwiki =_THWIKI_LSTM
+        except:
+            thwiki = THWIKI_LSTM
+        thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb'))
+        thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
+        tt = Tokenizer(
+            tok_func = ThaiTokenizer,
+            lang = 'th',
+            pre_rules = pre_rules_th,
+            post_rules = post_rules_th
+        )
+        processor = [
+            TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
+            NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)
+        ]
+        data_lm = (
+            TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)
+            .split_by_rand_pct(0.2)
+            .label_for_lm()
+            .databunch(bs=64)
+        )
+        data_lm.sanity_check()
+        config = dict(
+            emb_sz=400,
+            n_hid=1550,
+            n_layers=4,
+            pad_token=1,
+            qrnn=False,
+            tie_weights=True,
+            out_bias=True,
+            output_p=0.25,
+            hidden_p=0.1,
+            input_p=0.2,
+            embed_p=0.02,
+            weight_p=0.15
+        )
+        trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
+        learn = language_model_learner(
+            data_lm,
+            AWD_LSTM,
+            config=config,
+            pretrained=False,
+            **trn_args
+        )
+        learn.load_pretrained(**thwiki)
+        self.assertIsNotNone(document_vector('วันนี้วันดีปีใหม่', learn, data_lm))

From 62739790c02591959036068b7f61e15feec1dd20 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 12 Jun 2021 13:49:57 +0700
Subject: [PATCH 15/26] Update test

---
 pythainlp/generator/core.py     | 59 ++++++++++++++++++++++++---------
 pythainlp/generator/thai2fit.py | 16 ++++-----
 tests/test_ulmfit.py            | 34 ++++++++++++++-----
 3 files changed, 77 insertions(+), 32 deletions(-)

diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index acdd40cfe..3900bfaae 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -30,10 +30,19 @@ def __init__(self, name: str = "tnc"):
         self.n = 0
         for i in self.word:
             self.n += self.counts[i]
-        self.prob = {i:self.counts[i] / self.n for i in self.word}
+        self.prob = {
+            i:self.counts[i] / self.n for i in self.word
+        }
         self._word_prob = {}
 
-    def gen_sentence(self, start_seq: str = None, N: int = 3,prob: float = 0.001, output_str: bool = True, duplicate: bool = False):
+    def gen_sentence(
+        self,
+        start_seq: str = None,
+        N: int = 3,
+        prob: float = 0.001,
+        output_str: bool = True,
+        duplicate: bool = False
+    ):
         """
         :param int N: number of word.
         :param str start_seq: word for begin word.
@@ -46,25 +55,35 @@ def gen_sentence(self, start_seq: str = None, N: int = 3,prob: float = 0.001, ou
         if start_seq is None:
             start_seq = random.choice(self.word)
         rand_text = start_seq.lower()
-        self._word_prob = {i:self.counts[i] / self.n for i in self.word if self.counts[i] / self.n >= prob}
-        return self.next_word(rand_text, N, output_str, prob = prob, duplicate = duplicate)
-
-    def next_word(self, text: str, N: int, output_str: str, prob: float, duplicate: bool = False):
-        self.l = []
-        self.l.append(text)
+        self._word_prob = {
+            i:self.counts[i] / self.n for i in self.word
+            if self.counts[i] / self.n >= prob
+        }
+        return self.next_word(rand_text, N, output_str, prob=prob, duplicate=duplicate)
+
+    def next_word(
+        self,
+        text: str,
+        N: int,
+        output_str: str,
+        prob: float,
+        duplicate: bool = False
+    ):
+        self.words = []
+        self.words.append(text)
         self._word_list = list(self._word_prob.keys())
         if N > len(self._word_list):
             N = len(self._word_list)
         for i in range(N):
             self._word = random.choice(self._word_list)
             if duplicate is False:
-                while self._word in self.l:
+                while self._word in self.words:
                     self._word = random.choice(self._word_list)
-            self.l.append(self._word)
+            self.words.append(self._word)
 
         if output_str:
-            return "".join(self.l)
-        return self.l
+            return "".join(self.words)
+        return self.words
 
 
 class Bigram:
@@ -78,7 +97,7 @@ def __init__(self, name: str = "tnc"):
             self.bi = tnc_word_freqs_bigram()
         self.uni_keys = list(self.uni.keys())
         self.bi_keys = list(self.bi.keys())
-        self.words = [i[-1]  for i in self.bi_keys]
+        self.words = [i[-1] for i in self.bi_keys]
 
     def prob(self, t1: str, t2: str):
         """
@@ -154,9 +173,17 @@ def prob(self, t1: str, t2: str, t3: str):
             v = self.ti[(t1, t2, t3)] / self.bi[(t1, t2)]
         except:
             v = 0.0
+
         return v
 
-    def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False):
+    def gen_sentence(
+        self,
+        start_seq: str = None,
+        N: int = 4,
+        prob: float = 0.001,
+        output_str: bool = True,
+        duplicate: bool = False
+    ):
         if start_seq is None:
             start_seq = random.choice(self.bi_keys)
         self.late_word = start_seq
@@ -173,7 +200,9 @@ def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, o
                     j for j in self.ti_keys
                     if j[:2] == self.late_word and j[1:] not in self.list_word
                 ]
-            self._probs = [self.prob(l[0], l[1], l[2]) for l in self._temp]
+            self._probs = [
+                self.prob(word[0], word[1], word[2]) for word in self._temp
+            ]
             self._p2 = [j for j in self._probs if j >= prob]
             if len(self._p2) == 0:
                 break
diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index 17fb5a094..a6b27274c 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -26,7 +26,7 @@
 # get vocab
 thwiki = ""
 try:
-    thwiki =_THWIKI_LSTM
+    thwiki = _THWIKI_LSTM
 except:
     thwiki = THWIKI_LSTM
 
@@ -35,10 +35,10 @@
 
 # dummy databunch
 tt = Tokenizer(
-  tok_func = ThaiTokenizer,
-  lang = 'th',
-  pre_rules = pre_rules_th,
-  post_rules = post_rules_th
+  tok_func=ThaiTokenizer,
+  lang='th',
+  pre_rules=pre_rules_th,
+  post_rules=post_rules_th
 )
 processor = [
   TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
@@ -89,14 +89,14 @@ def gen_sentence(
   output_str: bool = True
 ):
     if start_seq is None:
-      start_seq = random.choice(list(thwiki_itos))
+        start_seq = random.choice(list(thwiki_itos))
     list_word = learn.predict(
       start_seq,
       N,
       temperature=0.8,
       min_p=prob,
-      sep = '-*-'
+      sep='-*-'
     ).split('-*-')
     if output_str:
-      return ''.join(list_word)
+        return ''.join(list_word)
     return list_word
diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py
index 828fa9b7d..3f66681c2 100644
--- a/tests/test_ulmfit.py
+++ b/tests/test_ulmfit.py
@@ -213,23 +213,32 @@ def test_document_vector(self):
         dummy_df = pd.read_csv(imdb/'texts.csv')
         thwiki = ""
         try:
-            thwiki =_THWIKI_LSTM
+            thwiki = _THWIKI_LSTM
         except:
             thwiki = THWIKI_LSTM
         thwiki_itos = pickle.load(open(thwiki['itos_fname'], 'rb'))
         thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
         tt = Tokenizer(
-            tok_func = ThaiTokenizer,
-            lang = 'th',
-            pre_rules = pre_rules_th,
-            post_rules = post_rules_th
+            tok_func=ThaiTokenizer,
+            lang='th',
+            pre_rules=pre_rules_th,
+            post_rules=post_rules_th
         )
         processor = [
-            TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
-            NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)
+            TokenizeProcessor(
+                tokenizer=tt, chunksize=10000, mark_fields=False
+            ),
+            NumericalizeProcessor(
+                vocab=thwiki_vocab, max_vocab=60000, min_freq=3
+            )
         ]
         data_lm = (
-            TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)
+            TextList.from_df(
+                dummy_df,
+                imdb,
+                cols=['text'],
+                processor=processor
+            )
             .split_by_rand_pct(0.2)
             .label_for_lm()
             .databunch(bs=64)
@@ -258,4 +267,11 @@ def test_document_vector(self):
             **trn_args
         )
         learn.load_pretrained(**thwiki)
-        self.assertIsNotNone(document_vector('วันนี้วันดีปีใหม่', learn, data_lm))
+        self.assertIsNotNone(
+            document_vector('วันนี้วันดีปีใหม่', learn, data_lm)
+        )
+        self.assertIsNotNone(
+            document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg="sum")
+        )
+        with self.assertRaises(ValueError):
+            document_vector('วันนี้วันดีปีใหม่', learn, data_lm,agg='abc')

From b449f01241388c1fba7b7e8f76b42865489195cd Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 12 Jun 2021 13:54:26 +0700
Subject: [PATCH 16/26] Update pep8

---
 pythainlp/generator/core.py | 30 +++++++++++++++++++++++-------
 tests/test_ulmfit.py        |  2 +-
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index 3900bfaae..1af669325 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -31,7 +31,7 @@ def __init__(self, name: str = "tnc"):
         for i in self.word:
             self.n += self.counts[i]
         self.prob = {
-            i:self.counts[i] / self.n for i in self.word
+            i: self.counts[i] / self.n for i in self.word
         }
         self._word_prob = {}
 
@@ -56,10 +56,16 @@ def gen_sentence(
             start_seq = random.choice(self.word)
         rand_text = start_seq.lower()
         self._word_prob = {
-            i:self.counts[i] / self.n for i in self.word
+            i: self.counts[i] / self.n for i in self.word
             if self.counts[i] / self.n >= prob
         }
-        return self.next_word(rand_text, N, output_str, prob=prob, duplicate=duplicate)
+        return self.next_word(
+            rand_text,
+            N,
+            output_str,
+            prob=prob,
+            duplicate=duplicate
+        )
 
     def next_word(
         self,
@@ -115,8 +121,16 @@ def prob(self, t1: str, t2: str):
             v = 0.0
         return v
 
-    def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False):
-        if start_seq is None: start_seq = random.choice(self.words)
+    def gen_sentence(
+        self,
+        start_seq: str = None,
+        N: int = 4,
+        prob: float = 0.001,
+        output_str: bool = True,
+        duplicate: bool = False
+    ):
+        if start_seq is None:
+            start_seq = random.choice(self.words)
         self.late_word = start_seq
         self.list_word = []
         self.list_word.append(start_seq)
@@ -129,9 +143,11 @@ def gen_sentence(self, start_seq: str = None, N: int = 4, prob: float = 0.001, o
             else:
                 self._temp = [
                     j for j in self.bi_keys
-                    if j[0]==self.late_word and j[1] not in self.list_word
+                    if j[0] == self.late_word and j[1] not in self.list_word
                 ]
-            self._probs = [self.prob(self.late_word, l[-1]) for l in self._temp]
+            self._probs = [
+                self.prob(self.late_word, next_word[-1]) for next_word in self._temp
+            ]
             self._p2 = [j for j in self._probs if j >= prob]
             if len(self._p2) == 0:
                 break
diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py
index 3f66681c2..3aa807704 100644
--- a/tests/test_ulmfit.py
+++ b/tests/test_ulmfit.py
@@ -274,4 +274,4 @@ def test_document_vector(self):
             document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg="sum")
         )
         with self.assertRaises(ValueError):
-            document_vector('วันนี้วันดีปีใหม่', learn, data_lm,agg='abc')
+            document_vector('วันนี้วันดีปีใหม่', learn, data_lm, agg='abc')

From 83c54a84ad2be46718ab817946c1632efb804311 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 12 Jun 2021 14:05:17 +0700
Subject: [PATCH 17/26] Add docs

---
 pythainlp/generator/core.py     | 48 ++++++++++++++++++++++++++-------
 pythainlp/generator/thai2fit.py | 14 +++++++++-
 2 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index 1af669325..2df135a10 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -12,12 +12,18 @@
 from pythainlp.corpus.oscar import (
     unigram_word_freqs as oscar_word_freqs_unigram
 )
+from typing import List, Union
 
 
 class Unigram:
     def __init__(self, name: str = "tnc"):
         """
+        Text generator using Unigram
+
         :param str name: corpus name
+            * *tnc* - Thai National Corpus (default)
+            * *ttc* - Thai Textbook Corpus (TTC)
+            * *oscar* - OSCAR Corpus
         :rtype: None
         """
         if name == "tnc":
@@ -42,15 +48,15 @@ def gen_sentence(
         prob: float = 0.001,
         output_str: bool = True,
         duplicate: bool = False
-    ):
+    ) -> Union[List[str], str]:
         """
-        :param int N: number of word.
         :param str start_seq: word for begin word.
+        :param int N: number of word.
         :param bool output_str: output is str
         :param bool duplicate: duplicate word in sent
 
         :return: list words or str words
-        :rtype: str,list
+        :rtype: List[str], str
         """
         if start_seq is None:
             start_seq = random.choice(self.word)
@@ -59,7 +65,7 @@ def gen_sentence(
             i: self.counts[i] / self.n for i in self.word
             if self.counts[i] / self.n >= prob
         }
-        return self.next_word(
+        return self._next_word(
             rand_text,
             N,
             output_str,
@@ -67,7 +73,7 @@ def gen_sentence(
             duplicate=duplicate
         )
 
-    def next_word(
+    def _next_word(
         self,
         text: str,
         N: int,
@@ -95,7 +101,10 @@ def next_word(
 class Bigram:
     def __init__(self, name: str = "tnc"):
         """
+        Text generator using Bigram
+
         :param str name: corpus name
+            * *tnc* - Thai National Corpus (default)
         :rtype: None
         """
         if name == "tnc":
@@ -105,7 +114,7 @@ def __init__(self, name: str = "tnc"):
         self.bi_keys = list(self.bi.keys())
         self.words = [i[-1] for i in self.bi_keys]
 
-    def prob(self, t1: str, t2: str):
+    def prob(self, t1: str, t2: str) -> float:
         """
         probability word
 
@@ -128,7 +137,16 @@ def gen_sentence(
         prob: float = 0.001,
         output_str: bool = True,
         duplicate: bool = False
-    ):
+    ) -> Union[List[str], str]:
+        """
+        :param str start_seq: word for begin word.
+        :param int N: number of word.
+        :param bool output_str: output is str
+        :param bool duplicate: duplicate word in sent
+
+        :return: list words or str words
+        :rtype: List[str], str
+        """
         if start_seq is None:
             start_seq = random.choice(self.words)
         self.late_word = start_seq
@@ -162,7 +180,10 @@ def gen_sentence(
 class Tigram:
     def __init__(self, name: str = "tnc"):
         """
+        Text generator using Tigram
+
         :param str name: corpus name
+            * *tnc* - Thai National Corpus (default)
         :rtype: None
         """
         if name == "tnc":
@@ -174,7 +195,7 @@ def __init__(self, name: str = "tnc"):
         self.ti_keys = list(self.ti.keys())
         self.words = [i[-1] for i in self.bi_keys]
 
-    def prob(self, t1: str, t2: str, t3: str):
+    def prob(self, t1: str, t2: str, t3: str) -> float:
         """
         probability word
 
@@ -199,7 +220,16 @@ def gen_sentence(
         prob: float = 0.001,
         output_str: bool = True,
         duplicate: bool = False
-    ):
+    ) -> Union[List[str], str]:
+        """
+        :param str start_seq: word for begin word.
+        :param int N: number of word.
+        :param bool output_str: output is str
+        :param bool duplicate: duplicate word in sent
+
+        :return: list words or str words
+        :rtype: List[str], str
+        """
         if start_seq is None:
             start_seq = random.choice(self.bi_keys)
         self.late_word = start_seq
diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generator/thai2fit.py
index a6b27274c..ee45ae22c 100644
--- a/pythainlp/generator/thai2fit.py
+++ b/pythainlp/generator/thai2fit.py
@@ -11,6 +11,7 @@
 import pandas as pd
 import random
 import pickle
+from typing import List, Union
 
 # fastai
 import fastai
@@ -87,7 +88,18 @@ def gen_sentence(
   N: int = 4,
   prob: float = 0.001,
   output_str: bool = True
-):
+) -> Union[List[str], str]:
+    """
+    Text generator using Thai2fit
+
+    :param str start_seq: word for begin word.
+    :param int N: number of word.
+    :param bool output_str: output is str
+    :param float prob: minimum probability of a word (passed as min_p)
+
+    :return: list words or str words
+    :rtype: List[str], str
+    """
     if start_seq is None:
         start_seq = random.choice(list(thwiki_itos))
     list_word = learn.predict(

From 6c64531f00f1ff79363c0c2cbe7f5dbfdab490bc Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 12 Jun 2021 14:15:15 +0700
Subject: [PATCH 18/26] Add test

---
 tests/test_generator.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/test_generator.py b/tests/test_generator.py
index af73d8ef8..adc781cf1 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -10,23 +10,33 @@ class TestGeneratorPackage(unittest.TestCase):
     def test_unigram(self):
         _tnc_unigram = Unigram("tnc")
         self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม", output_str=False))
         self.assertIsNotNone(_tnc_unigram.gen_sentence())
+        self.assertIsNotNone(_tnc_unigram.gen_sentence(duplicate=True))
         _ttc_unigram = Unigram("ttc")
         self.assertIsNotNone(_ttc_unigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_ttc_unigram.gen_sentence("ผม", output_str=False))
         self.assertIsNotNone(_ttc_unigram.gen_sentence())
+        self.assertIsNotNone(_ttc_unigram.gen_sentence(duplicate=True))
         _oscar_unigram = Unigram("oscar")
         self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม", output_str=False))
         self.assertIsNotNone(_oscar_unigram.gen_sentence())
+        self.assertIsNotNone(_oscar_unigram.gen_sentence(duplicate=True))
 
     def test_bigram(self):
         _bigram = Bigram()
         self.assertIsNotNone(_bigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_bigram.gen_sentence("ผม", output_str=False))
         self.assertIsNotNone(_bigram.gen_sentence())
+        self.assertIsNotNone(_bigram.gen_sentence(duplicate=True))
 
     def test_tigram(self):
         _tigram = Tigram()
         self.assertIsNotNone(_tigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_tigram.gen_sentence("ผม", output_str=False))
         self.assertIsNotNone(_tigram.gen_sentence())
+        self.assertIsNotNone(_tigram.gen_sentence(duplicate=True))
 
     def test_thai2fit(self):
         self.assertIsNotNone(gen_sentence("กาลครั้งหนึ่งนานมาแล้ว"))

From 0420bf6516a595e5df0468b22eebd58e7afd54ce Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sun, 13 Jun 2021 12:10:24 +0700
Subject: [PATCH 19/26] fixed pep8

---
 pythainlp/generator/core.py | 4 +++-
 tests/test_generator.py     | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/pythainlp/generator/core.py b/pythainlp/generator/core.py
index 2df135a10..9afb98218 100644
--- a/pythainlp/generator/core.py
+++ b/pythainlp/generator/core.py
@@ -164,7 +164,9 @@ def gen_sentence(
                     if j[0] == self.late_word and j[1] not in self.list_word
                 ]
             self._probs = [
-                self.prob(self.late_word, next_word[-1]) for next_word in self._temp
+                self.prob(
+                    self.late_word, next_word[-1]
+                ) for next_word in self._temp
             ]
             self._p2 = [j for j in self._probs if j >= prob]
             if len(self._p2) == 0:
diff --git a/tests/test_generator.py b/tests/test_generator.py
index adc781cf1..ea57a66b0 100644
--- a/tests/test_generator.py
+++ b/tests/test_generator.py
@@ -20,7 +20,9 @@ def test_unigram(self):
         self.assertIsNotNone(_ttc_unigram.gen_sentence(duplicate=True))
         _oscar_unigram = Unigram("oscar")
         self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม"))
-        self.assertIsNotNone(_oscar_unigram.gen_sentence("ผม", output_str=False))
+        self.assertIsNotNone(
+            _oscar_unigram.gen_sentence("ผม", output_str=False)
+        )
         self.assertIsNotNone(_oscar_unigram.gen_sentence())
         self.assertIsNotNone(_oscar_unigram.gen_sentence(duplicate=True))
 

From b46e986b0dfd085255a53663922954449e99e72c Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Mon, 14 Jun 2021 18:17:00 +0700
Subject: [PATCH 20/26] Change generator to generate

---
 docs/api/generate.rst                         | 16 ++++++++++++++++
 docs/api/generator.rst                        | 16 ----------------
 pythainlp/{generator => generate}/__init__.py |  4 ++--
 pythainlp/{generator => generate}/core.py     |  0
 pythainlp/{generator => generate}/thai2fit.py |  0
 setup.py                                      |  2 +-
 tests/{test_generator.py => test_generate.py} |  6 +++---
 7 files changed, 22 insertions(+), 22 deletions(-)
 create mode 100644 docs/api/generate.rst
 delete mode 100644 docs/api/generator.rst
 rename pythainlp/{generator => generate}/__init__.py (52%)
 rename pythainlp/{generator => generate}/core.py (100%)
 rename pythainlp/{generator => generate}/thai2fit.py (100%)
 rename tests/{test_generator.py => test_generate.py} (92%)

diff --git a/docs/api/generate.rst b/docs/api/generate.rst
new file mode 100644
index 000000000..9d450862d
--- /dev/null
+++ b/docs/api/generate.rst
@@ -0,0 +1,16 @@
+.. currentmodule:: pythainlp.generate
+
+pythainlp.generate
+==================
+The :mod:`pythainlp.generate` module provides Thai text generation with PyThaiNLP.
+
+Modules
+-------
+
+.. autoclass:: Unigram
+    :members:
+.. autoclass:: Bigram
+    :members:
+.. autoclass:: Tigram
+    :members:
+.. autofunction:: pythainlp.generate.thai2fit.gen_sentence
\ No newline at end of file
diff --git a/docs/api/generator.rst b/docs/api/generator.rst
deleted file mode 100644
index cd8252579..000000000
--- a/docs/api/generator.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-.. currentmodule:: pythainlp.generator
-
-pythainlp.generator
-===================
-The :class:`pythainlp.generator` is Thai text generator with PyThaiNLP.
-
-Modules
--------
-
-.. autoclass:: Unigram
-    :members:
-.. autoclass:: Bigram
-    :members:
-.. autoclass:: Tigram
-    :members:
-.. autofunction:: pythainlp.generator.thai2fit.gen_sentence
\ No newline at end of file
diff --git a/pythainlp/generator/__init__.py b/pythainlp/generate/__init__.py
similarity index 52%
rename from pythainlp/generator/__init__.py
rename to pythainlp/generate/__init__.py
index cb18dd716..9cd578864 100644
--- a/pythainlp/generator/__init__.py
+++ b/pythainlp/generate/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Thai Text generator
+Thai text generation
 """
 
 __all__ = [
@@ -9,4 +9,4 @@
     "Tigram"
 ]
 
-from pythainlp.generator.core import Unigram, Bigram, Tigram
+from pythainlp.generate.core import Unigram, Bigram, Tigram
diff --git a/pythainlp/generator/core.py b/pythainlp/generate/core.py
similarity index 100%
rename from pythainlp/generator/core.py
rename to pythainlp/generate/core.py
diff --git a/pythainlp/generator/thai2fit.py b/pythainlp/generate/thai2fit.py
similarity index 100%
rename from pythainlp/generator/thai2fit.py
rename to pythainlp/generate/thai2fit.py
diff --git a/setup.py b/setup.py
index 6421b5638..a0c8ac39d 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@
     "wangchanberta": ["transformers", "sentencepiece"],
     "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"],
     "wordnet": ["nltk>=3.3.*"],
-    "text_generator": ["fastai<2.0"],
+    "generate": ["fastai<2.0"],
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
diff --git a/tests/test_generator.py b/tests/test_generate.py
similarity index 92%
rename from tests/test_generator.py
rename to tests/test_generate.py
index ea57a66b0..f097bbb30 100644
--- a/tests/test_generator.py
+++ b/tests/test_generate.py
@@ -2,11 +2,11 @@
 
 import unittest
 
-from pythainlp.generator import Unigram, Bigram, Tigram
-from pythainlp.generator.thai2fit import gen_sentence
+from pythainlp.generate import Unigram, Bigram, Tigram
+from pythainlp.generate.thai2fit import gen_sentence
 
 
-class TestGeneratorPackage(unittest.TestCase):
+class TestGeneratePackage(unittest.TestCase):
     def test_unigram(self):
         _tnc_unigram = Unigram("tnc")
         self.assertIsNotNone(_tnc_unigram.gen_sentence("ผม"))

From e8edeaa83abb83c8cbb669d36fc114d2d3209d19 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 21 Jun 2021 06:57:07 +0100
Subject: [PATCH 21/26] Update tnc.py

---
 pythainlp/corpus/tnc.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
index 2dfeb8690..77251ae64 100644
--- a/pythainlp/corpus/tnc.py
+++ b/pythainlp/corpus/tnc.py
@@ -2,7 +2,7 @@
 """
 Thai National Corpus word frequency
 
-Credit: Korakot Chaovavanich‎
+Credit: Korakot Chaovavanich
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
 
@@ -10,7 +10,7 @@
     "word_freqs",
     "unigram_word_freqs",
     "bigram_word_freqs",
-    "tigram_word_freqs"
+    "trigram_word_freqs"
 ]
 
 from collections import defaultdict
@@ -22,7 +22,7 @@
 
 _FILENAME = "tnc_freq.txt"
 _BIGRAM = "tnc_bigram_word_freqs"
-_TIGRAM = "tnc_tigram_word_freqs"
+_TRIGRAM = "tnc_trigram_word_freqs"
 
 
 def word_freqs() -> List[Tuple[str, int]]:
@@ -69,11 +69,11 @@ def bigram_word_freqs() -> defaultdict:
     return _word_freqs
 
 
-def tigram_word_freqs() -> defaultdict:
+def trigram_word_freqs() -> defaultdict:
     """
-    Get tigram word frequency from Thai National Corpus (TNC)
+    Get trigram word frequency from Thai National Corpus (TNC)
     """
-    _path = get_corpus_path(_TIGRAM)
+    _path = get_corpus_path(_TRIGRAM)
     _word_freqs = defaultdict(int)
     with open(_path, "r", encoding="utf-8-sig") as fh:
         for i in fh.readlines():

From 26065c6c2e0b6fbd6b4d5b917a608a69f49707f4 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 21 Jun 2021 06:58:34 +0100
Subject: [PATCH 22/26] Update core.py

---
 pythainlp/generate/core.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pythainlp/generate/core.py b/pythainlp/generate/core.py
index 9afb98218..17cb57e5d 100644
--- a/pythainlp/generate/core.py
+++ b/pythainlp/generate/core.py
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-
 """
-Text generator using Unigram, Bigram and Tigram
+Text generator using n-gram language model
 
-code from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+code from
+https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
 """
 import random
 from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
 from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
-from pythainlp.corpus.tnc import tigram_word_freqs as tnc_word_freqs_tigram
+from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram
 from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram
 from pythainlp.corpus.oscar import (
     unigram_word_freqs as oscar_word_freqs_unigram
@@ -179,10 +180,10 @@ def gen_sentence(
         return self.list_word
 
 
-class Tigram:
+class Trigram:
     def __init__(self, name: str = "tnc"):
         """
-        Text generator using Tigram
+        Text generator using Trigram
 
         :param str name: corpus name
             * *tnc* - Thai National Corpus (default)
@@ -191,7 +192,7 @@ def __init__(self, name: str = "tnc"):
         if name == "tnc":
             self.uni = tnc_word_freqs_unigram()
             self.bi = tnc_word_freqs_bigram()
-            self.ti = tnc_word_freqs_tigram()
+            self.ti = tnc_word_freqs_trigram()
         self.uni_keys = list(self.uni.keys())
         self.bi_keys = list(self.bi.keys())
         self.ti_keys = list(self.ti.keys())

From 892475f7ebcaa296d20714c9f71b9ec9758f0564 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 21 Jun 2021 07:00:14 +0100
Subject: [PATCH 23/26] Update thai2fit.py

---
 pythainlp/generate/thai2fit.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pythainlp/generate/thai2fit.py b/pythainlp/generate/thai2fit.py
index ee45ae22c..b07368360 100644
--- a/pythainlp/generate/thai2fit.py
+++ b/pythainlp/generate/thai2fit.py
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 """
-Thai2fit : Thai Wiki Language Model for Text Generation
+Thai2fit: Thai Wikipedia Language Model for Text Generation
 
-Code from https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb
+Code from
+https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb
 """
 __all__ = [
     "gen_sentence"

From 5c56376312b644de452d0af5ab7c94bd32a45eab Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Mon, 21 Jun 2021 14:07:04 +0700
Subject: [PATCH 24/26] Update test

---
 docs/api/corpus.rst            |  2 +-
 docs/api/generate.rst          |  2 +-
 pythainlp/generate/__init__.py |  4 ++--
 tests/test_corpus.py           |  2 +-
 tests/test_generate.py         | 14 +++++++-------
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst
index eb8d1bf16..686d5e5e3 100644
--- a/docs/api/corpus.rst
+++ b/docs/api/corpus.rst
@@ -37,7 +37,7 @@ TNC
 .. autofunction:: pythainlp.corpus.tnc.word_freqs
 .. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs
 .. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs
-.. autofunction:: pythainlp.corpus.tnc.tigram_word_freqs
+.. autofunction:: pythainlp.corpus.tnc.trigram_word_freqs
 
 TTC
 ---
diff --git a/docs/api/generate.rst b/docs/api/generate.rst
index 9d450862d..02459dfc3 100644
--- a/docs/api/generate.rst
+++ b/docs/api/generate.rst
@@ -11,6 +11,6 @@ Modules
     :members:
 .. autoclass:: Bigram
     :members:
-.. autoclass:: Tigram
+.. autoclass:: Trigram
     :members:
 .. autofunction:: pythainlp.generate.thai2fit.gen_sentence
\ No newline at end of file
diff --git a/pythainlp/generate/__init__.py b/pythainlp/generate/__init__.py
index 9cd578864..fffac652c 100644
--- a/pythainlp/generate/__init__.py
+++ b/pythainlp/generate/__init__.py
@@ -6,7 +6,7 @@
 __all__ = [
     "Unigram",
     "Bigram",
-    "Tigram"
+    "Trigram"
 ]
 
-from pythainlp.generate.core import Unigram, Bigram, Tigram
+from pythainlp.generate.core import Unigram, Bigram, Trigram
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index 26196129e..a5bdb108a 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -108,7 +108,7 @@ def test_tnc(self):
         self.assertIsNotNone(tnc.word_freqs())
         self.assertIsNotNone(tnc.unigram_word_freqs())
         self.assertIsNotNone(tnc.bigram_word_freqs())
-        self.assertIsNotNone(tnc.tigram_word_freqs())
+        self.assertIsNotNone(tnc.trigram_word_freqs())
 
     def test_ttc(self):
         self.assertIsNotNone(ttc.word_freqs())
diff --git a/tests/test_generate.py b/tests/test_generate.py
index f097bbb30..6405c679e 100644
--- a/tests/test_generate.py
+++ b/tests/test_generate.py
@@ -2,7 +2,7 @@
 
 import unittest
 
-from pythainlp.generate import Unigram, Bigram, Tigram
+from pythainlp.generate import Unigram, Bigram, Trigram
 from pythainlp.generate.thai2fit import gen_sentence
 
 
@@ -33,12 +33,12 @@ def test_bigram(self):
         self.assertIsNotNone(_bigram.gen_sentence())
         self.assertIsNotNone(_bigram.gen_sentence(duplicate=True))
 
-    def test_tigram(self):
-        _tigram = Tigram()
-        self.assertIsNotNone(_tigram.gen_sentence("ผม"))
-        self.assertIsNotNone(_tigram.gen_sentence("ผม", output_str=False))
-        self.assertIsNotNone(_tigram.gen_sentence())
-        self.assertIsNotNone(_tigram.gen_sentence(duplicate=True))
+    def test_trigram(self):
+        _trigram = Trigram()
+        self.assertIsNotNone(_trigram.gen_sentence("ผม"))
+        self.assertIsNotNone(_trigram.gen_sentence("ผม", output_str=False))
+        self.assertIsNotNone(_trigram.gen_sentence())
+        self.assertIsNotNone(_trigram.gen_sentence(duplicate=True))
 
     def test_thai2fit(self):
         self.assertIsNotNone(gen_sentence("กาลครั้งหนึ่งนานมาแล้ว"))

From 82fea2e96c9482fcca7cadb1cdf37de39d814eaf Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Thu, 24 Jun 2021 13:10:13 +0700
Subject: [PATCH 25/26] Update tnc.py

---
 pythainlp/corpus/tnc.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
index 77251ae64..5f80ab972 100644
--- a/pythainlp/corpus/tnc.py
+++ b/pythainlp/corpus/tnc.py
@@ -1,9 +1,6 @@
 # -*- coding: utf-8 -*-
 """
 Thai National Corpus word frequency
-
-Credit: Korakot Chaovavanich
-https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
 
 __all__ = [
@@ -30,6 +27,8 @@ def word_freqs() -> List[Tuple[str, int]]:
     Get word frequency from Thai National Corpus (TNC)
     \n(See: `dev/pythainlp/corpus/tnc_freq.txt\
     <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_)
+
+    Credit: Korakot Chaovavanich https://bit.ly/3wSkZsF
     """
     lines = list(get_corpus(_FILENAME))
     word_freqs = []

From fb0d5d236a95b7912223a332e684c1cc0ebeae9f Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 17 Jul 2021 23:56:54 +0700
Subject: [PATCH 26/26] Add docs

---
 pythainlp/generate/core.py     | 73 +++++++++++++++++++++++-----------
 pythainlp/generate/thai2fit.py | 11 +++++
 2 files changed, 61 insertions(+), 23 deletions(-)

diff --git a/pythainlp/generate/core.py b/pythainlp/generate/core.py
index 17cb57e5d..7e513c287 100644
--- a/pythainlp/generate/core.py
+++ b/pythainlp/generate/core.py
@@ -17,16 +17,15 @@
 
 
 class Unigram:
-    def __init__(self, name: str = "tnc"):
-        """
-        Text generator using Unigram
+    """
+    Text generator using Unigram
 
-        :param str name: corpus name
-            * *tnc* - Thai National Corpus (default)
-            * *ttc* - Thai Textbook Corpus (TTC)
-            * *oscar* - OSCAR Corpus
-        :rtype: None
-        """
+    :param str name: corpus name
+        * *tnc* - Thai National Corpus (default)
+        * *ttc* - Thai Textbook Corpus (TTC)
+        * *oscar* - OSCAR Corpus
+    """
+    def __init__(self, name: str = "tnc"):
         if name == "tnc":
             self.counts = tnc_word_freqs_unigram()
         elif name == "ttc":
@@ -58,6 +57,16 @@ def gen_sentence(
 
         :return: list words or str words
         :rtype: List[str], str
+
+        :Example:
+        ::
+
+            from pythainlp.generate import Unigram
+
+            gen = Unigram()
+
+            gen.gen_sentence("แมว")
+            # output: 'แมวเวลานะนั้น'
         """
         if start_seq is None:
             start_seq = random.choice(self.word)
@@ -100,14 +109,13 @@ def _next_word(
 
 
 class Bigram:
-    def __init__(self, name: str = "tnc"):
-        """
-        Text generator using Bigram
+    """
+    Text generator using Bigram
 
-        :param str name: corpus name
-            * *tnc* - Thai National Corpus (default)
-        :rtype: None
-        """
+    :param str name: corpus name
+        * *tnc* - Thai National Corpus (default)
+    """
+    def __init__(self, name: str = "tnc"):
         if name == "tnc":
             self.uni = tnc_word_freqs_unigram()
             self.bi = tnc_word_freqs_bigram()
@@ -147,6 +155,16 @@ def gen_sentence(
 
         :return: list words or str words
         :rtype: List[str], str
+
+        :Example:
+        ::
+
+            from pythainlp.generate import Bigram
+
+            gen = Bigram()
+
+            gen.gen_sentence("แมว")
+            # output: 'แมวไม่ได้รับเชื้อมัน'
         """
         if start_seq is None:
             start_seq = random.choice(self.words)
@@ -181,14 +199,13 @@ def gen_sentence(
 
 
 class Trigram:
-    def __init__(self, name: str = "tnc"):
-        """
-        Text generator using Trigram
+    """
+    Text generator using Trigram
 
-        :param str name: corpus name
-            * *tnc* - Thai National Corpus (default)
-        :rtype: None
-        """
+    :param str name: corpus name
+        * *tnc* - Thai National Corpus (default)
+    """
+    def __init__(self, name: str = "tnc"):
         if name == "tnc":
             self.uni = tnc_word_freqs_unigram()
             self.bi = tnc_word_freqs_bigram()
@@ -232,6 +249,16 @@ def gen_sentence(
 
         :return: list words or str words
         :rtype: List[str], str
+
+        :Example:
+        ::
+
+            from pythainlp.generate import Trigram
+
+            gen = Trigram()
+
+            gen.gen_sentence()
+            # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
         """
         if start_seq is None:
             start_seq = random.choice(self.bi_keys)
diff --git a/pythainlp/generate/thai2fit.py b/pythainlp/generate/thai2fit.py
index b07368360..f299c6648 100644
--- a/pythainlp/generate/thai2fit.py
+++ b/pythainlp/generate/thai2fit.py
@@ -100,6 +100,17 @@ def gen_sentence(
 
     :return: list words or str words
     :rtype: List[str], str
+
+    :Example:
+    ::
+
+      from pythainlp.generate.thai2fit import gen_sentence
+
+      gen_sentence()
+      # output: 'แคทรียา อิงลิช  (นักแสดง'
+
+      gen_sentence("แมว")
+      # output: 'แมว คุณหลวง '
     """
     if start_seq is None:
         start_seq = random.choice(list(thwiki_itos))