PyThaiNLP · wannaphong · Jul 21, 2021 · Jun 11, 2021 · Jun 11, 2021 · Jun 11, 2021
diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst
@@ -36,11 +36,21 @@ TNC
 ---
 
 .. autofunction:: pythainlp.corpus.tnc.word_freqs
+.. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs
+.. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs
+.. autofunction:: pythainlp.corpus.tnc.trigram_word_freqs
 
 TTC
 ---
 
 .. autofunction:: pythainlp.corpus.ttc.word_freqs
+.. autofunction:: pythainlp.corpus.ttc.unigram_word_freqs
+
+OSCAR
+-----
+
+.. autofunction:: pythainlp.corpus.oscar.word_freqs
+.. autofunction:: pythainlp.corpus.oscar.unigram_word_freqs
 
 Util
 ----

diff --git a/docs/api/generate.rst b/docs/api/generate.rst
@@ -0,0 +1,16 @@
+.. currentmodule:: pythainlp.generate
+
+pythainlp.generate
+==================
+The :mod:`pythainlp.generate` module generates Thai text with PyThaiNLP.
+
+Modules
+-------
+
+.. autoclass:: Unigram
+    :members:
+.. autoclass:: Bigram
+    :members:
+.. autoclass:: Trigram
+    :members:
+.. autofunction:: pythainlp.generate.thai2fit.gen_sentence
diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+"""
+Thai unigram word frequency from the OSCAR corpus (tokenized with ICU)
+
+Credit: Korakot Chaovavanich
+https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/
+"""
+
+__all__ = [
+    "word_freqs",
+    "unigram_word_freqs"
+]
+
+from collections import defaultdict
+from typing import List, Tuple
+
+from pythainlp.corpus import get_corpus_path
+
+_FILENAME = "oscar_icu"
+
+
+def word_freqs() -> List[Tuple[str, int]]:
+    """
+    Get word frequency from OSCAR Corpus (icu word tokenize)
+
+    :return: (word, count) pairs; the " " token is reported as "<s/>"
+    """
+    freqs = []
+    with open(get_corpus_path(_FILENAME), "r", encoding="utf-8") as f:
+        next(f, None)  # drop the first line; no-op if the file is empty
+        for line in f:
+            # Strip only the line ending: strip() would erase a " " word.
+            _temp = line.rstrip("\r\n").split(",")
+            if len(_temp) < 2 or '"' in _temp[0]:
+                continue  # short line, or word containing a double quote
+            word = "<s/>" if _temp[0] == " " else _temp[0]
+            freqs.append((word, int(_temp[1])))
+
+    return freqs
+
+
+def unigram_word_freqs() -> defaultdict:
+    """
+    Get unigram word frequency from OSCAR Corpus (icu word tokenize);
+    the " " token is keyed as "<s/>".
+    """
+    _word_freqs = defaultdict(int)
+    with open(get_corpus_path(_FILENAME), "r", encoding="utf-8-sig") as fh:
+        next(fh, None)  # drop the first line; no-op if the file is empty
+        for line in fh:
+            # Strip only the line ending: strip() would erase a " " word.
+            _temp = line.rstrip("\r\n").split(",")
+            if len(_temp) < 2 or '"' in _temp[0]:
+                continue  # short line, or word containing a double quote
+            word = "<s/>" if _temp[0] == " " else _temp[0]
+            _word_freqs[word] = int(_temp[-1])
+
+    return _word_freqs
diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
@@ -1,25 +1,34 @@
 # -*- coding: utf-8 -*-
 """
 Thai National Corpus word frequency
-
-Credit: Korakot Chaovavanich‎
-https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
 
-__all__ = ["word_freqs"]
+__all__ = [
+    "word_freqs",
+    "unigram_word_freqs",
+    "bigram_word_freqs",
+    "trigram_word_freqs"
+]
 
+from collections import defaultdict
 from typing import List, Tuple
 
 from pythainlp.corpus import get_corpus
+from pythainlp.corpus import get_corpus_path
+
 
 _FILENAME = "tnc_freq.txt"
+_BIGRAM = "tnc_bigram_word_freqs"
+_TRIGRAM = "tnc_trigram_word_freqs"
 
 
 def word_freqs() -> List[Tuple[str, int]]:
     """
     Get word frequency from Thai National Corpus (TNC)
     \n(See: `dev/pythainlp/corpus/tnc_freq.txt\
     <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_)
+
+    Credit: Korakot Chaovavanich https://bit.ly/3wSkZsF
     """
     lines = list(get_corpus(_FILENAME))
     word_freqs = []
@@ -29,3 +38,45 @@ def word_freqs() -> List[Tuple[str, int]]:
             word_freqs.append((word_freq[0], int(word_freq[1])))
 
     return word_freqs
+
+
+def unigram_word_freqs() -> defaultdict:
+    """
+    Get unigram word frequency from Thai National Corpus (TNC)
+
+    :return: word -> count mapping built from tab-separated corpus lines
+    """
+    _word_freqs = defaultdict(int)
+    for line in get_corpus(_FILENAME):
+        _temp = line.strip().split("\t")  # escape, not a raw tab character
+        if len(_temp) >= 2:
+            _word_freqs[_temp[0]] = int(_temp[-1])
+    return _word_freqs
+
+
+def bigram_word_freqs() -> defaultdict:
+    """
+    Get bigram word frequency from Thai National Corpus (TNC)
+    as a (word1, word2) -> count mapping; short lines are skipped.
+    """
+    _word_freqs = defaultdict(int)
+    with open(get_corpus_path(_BIGRAM), "r", encoding="utf-8-sig") as fh:
+        for line in fh:
+            _temp = line.strip().split("\t")
+            if len(_temp) >= 3:  # two words + count; skips blank lines
+                _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])
+    return _word_freqs
+
+
+def trigram_word_freqs() -> defaultdict:
+    """
+    Get trigram word frequency from Thai National Corpus (TNC)
+    as a (word1, word2, word3) -> count mapping; short lines are skipped.
+    """
+    _word_freqs = defaultdict(int)
+    with open(get_corpus_path(_TRIGRAM), "r", encoding="utf-8-sig") as fh:
+        for line in fh:
+            _temp = line.strip().split("\t")
+            if len(_temp) >= 4:  # three words + count; skips blank lines
+                _word_freqs[(_temp[0], _temp[1], _temp[2])] = int(_temp[-1])
+    return _word_freqs
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
@@ -6,8 +6,12 @@
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
 
-__all__ = ["word_freqs"]
+__all__ = [
+    "word_freqs",
+    "unigram_word_freqs"
+]
 
+from collections import defaultdict
 from typing import List, Tuple
 
 from pythainlp.corpus import get_corpus
@@ -29,3 +33,17 @@ def word_freqs() -> List[Tuple[str, int]]:
             word_freqs.append((word_freq[0], int(word_freq[1])))
 
     return word_freqs
+
+
+def unigram_word_freqs() -> defaultdict:
+    """
+    Get unigram word frequency from Thai Textbook Corpus (TTC)
+
+    :return: word -> count mapping built from tab-separated corpus lines
+    """
+    _word_freqs = defaultdict(int)
+    for line in get_corpus(_FILENAME):
+        _temp = line.strip().split("\t")  # escape, not a raw tab character
+        if len(_temp) >= 2:
+            _word_freqs[_temp[0]] = int(_temp[-1])
+    return _word_freqs
diff --git a/pythainlp/generate/__init__.py b/pythainlp/generate/__init__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+Thai text generation
+"""
+
+__all__ = [
+    "Unigram",
+    "Bigram",
+    "Trigram"
+]
+
+from pythainlp.generate.core import Unigram, Bigram, Trigram