Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/api/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,21 @@ TNC
---

.. autofunction:: pythainlp.corpus.tnc.word_freqs
.. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs
.. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs
.. autofunction:: pythainlp.corpus.tnc.trigram_word_freqs

TTC
---

.. autofunction:: pythainlp.corpus.ttc.word_freqs
.. autofunction:: pythainlp.corpus.ttc.unigram_word_freqs

OSCAR
-----

.. autofunction:: pythainlp.corpus.oscar.word_freqs
.. autofunction:: pythainlp.corpus.oscar.unigram_word_freqs

Util
----
Expand Down
16 changes: 16 additions & 0 deletions docs/api/generate.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.. currentmodule:: pythainlp.generate

pythainlp.generate
==================
The :mod:`pythainlp.generate` module provides Thai text generation with PyThaiNLP.

Modules
-------

.. autoclass:: Unigram
:members:
.. autoclass:: Bigram
:members:
.. autoclass:: Trigram
:members:
.. autofunction:: pythainlp.generate.thai2fit.gen_sentence
58 changes: 58 additions & 0 deletions pythainlp/corpus/oscar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
"""
Thai unigram word frequencies from the OSCAR corpus (words tokenized with ICU)

Credit: Korakot Chaovavanich
https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/
"""

# Public API of this module: the names exported by a star-import.
__all__ = [
"word_freqs",
"unigram_word_freqs"
]

from collections import defaultdict
from typing import List, Tuple

from pythainlp.corpus import get_corpus_path

# Corpus name handed to get_corpus_path() by both loaders in this module.
_FILENAME = "oscar_icu"


def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from OSCAR Corpus (icu word tokenize)

    The corpus file is read as comma-separated lines. The first line is
    treated as a header and skipped. Only the line terminator is removed
    before splitting, so a word field that is a single space stays
    recognisable and is reported as ``"<s/>"``. Lines with fewer than two
    fields, and lines whose word field contains a double quote, are
    ignored.

    :return: (word, frequency) pairs in the order they appear in the file
    :rtype: List[Tuple[str, int]]
    """
    freqs = []
    _path = get_corpus_path(_FILENAME)
    with open(_path, "r", encoding="utf-8") as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            # Deliberately not strip(): that would also remove a leading
            # space, turning the " " word field into "" and making the
            # "<s/>" branch below unreachable.
            fields = line.rstrip("\r\n").split(",")
            if len(fields) < 2:
                continue
            word = fields[0]
            if word == " ":
                freqs.append(("<s/>", int(fields[1])))
            elif '"' not in word:
                freqs.append((word, int(fields[1])))

    return freqs


def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from OSCAR Corpus (icu word tokenize)

    The corpus file is read as comma-separated lines. The first line is
    treated as a header and skipped. Only the line terminator is removed
    before splitting, so a word field that is a single space stays
    recognisable and is stored under the key ``"<s/>"``. Lines with fewer
    than two fields, and lines whose word field contains a double quote,
    are ignored. The frequency is taken from the last field of the line.

    :return: mapping of word to frequency; missing words default to 0
    :rtype: defaultdict
    """
    freqs = defaultdict(int)
    _path = get_corpus_path(_FILENAME)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        # Skip the header row; the default keeps an empty file from raising.
        next(fh, None)
        for line in fh:
            # Deliberately not strip(): that would also remove a leading
            # space, turning the " " word field into "" and making the
            # "<s/>" branch below unreachable.
            fields = line.rstrip("\r\n").split(",")
            # A blank or comma-less line has no frequency field; parsing it
            # would make int() fail on the word itself.
            if len(fields) < 2:
                continue
            word = fields[0]
            if word == " ":
                freqs["<s/>"] = int(fields[-1])
            elif '"' not in word:
                freqs[word] = int(fields[-1])

    return freqs
59 changes: 55 additions & 4 deletions pythainlp/corpus/tnc.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
# -*- coding: utf-8 -*-
"""
Thai National Corpus word frequency

Credit: Korakot Chaovavanich
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
"""

__all__ = ["word_freqs"]
# Public API of this module: the names exported by a star-import.
__all__ = [
"word_freqs",
"unigram_word_freqs",
"bigram_word_freqs",
"trigram_word_freqs"
]

from collections import defaultdict
from typing import List, Tuple

from pythainlp.corpus import get_corpus
from pythainlp.corpus import get_corpus_path


# Corpus file read through get_corpus() by word_freqs() and
# unigram_word_freqs().
_FILENAME = "tnc_freq.txt"
# Corpus name resolved through get_corpus_path() by bigram_word_freqs().
_BIGRAM = "tnc_bigram_word_freqs"
# Corpus name resolved through get_corpus_path() by trigram_word_freqs().
_TRIGRAM = "tnc_trigram_word_freqs"


def word_freqs() -> List[Tuple[str, int]]:
"""
Get word frequency from Thai National Corpus (TNC)
\n(See: `dev/pythainlp/corpus/tnc_freq.txt\
<https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_)

Credit: Korakot Chaovavanich https://bit.ly/3wSkZsF
"""
lines = list(get_corpus(_FILENAME))
word_freqs = []
Expand All @@ -29,3 +38,45 @@ def word_freqs() -> List[Tuple[str, int]]:
word_freqs.append((word_freq[0], int(word_freq[1])))

return word_freqs


def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from Thai National Corpus (TNC)

    Each corpus entry is stripped and split on the separator; the first
    field is used as the word and the last field as its frequency. Entries
    with fewer than two fields are ignored.

    :return: mapping of word to frequency; missing words default to 0
    :rtype: defaultdict
    """
    freq_table = defaultdict(int)
    for entry in get_corpus(_FILENAME):
        fields = entry.strip().split(" ")
        if len(fields) < 2:
            continue
        freq_table[fields[0]] = int(fields[-1])

    return freq_table


def bigram_word_freqs() -> defaultdict:
    """
    Get bigram word frequency from Thai National Corpus (TNC)

    Each line of the corpus file is stripped and split on the separator.
    The first two fields form the bigram key and the last field is its
    frequency. Lines with fewer than three fields (for example a blank
    trailing line) are ignored, matching how the unigram loader skips
    short lines.

    :return: mapping of (word1, word2) to frequency; missing bigrams
             default to 0
    :rtype: defaultdict
    """
    _path = get_corpus_path(_BIGRAM)
    freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        for line in fh:
            fields = line.strip().split(" ")
            # Two words plus a count are required. With fewer fields the
            # count would be mistaken for a word, or indexing would fail.
            if len(fields) < 3:
                continue
            freqs[(fields[0], fields[1])] = int(fields[-1])

    return freqs


def trigram_word_freqs() -> defaultdict:
    """
    Get trigram word frequency from Thai National Corpus (TNC)

    Each line of the corpus file is stripped and split on the separator.
    The first three fields form the trigram key and the last field is its
    frequency. Lines with fewer than four fields (for example a blank
    trailing line) are ignored, matching how the unigram loader skips
    short lines.

    :return: mapping of (word1, word2, word3) to frequency; missing
             trigrams default to 0
    :rtype: defaultdict
    """
    _path = get_corpus_path(_TRIGRAM)
    freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        for line in fh:
            fields = line.strip().split(" ")
            # Three words plus a count are required. With fewer fields the
            # count would be mistaken for a word, or indexing would fail.
            if len(fields) < 4:
                continue
            freqs[(fields[0], fields[1], fields[2])] = int(fields[-1])

    return freqs
20 changes: 19 additions & 1 deletion pythainlp/corpus/ttc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
"""

__all__ = ["word_freqs"]
# Public API of this module: the names exported by a star-import.
__all__ = [
"word_freqs",
"unigram_word_freqs"
]

from collections import defaultdict
from typing import List, Tuple

from pythainlp.corpus import get_corpus
Expand All @@ -29,3 +33,17 @@ def word_freqs() -> List[Tuple[str, int]]:
word_freqs.append((word_freq[0], int(word_freq[1])))

return word_freqs


def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from Thai Textbook Corpus (TTC)

    Each corpus entry is stripped and split on the separator; the first
    field is used as the word and the last field as its frequency. Entries
    with fewer than two fields are ignored.

    :return: mapping of word to frequency; missing words default to 0
    :rtype: defaultdict
    """
    freq_table = defaultdict(int)
    for entry in get_corpus(_FILENAME):
        fields = entry.strip().split(" ")
        if len(fields) < 2:
            continue
        freq_table[fields[0]] = int(fields[-1])

    return freq_table
12 changes: 12 additions & 0 deletions pythainlp/generate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
"""
Thai text generation
"""

# Public API of this package: the generator classes imported from
# pythainlp.generate.core below.
__all__ = [
"Unigram",
"Bigram",
"Trigram"
]

from pythainlp.generate.core import Unigram, Bigram, Trigram
Loading