Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/tag.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ Modules
.. autofunction:: pos_tag
.. autofunction:: pos_tag_sents
.. autofunction:: tag_provinces
.. autofunction:: chunk_parse
.. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
:members: get_ner

Expand Down
2 changes: 2 additions & 0 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"thai_stopwords",
"thai_syllables",
"thai_words",
"path_pythainlp_corpus",
]

import os
Expand Down Expand Up @@ -81,6 +82,7 @@ def corpus_db_path() -> str:
get_corpus_db_detail,
get_corpus_path,
remove,
path_pythainlp_corpus,
) # these imports must come before other pythainlp.corpus.* imports
from pythainlp.corpus.common import (
countries,
Expand Down
19 changes: 16 additions & 3 deletions pythainlp/corpus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,18 @@ def get_corpus_db_detail(name: str, version: str = None) -> dict:
return dict()


def path_pythainlp_corpus(filename: str) -> str:
    """
    Get the full path of a corpus file bundled inside the
    ``pythainlp.corpus`` package data directory.

    :param str filename: filename of the corpus to be read

    :return: path of the corpus file
    :rtype: str
    """
    return os.path.join(corpus_path(), filename)


def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
"""
Read corpus data from file and return a frozenset or a list.
Expand All @@ -67,9 +79,6 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
If as_is is True, a list will be return, with no modifications
in member values and their orders.

(Please see the filename from
`this file
<https://pythainlp.github.io/pythainlp-corpus/db.json>`_

:param str filename: filename of the corpus to be read

Expand Down Expand Up @@ -115,6 +124,10 @@ def get_corpus_path(name: str, version : str = None) -> Union[str, None]:

:Example:

    (Please see the corpus names in
    `this file
    <https://pythainlp.github.io/pythainlp-corpus/db.json>`_.)

If the corpus already exists::

from pythainlp.corpus import get_corpus_path
Expand Down
Binary file added pythainlp/corpus/crfchunk_orchidpp.model
Binary file not shown.
1 change: 1 addition & 0 deletions pythainlp/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@
from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
from pythainlp.tag._tag_perceptron import PerceptronTagger
from pythainlp.tag.chunk import chunk_parse
20 changes: 20 additions & 0 deletions pythainlp/tag/chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
from typing import Dict, List, Tuple


def chunk_parse(
    sent: List[Tuple[str, str]],
    engine: str = "crf", corpus: str = "orchidpp"
) -> List[str]:
    """
    Parse a Thai POS-tagged sentence into phrase-structure chunks,
    returned as IOB-format chunk tags.

    :param list sent: list of (word, part-of-speech) tuples
    :param str engine: chunk-parse engine (only ``crf`` is available)
    :param str corpus: corpus the model was trained on
        (only ``orchidpp`` is available)

    :return: a list of IOB chunk tags, one per input token
    :rtype: List[str]
    """
    # Imported lazily so the CRF model is only loaded when needed.
    from .crfchunk import CRFchunk
    # Bug fix: `corpus` was previously ignored; forward it to the parser.
    _engine = CRFchunk(corpus=corpus)
    return _engine.parse(sent)
67 changes: 67 additions & 0 deletions pythainlp/tag/crfchunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
from typing import Dict, List, Tuple, Union
from pycrfsuite import Tagger as CRFTagger
from pythainlp.corpus import path_pythainlp_corpus, thai_stopwords


def _is_stopword(word: str) -> bool:
    """Return True if *word* is a Thai stopword."""
    stopwords = thai_stopwords()
    return word in stopwords


def _doc2features(tokens: List[Tuple[str, str]], index: int) -> Dict:
    """
    Extract CRF features for a single token of a POS-tagged sentence.

    :param tokens: a POS-tagged sentence ``[(w1, t1), (w2, t2), ...]``
    :param index: index of the token to extract features for
    :return: dict of feature name -> value for the token at ``index``
    """
    word, pos = tokens[index]
    f = {
        'word': word,
        'word_is_stopword': _is_stopword(word),
        'pos': pos,
    }
    # NOTE(review): the feature key names below — including the misspelled
    # 'prev-prevz-*' ones — must stay exactly as they were when the bundled
    # model was trained; renaming them would silently break the model.
    if index > 1:  # simplified from `index > 0 and index > 1` (equivalent)
        prevprevword, prevprevpos = tokens[index - 2]
        f['prev-prev-word'] = prevprevword
        f['prev-prevz-word_is_stopword'] = _is_stopword(prevprevword)
        f['prev-prevz-pos'] = prevprevpos
    if index > 0:
        prevword, prevpos = tokens[index - 1]
        f['prev-word'] = prevword
        f['prev-word_is_stopword'] = _is_stopword(prevword)
        f['prev-pos'] = prevpos
    else:
        f['BOS'] = True  # beginning of sentence
    if index < len(tokens) - 2:
        nextnextword, nextnextpos = tokens[index + 2]
        f['nextnext-word'] = nextnextword
        f['nextnext-word_is_stopword'] = _is_stopword(nextnextword)
        f['nextnext-pos'] = nextnextpos
    if index < len(tokens) - 1:
        nextword, nextpos = tokens[index + 1]
        f['next-word'] = nextword
        f['next-word_is_stopword'] = _is_stopword(nextword)
        f['next-pos'] = nextpos
    else:
        f['EOS'] = True  # end of sentence

    return f


def extract_features(doc):
    """Build the CRF feature sequence for a whole POS-tagged sentence."""
    return [_doc2features(doc, idx) for idx, _ in enumerate(doc)]


class CRFchunk:
    """CRF-based chunk parser backed by a pre-trained pycrfsuite model."""

    def __init__(self, corpus: str = "orchidpp"):
        """
        :param str corpus: corpus the model was trained on
            (only ``orchidpp`` is available)
        """
        self.corpus = corpus
        self.load_model(self.corpus)

    def load_model(self, corpus: str):
        """Open the pre-trained CRF model file for the given corpus."""
        self.tagger = CRFTagger()
        if corpus == "orchidpp":
            self.path = path_pythainlp_corpus("crfchunk_orchidpp.model")
            self.tagger.open(self.path)
        else:
            # Previously an unknown corpus silently left the tagger without
            # a model and failed later with a confusing error; fail fast.
            raise NotImplementedError(
                "CRFchunk does not support corpus: " + corpus
            )

    def parse(self, token_pos: List[Tuple[str, str]]) -> List[str]:
        """
        Tag a POS-tagged sentence with IOB chunk labels.

        :param token_pos: list of (word, part-of-speech) tuples
        :return: a list of chunk tags, one per input token
        """
        self.xseq = extract_features(token_pos)
        return self.tagger.tag(self.xseq)
8 changes: 8 additions & 0 deletions tests/test_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from os import path

from pythainlp.tag import (
chunk_parse,
PerceptronTagger,
perceptron,
pos_tag,
Expand All @@ -15,6 +16,13 @@


class TestTagPackage(unittest.TestCase):
# ### pythainlp.tag.PerceptronTagger

def test_chunk_parse(self):
tokens = ["ผม", "รัก", "คุณ"]

w_p = pos_tag(tokens, engine="perceptron", corpus="orchid")
self.assertIsNotNone(chunk_parse(w_p))

# ### pythainlp.tag.pos_tag

Expand Down