Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/tag.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ Modules
.. autofunction:: pos_tag
.. autofunction:: pos_tag_sents
.. autofunction:: tag_provinces
.. autofunction:: chunk_parse
.. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
:members: get_ner

Expand Down
2 changes: 2 additions & 0 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"thai_stopwords",
"thai_syllables",
"thai_words",
"path_pythainlp_corpus",
]

import os
Expand Down Expand Up @@ -81,6 +82,7 @@ def corpus_db_path() -> str:
get_corpus_db_detail,
get_corpus_path,
remove,
path_pythainlp_corpus,
) # these imports must come before other pythainlp.corpus.* imports
from pythainlp.corpus.common import (
countries,
Expand Down
19 changes: 16 additions & 3 deletions pythainlp/corpus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,18 @@ def get_corpus_db_detail(name: str, version: str = None) -> dict:
return dict()


def path_pythainlp_corpus(filename: str) -> str:
    """
    Get the full path of a corpus file bundled inside the
    ``pythainlp.corpus`` package data directory.

    :param str filename: filename of the corpus to be read

    :return: path of the corpus file
    :rtype: str
    """
    return os.path.join(corpus_path(), filename)


def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
"""
Read corpus data from file and return a frozenset or a list.
Expand All @@ -67,9 +79,6 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
If as_is is True, a list will be return, with no modifications
in member values and their orders.

(Please see the filename from
`this file
<https://pythainlp.github.io/pythainlp-corpus/db.json>`_

:param str filename: filename of the corpus to be read

Expand Down Expand Up @@ -115,6 +124,10 @@ def get_corpus_path(name: str, version : str = None) -> Union[str, None]:

:Example:

    (Please see the corpus names in
    `this file
    <https://pythainlp.github.io/pythainlp-corpus/db.json>`_.)

If the corpus already exists::

from pythainlp.corpus import get_corpus_path
Expand Down
Binary file added pythainlp/corpus/crfchunk_orchidpp.model
Binary file not shown.
1 change: 1 addition & 0 deletions pythainlp/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@
from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
from pythainlp.tag._tag_perceptron import PerceptronTagger
from pythainlp.tag.chunk import chunk_parse
20 changes: 20 additions & 0 deletions pythainlp/tag/chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
from typing import Dict, List, Tuple


def chunk_parse(
    sent: List[Tuple[str, str]],
    engine: str = "crf", corpus: str = "orchidpp"
) -> List[str]:
    """
    Parse a Thai POS-tagged sentence into phrase-structure chunks,
    returned as IOB-format chunk tags.

    :param list sent: list of (word, part-of-speech) tuples
    :param str engine: chunk-parse engine (only ``crf`` is available)
    :param str corpus: corpus the model was trained on
        (only ``orchidpp`` is available)

    :return: a list of IOB chunk tags, one per input token
    :rtype: List[str]
    """
    # Imported lazily so the CRF model is only loaded when needed.
    from .crfchunk import CRFchunk
    # Bug fix: `corpus` was previously ignored; forward it to the parser.
    _engine = CRFchunk(corpus=corpus)
    return _engine.parse(sent)
67 changes: 67 additions & 0 deletions pythainlp/tag/crfchunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
from typing import Dict, List, Tuple, Union
from pycrfsuite import Tagger as CRFTagger
from pythainlp.corpus import path_pythainlp_corpus, thai_stopwords


def _is_stopword(word: str) -> bool:
    """Return True if *word* is a Thai stopword."""
    stopwords = thai_stopwords()
    return word in stopwords


def _doc2features(tokens: List[Tuple[str, str]], index: int) -> Dict:
    """
    Extract CRF features for a single token of a POS-tagged sentence.

    :param tokens: a POS-tagged sentence ``[(w1, t1), (w2, t2), ...]``
    :param index: index of the token to extract features for
    :return: dict of feature name -> value for the token at ``index``
    """
    word, pos = tokens[index]
    f = {
        'word': word,
        'word_is_stopword': _is_stopword(word),
        'pos': pos,
    }
    # NOTE(review): the feature key names below — including the misspelled
    # 'prev-prevz-*' ones — must stay exactly as they were when the bundled
    # model was trained; renaming them would silently break the model.
    if index > 1:  # simplified from `index > 0 and index > 1` (equivalent)
        prevprevword, prevprevpos = tokens[index - 2]
        f['prev-prev-word'] = prevprevword
        f['prev-prevz-word_is_stopword'] = _is_stopword(prevprevword)
        f['prev-prevz-pos'] = prevprevpos
    if index > 0:
        prevword, prevpos = tokens[index - 1]
        f['prev-word'] = prevword
        f['prev-word_is_stopword'] = _is_stopword(prevword)
        f['prev-pos'] = prevpos
    else:
        f['BOS'] = True  # beginning of sentence
    if index < len(tokens) - 2:
        nextnextword, nextnextpos = tokens[index + 2]
        f['nextnext-word'] = nextnextword
        f['nextnext-word_is_stopword'] = _is_stopword(nextnextword)
        f['nextnext-pos'] = nextnextpos
    if index < len(tokens) - 1:
        nextword, nextpos = tokens[index + 1]
        f['next-word'] = nextword
        f['next-word_is_stopword'] = _is_stopword(nextword)
        f['next-pos'] = nextpos
    else:
        f['EOS'] = True  # end of sentence

    return f


def extract_features(doc):
    """Build the CRF feature sequence for a whole POS-tagged sentence."""
    return [_doc2features(doc, idx) for idx, _ in enumerate(doc)]


class CRFchunk:
    """CRF-based chunk parser backed by a pre-trained pycrfsuite model."""

    def __init__(self, corpus: str = "orchidpp"):
        """
        :param str corpus: corpus the model was trained on
            (only ``orchidpp`` is available)
        """
        self.corpus = corpus
        self.load_model(self.corpus)

    def load_model(self, corpus: str):
        """Open the pre-trained CRF model file for the given corpus."""
        self.tagger = CRFTagger()
        if corpus == "orchidpp":
            self.path = path_pythainlp_corpus("crfchunk_orchidpp.model")
            self.tagger.open(self.path)
        else:
            # Previously an unknown corpus silently left the tagger without
            # a model and failed later with a confusing error; fail fast.
            raise NotImplementedError(
                "CRFchunk does not support corpus: " + corpus
            )

    def parse(self, token_pos: List[Tuple[str, str]]) -> List[str]:
        """
        Tag a POS-tagged sentence with IOB chunk labels.

        :param token_pos: list of (word, part-of-speech) tuples
        :return: a list of chunk tags, one per input token
        """
        self.xseq = extract_features(token_pos)
        return self.tagger.tag(self.xseq)
8 changes: 8 additions & 0 deletions tests/test_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from os import path

from pythainlp.tag import (
chunk_parse,
PerceptronTagger,
perceptron,
pos_tag,
Expand All @@ -15,6 +16,13 @@


class TestTagPackage(unittest.TestCase):
# ### pythainlp.tag.PerceptronTagger

def test_chunk_parse(self):
tokens = ["ผม", "รัก", "คุณ"]

w_p = pos_tag(tokens, engine="perceptron", corpus="orchid")
self.assertIsNotNone(chunk_parse(w_p))

# ### pythainlp.tag.pos_tag

Expand Down