diff --git a/docs/api/tag.rst b/docs/api/tag.rst index 11d4b3a5c..87cf0a766 100644 --- a/docs/api/tag.rst +++ b/docs/api/tag.rst @@ -229,6 +229,7 @@ Modules .. autofunction:: pos_tag .. autofunction:: pos_tag_sents .. autofunction:: tag_provinces +.. autofunction:: chunk_parse .. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger :members: get_ner diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 09bf6248d..e038c3a3e 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -25,6 +25,7 @@ "thai_stopwords", "thai_syllables", "thai_words", + "path_pythainlp_corpus", ] import os @@ -81,6 +82,7 @@ def corpus_db_path() -> str: get_corpus_db_detail, get_corpus_path, remove, + path_pythainlp_corpus, ) # these imports must come before other pythainlp.corpus.* imports from pythainlp.corpus.common import ( countries, diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index 77d1d3978..e298c9339 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -55,6 +55,18 @@ def get_corpus_db_detail(name: str, version: str = None) -> dict: return dict() +def path_pythainlp_corpus(filename: str) -> str: + """ + Get the path of a corpus data file bundled with ``pythainlp.corpus`` + + :param str filename: filename of the corpus to be read + + :return: path of the corpus file + :rtype: str + """ + return os.path.join(corpus_path(), filename) + + def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]: """ Read corpus data from file and return a frozenset or a list. @@ -67,9 +79,6 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]: If as_is is True, a list will be return, with no modifications in member values and their orders. 
- (Please see the filename from - `this file - `_ :param str filename: filename of the corpus to be read @@ -115,6 +124,10 @@ def get_corpus_path(name: str, version : str = None) -> Union[str, None]: :Example: + (Please see the filename from + `this file + <https://pythainlp.github.io/pythainlp-corpus/db.json>`_) + If the corpus already exists:: from pythainlp.corpus import get_corpus_path diff --git a/pythainlp/corpus/crfchunk_orchidpp.model b/pythainlp/corpus/crfchunk_orchidpp.model new file mode 100644 index 000000000..b94b64e4f Binary files /dev/null and b/pythainlp/corpus/crfchunk_orchidpp.model differ diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 390188ba8..5dd10b784 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -16,3 +16,4 @@ from pythainlp.tag.locations import tag_provinces from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents from pythainlp.tag._tag_perceptron import PerceptronTagger +from pythainlp.tag.chunk import chunk_parse diff --git a/pythainlp/tag/chunk.py b/pythainlp/tag/chunk.py new file mode 100644 index 000000000..a7c299fb6 --- /dev/null +++ b/pythainlp/tag/chunk.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +from typing import Dict, List, Tuple + + +def chunk_parse( + sent: List[Tuple[str, str]], + engine="crf", corpus="orchidpp" +) -> List[str]: + """ + This function parses a Thai sentence into phrase structure tags in IOB format. 
+ + :param list sent: list [(word,part-of-speech)] + :param str engine: chunk parse engine (now, it has "crf" only; corpus: "orchidpp" only) + + :return: a list of IOB chunk tags, one per input token + :rtype: List[str] + """ + from .crfchunk import CRFchunk + _engine = CRFchunk() + return _engine.parse(sent) diff --git a/pythainlp/tag/crfchunk.py b/pythainlp/tag/crfchunk.py new file mode 100644 index 000000000..bebaf60ff --- /dev/null +++ b/pythainlp/tag/crfchunk.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +from typing import Dict, List, Tuple, Union +from pycrfsuite import Tagger as CRFTagger +from pythainlp.corpus import path_pythainlp_corpus, thai_stopwords + + +def _is_stopword(word: str) -> bool: # check if the word is a Thai stopword + return word in thai_stopwords() + + +def _doc2features(tokens: List[Tuple[str, str]], index: int) -> Dict: + """ + `tokens` = a POS-tagged sentence [(w1, t1), ...] + `index` = the index of the token we want to extract features for + """ + word, pos = tokens[index] + f = { + 'word': word, + 'word_is_stopword': _is_stopword(word), + 'pos': pos, + } + if index > 0 and index > 1: + prevprevword, prevprevpos = tokens[index - 2] + f['prev-prev-word'] = prevprevword + f['prev-prevz-word_is_stopword'] = _is_stopword(prevprevword) + f['prev-prevz-pos'] = prevprevpos + if index > 0: + prevword, prevpos = tokens[index-1] + f['prev-word'] = prevword + f['prev-word_is_stopword'] = _is_stopword(prevword) + f['prev-pos'] = prevpos + else: + f['BOS'] = True + if index < len(tokens)-2: + nextnextword, nextnextpos = tokens[index + 2] + f['nextnext-word'] = nextnextword + f['nextnext-word_is_stopword'] = _is_stopword(nextnextword) + f['nextnext-pos'] = nextnextpos + if index < len(tokens)-1: + nextword, nextpos = tokens[index+1] + f['next-word'] = nextword + f['next-word_is_stopword'] = _is_stopword(nextword) + f['next-pos'] = nextpos + else: + f['EOS'] = True + + return f + + +def extract_features(doc): + return [_doc2features(doc, i) for i in range(0, len(doc))] + + +class 
CRFchunk: + def __init__(self, corpus: str = "orchidpp"): + self.corpus = corpus + self.load_model(self.corpus) + + def load_model(self, corpus: str): + self.tagger = CRFTagger() + if corpus == "orchidpp": + self.path = path_pythainlp_corpus("crfchunk_orchidpp.model") + self.tagger.open(self.path) + + def parse(self, token_pos: List[Tuple[str, str]]): + self.xseq = extract_features(token_pos) + return self.tagger.tag(self.xseq) diff --git a/tests/test_tag.py b/tests/test_tag.py index bff5b3e37..467be8c58 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -4,6 +4,7 @@ from os import path from pythainlp.tag import ( + chunk_parse, PerceptronTagger, perceptron, pos_tag, @@ -15,6 +16,13 @@ class TestTagPackage(unittest.TestCase): + # ### pythainlp.tag.chunk + + def test_chunk_parse(self): + tokens = ["ผม", "รัก", "คุณ"] + + w_p = pos_tag(tokens, engine="perceptron", corpus="orchid") + self.assertIsNotNone(chunk_parse(w_p)) # ### pythainlp.tag.pos_tag