Merged
Commits
34 commits
43d2e55 Add wangchanberta (wannaphong, Feb 26, 2021)
8dd0f21 Update wangchanberta.py (wannaphong, Feb 26, 2021)
fc20b4b fixed IOB (wannaphong, Feb 26, 2021)
7837725 Update wangchanberta.py (wannaphong, Feb 26, 2021)
f1548f6 Update wangchanberta.py (wannaphong, Feb 26, 2021)
ca551d2 Add wangchanberta.PosTagTransformers (wannaphong, Feb 26, 2021)
266d8f6 Move file to pythainlp.wangchanberta (wannaphong, Feb 26, 2021)
df20fd2 Update wangchanberta requirements (wannaphong, Feb 26, 2021)
c94f241 Update postag.py (wannaphong, Feb 26, 2021)
e4f7ba1 Update core.py (wannaphong, Feb 26, 2021)
79f0b83 Update core.py (wannaphong, Feb 26, 2021)
ca81865 Update core.py (wannaphong, Feb 26, 2021)
c4c1c4c Add test (wannaphong, Feb 26, 2021)
a57bd4a Update test (wannaphong, Feb 26, 2021)
68bf050 Update test_wangchanberta.py (wannaphong, Feb 26, 2021)
757f9ac Add pythainlp.wangchanberta docs (wannaphong, Feb 26, 2021)
628cf50 Update tokenize.rst (wannaphong, Feb 27, 2021)
8abe1b4 Fixed PEP8 (wannaphong, Mar 1, 2021)
8d1cbdb Update test_wangchanberta.py (wannaphong, Mar 1, 2021)
4827c7d Update tests (wannaphong, Mar 1, 2021)
1e4a0d4 Fixed PEP8 (wannaphong, Mar 5, 2021)
8de00f7 Fixed PEP8 (wannaphong, Mar 5, 2021)
f1b0a0e Update core.py (wannaphong, Mar 6, 2021)
f8a0efa Update core.py (wannaphong, Mar 6, 2021)
f5ae3ad Update core (wannaphong, Mar 9, 2021)
00b2753 Fixed PEP8 (wannaphong, Mar 11, 2021)
c637c8a Update core.py (wannaphong, Mar 11, 2021)
f8d438a Update core.py (wannaphong, Mar 11, 2021)
87b6119 Update core.py (wannaphong, Mar 11, 2021)
9e04a18 Update wangchanberta.rst (wannaphong, Mar 11, 2021)
34a034a Update pos_tag docs (wannaphong, Mar 13, 2021)
5bbbebe Update pos_tag.py (wannaphong, Mar 13, 2021)
e4cf8da Add pythainlp.wangchanberta Speed Benchmark (wannaphong, Mar 15, 2021)
ff6d300 Update docs (wannaphong, Mar 15, 2021)
2 changes: 2 additions & 0 deletions docs/api/tokenize.rst
@@ -79,3 +79,5 @@ tcc
etcc
++++
.. automodule:: pythainlp.tokenize.etcc

.. autofunction:: pythainlp.tokenize.etcc.segment
2 changes: 2 additions & 0 deletions docs/api/ulmfit.rst
@@ -3,6 +3,8 @@
pythainlp.ulmfit
====================================

Universal Language Model Fine-tuning for Text Classification (ULMFiT).

Modules
-------
.. autoclass:: ThaiTokenizer
42 changes: 42 additions & 0 deletions docs/api/wangchanberta.rst
@@ -0,0 +1,42 @@
.. currentmodule:: pythainlp.wangchanberta

pythainlp.wangchanberta
=======================

WangchanBERTa base model: wangchanberta-base-att-spm-uncased [#Lowphansirikul_2021]_

We use WangchanBERTa for the Thai named-entity recognition task, part-of-speech tagging, and subword tokenization.

**Speed Benchmark**

============================= ======================== ==============
Function Named Entity Recognition Part of Speech
============================= ======================== ==============
PyThaiNLP basic function 89.7 ms 312 ms
pythainlp.wangchanberta (CPU) 9.64 s 9.65 s
pythainlp.wangchanberta (GPU) 8.02 s 8 s
============================= ======================== ==============

Notebook:

- `PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google
Colab`_
- `pythainlp.wangchanberta GPU`_

.. _PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google Colab: https://colab.research.google.com/drive/1ymTVB1UESXAyZlSpjknCb72xpdcZ86Db?usp=sharing
.. _pythainlp.wangchanberta GPU: https://colab.research.google.com/drive/1AtkFT1HMGL2GO7O2tM_hi_7mExKwmhMw?usp=sharing
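
A minimal sketch of how such timings could be reproduced (the sample sentence,
repetition count, and use of ``timeit`` here are illustrative assumptions, not
the exact notebook setup)::

    import timeit

    text = "ฉันชื่อนายสมชาย ใจดี อยู่ที่กรุงเทพ"  # any Thai sentence

    # Named-entity recognition with pythainlp.wangchanberta
    ner_setup = (
        "from pythainlp.wangchanberta import ThaiNameTagger; "
        "ner = ThaiNameTagger()"
    )
    print(timeit.timeit("ner.get_ner(%r)" % text, setup=ner_setup, number=10) / 10)

    # Part-of-speech tagging with pythainlp.wangchanberta
    pos_setup = "from pythainlp.wangchanberta import pos_tag"
    print(timeit.timeit("pos_tag(%r)" % text, setup=pos_setup, number=10) / 10)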

Modules
-------
.. autoclass:: ThaiNameTagger
:members:
.. autofunction:: pos_tag
.. autofunction:: segment
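
A quick usage sketch of these modules (the sentences are illustrative; outputs
are omitted because they depend on the fine-tuned models downloaded at first
use)::

    from pythainlp.wangchanberta import ThaiNameTagger, pos_tag, segment

    ner = ThaiNameTagger(dataset_name="thainer")
    ner.get_ner("ฉันชื่อนายสมชาย ใจดี")            # list of (word, NER tag) tuples
    ner.get_ner("ฉันชื่อนายสมชาย ใจดี", tag=True)  # HTML-like tagged string

    pos_tag("ฉันรักประเทศไทย")   # list of (word, POS tag) tuples
    segment("ฉันรักประเทศไทย")   # list of SentencePiece subwords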

References
----------

.. [#Lowphansirikul_2021] Lowphansirikul L, Polpanumas C, Jantrakulchai N, Nutanong S.
WangchanBERTa: Pretraining transformer-based Thai Language Models.
arXiv:2101.09635 [cs] [Internet]. 2021 Jan 23 [cited 2021 Feb 27];
Available from: http://arxiv.org/abs/2101.09635
8 changes: 7 additions & 1 deletion pythainlp/tag/pos_tag.py
@@ -12,11 +12,14 @@ def pos_tag(
:param str engine:
* *perceptron* - perceptron tagger (default)
* *unigram* - unigram tagger
* *wangchanberta* - wangchanberta model (supports the lst20 corpus only \
and takes a string input; if a list of words is given, it will be \
joined into a single string)
:param str corpus:
the corpus used to create the language model for the tagger
* *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
by National Electronics and Computer Technology Center, Thailand
* *lst20_ud* - LST20 text, with tags mapped to Universal POS tags \
* *lst20_ud* - LST20 text, with tags mapped to Universal POS tag \
from `Universal Dependencies <https://universaldependencies.org/>`
* *orchid* - `ORCHID \
<https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \
@@ -88,6 +91,9 @@ def pos_tag(

if engine == "perceptron":
from pythainlp.tag.perceptron import tag as tag_
elif engine == "wangchanberta" and corpus == "lst20":
from pythainlp.wangchanberta.postag import pos_tag as tag_
words = ''.join(words)
else: # default, use "unigram" ("old") engine
from pythainlp.tag.unigram import tag as tag_

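
A short sketch of the new dispatch path, assuming the wangchanberta
dependencies are installed (the sentence is illustrative and outputs are
omitted, since they depend on the downloaded model)::

    from pythainlp.tag import pos_tag

    # A list of words is joined into one string before being passed to the
    # wangchanberta tagger, so both calls take the same path.
    pos_tag("ฉันรักประเทศไทย", engine="wangchanberta", corpus="lst20")
    pos_tag(["ฉัน", "รัก", "ประเทศไทย"], engine="wangchanberta", corpus="lst20")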
16 changes: 15 additions & 1 deletion pythainlp/tokenize/core.py
@@ -301,6 +301,7 @@ def subword_tokenize(
**Options for engine**
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *wangchanberta* - SentencePiece tokenizer from the wangchanberta model.

:Example:

@@ -320,7 +321,7 @@
# output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก',
'และ', 'พัฒ','นา', 'กา', 'ร']

Tokenize text into subword based on *etcc* **(Work In Progress)**::
Tokenize text into subword based on *etcc*::

text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
text_2 = "ความแปลกแยกและพัฒนาการ"
@@ -330,6 +331,17 @@

subword_tokenize(text_2, engine='etcc')
# output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']

Tokenize text into subword based on *wangchanberta*::

text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
text_2 = "ความแปลกแยกและพัฒนาการ"

subword_tokenize(text_1, engine='wangchanberta')
# output: ['▁', 'ยุค', 'เริ่มแรก', 'ของ', '▁', 'ราชวงศ์', 'หมิง']

subword_tokenize(text_2, engine='wangchanberta')
# output: ['▁ความ', 'แปลก', 'แยก', 'และ', 'พัฒนาการ']
"""
if not text or not isinstance(text, str):
return []
@@ -338,6 +350,8 @@
from pythainlp.tokenize.tcc import segment
elif engine == "etcc":
from pythainlp.tokenize.etcc import segment
elif engine == "wangchanberta":
from pythainlp.wangchanberta import segment
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
8 changes: 8 additions & 0 deletions pythainlp/wangchanberta/__init__.py
@@ -0,0 +1,8 @@
__all__ = [
"ThaiNameTagger",
"pos_tag",
"segment",
]

from pythainlp.wangchanberta.core import ThaiNameTagger, segment
from pythainlp.wangchanberta.postag import pos_tag
144 changes: 144 additions & 0 deletions pythainlp/wangchanberta/core.py
@@ -0,0 +1,144 @@
from typing import Dict, List, Tuple, Union
import re
from transformers import (
CamembertTokenizer,
AutoTokenizer,
pipeline,
)

_model_name = "wangchanberta-base-att-spm-uncased"
_tokenizer = CamembertTokenizer.from_pretrained(
f'airesearch/{_model_name}',
revision='main')
if _model_name == "wangchanberta-base-att-spm-uncased":
_tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']


class ThaiNameTagger:
def __init__(
self,
dataset_name: str = "thainer",
grouped_entities: bool = True
):
"""
This class tags named entities in text, in the IOB format.

Powered by wangchanberta from VISTEC-depa\
AI Research Institute of Thailand

:param str dataset_name:
* *thainer* - ThaiNER dataset
* *lst20* - LST20 Corpus
:param bool grouped_entities: if True, group the model's subword pieces into whole entity spans (default is True)
"""
self.dataset_name = dataset_name
self.grouped_entities = grouped_entities
self.classify_tokens = pipeline(
task='ner',
tokenizer=_tokenizer,
model=f'airesearch/{_model_name}',
revision=f'finetuned@{self.dataset_name}-ner',
ignore_labels=[],
grouped_entities=self.grouped_entities)

def _IOB(self, tag):
if tag != "O":
return "B-"+tag
return "O"

def _clear_tag(self, tag):
return tag.replace('B-', '').replace('I-', '')

def get_ner(
self, text: str, tag: bool = False
) -> Union[List[Tuple[str, str]], str]:
"""
This function tags named entities in text, in the IOB format.

Powered by wangchanberta from VISTEC-depa\
AI Research Institute of Thailand
:param str text: text in Thai to be tagged
:param bool tag: if True, return the output as an HTML-like tagged string
:return: an HTML-like tagged string if the parameter `tag` is `True`;
otherwise, a list of tuples of tokenized word and NER tag
:rtype: Union[List[Tuple[str, str]], str]
"""
text = re.sub(" ", "<_>", text)
self.json_ner = self.classify_tokens(text)
self.output = ""
if self.grouped_entities and self.dataset_name == "thainer":
self.sent_ner = [
(
i['word'].replace("<_>", " ").replace('▁', ''),
self._IOB(i['entity_group'])
) for i in self.json_ner
]
elif self.dataset_name == "thainer":
self.sent_ner = [
(
i['word'].replace("<_>", " ").replace('▁', ''), i['entity']
) for i in self.json_ner if i['word'] != '▁'
]
elif self.grouped_entities and self.dataset_name == "lst20":
self.sent_ner = [
(
i['word'].replace("<_>", " ").replace('▁', ''),
i['entity_group'].replace('_', '-').replace('E-', 'I-')
) for i in self.json_ner
]
else:
self.sent_ner = [
(
i['word'].replace("<_>", " ").replace('▁', ''),
i['entity'].replace('_', '-').replace('E-', 'I-')
) for i in self.json_ner
]
if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1:
self.sent_ner = self.sent_ner[1:]
for idx, (word, ner) in enumerate(self.sent_ner):
if idx > 0 and ner.startswith("B-"):
if (
self._clear_tag(ner) == self._clear_tag(
self.sent_ner[idx-1][1]
)
):
self.sent_ner[idx] = (word, ner.replace('B-', 'I-'))
if tag:
temp = ""
sent = ""
for idx, (word, ner) in enumerate(self.sent_ner):
if ner.startswith("B-") and temp != "":
sent += "</" + temp + ">"
temp = ner[2:]
sent += "<" + temp + ">"
elif ner.startswith("B-"):
temp = ner[2:]
sent += "<" + temp + ">"
elif ner == "O" and temp != "":
sent += "</" + temp + ">"
temp = ""
sent += word

if idx == len(self.sent_ner) - 1 and temp != "":
sent += "</" + temp + ">"

return sent
else:
return self.sent_ner


def segment(text: str) -> List[str]:
"""
Subword tokenization using the SentencePiece tokenizer from the wangchanberta model.

:param str text: text to be tokenized
:return: list of subwords
:rtype: list[str]
"""
if not text or not isinstance(text, str):
return []

return _tokenizer.tokenize(text)
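
A minimal usage sketch of this module (the sentences are illustrative; tag
values depend on the fine-tuned model, so only the return shapes are noted).
As the code above shows, a consecutive ``B-`` label of the same class as the
previous tag is rewritten to ``I-`` so that the result stays valid IOB::

    from pythainlp.wangchanberta.core import ThaiNameTagger, segment

    # Whole-entity spans, ThaiNER label set
    ner = ThaiNameTagger(dataset_name="thainer", grouped_entities=True)
    ner.get_ner("ฉันชื่อนายสมชาย ใจดี")            # [(text span, IOB tag), ...]
    ner.get_ner("ฉันชื่อนายสมชาย ใจดี", tag=True)  # HTML-like tagged string

    # Per-subword labels, LST20 label set
    ner_lst20 = ThaiNameTagger(dataset_name="lst20", grouped_entities=False)
    ner_lst20.get_ner("ฉันชื่อนายสมชาย ใจดี")

    segment("ฉันรักประเทศไทย")  # list of SentencePiece subwords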
101 changes: 101 additions & 0 deletions pythainlp/wangchanberta/postag.py
@@ -0,0 +1,101 @@
from typing import Dict, List, Tuple, Union
import re
from transformers import (
CamembertTokenizer,
AutoTokenizer,
pipeline,
)

_model_name = "wangchanberta-base-att-spm-uncased"
_tokenizer = CamembertTokenizer.from_pretrained(
f'airesearch/{_model_name}',
revision='main')
if _model_name == "wangchanberta-base-att-spm-uncased":
_tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']


class PosTagTransformers:
def __init__(
self,
corpus: str = "lst20",
grouped_word: bool = False
) -> None:
self.corpus = corpus
self.grouped_word = grouped_word
self.load()

def load(self):
self.classify_tokens = pipeline(
task='ner',
tokenizer=_tokenizer,
model=f'airesearch/{_model_name}',
revision=f'finetuned@{self.corpus}-pos',
ignore_labels=[],
grouped_entities=self.grouped_word
)

def tag(
self, text: str, corpus: str = "lst20", grouped_word: bool = False
) -> List[Tuple[str, str]]:
if (
corpus != self.corpus and corpus in ['lst20']
) or grouped_word != self.grouped_word:
self.grouped_word = grouped_word
self.corpus = corpus
self.load()
text = re.sub(" ", "<_>", text)
self.json_pos = self.classify_tokens(text)
self.output = ""
if grouped_word:
self.sent_pos = [
(
i['word'].replace("<_>", " "), i['entity_group']
) for i in self.json_pos
]
else:
self.sent_pos = [
(
i['word'].replace("<_>", " ").replace('▁', ''),
i['entity']
)
for i in self.json_pos if i['word'] != '▁'
]
return self.sent_pos


_corpus = "lst20"
_grouped_word = False
_postag = PosTagTransformers(corpus=_corpus, grouped_word=_grouped_word)


def pos_tag(
text: str, corpus: str = "lst20", grouped_word: bool = False
) -> List[Tuple[str, str]]:
"""
Marks words with part-of-speech (POS) tags.

:param str text: Thai text to be tagged
:param str corpus:
* *lst20* - a LST20 tagger (default)
:param bool grouped_word: if True, merge consecutive subword pieces that share a tag (default is False)
:return: a list of tuples (word, POS tag)
:rtype: list[tuple[str, str]]
"""
global _grouped_word, _postag
if isinstance(text, list):
text = ''.join(text)
elif not text or not isinstance(text, str):
return []
if corpus not in ["lst20"]:
raise NotImplementedError()
if _grouped_word != grouped_word:
_postag = PosTagTransformers(
corpus=corpus,
grouped_word=grouped_word
)
_grouped_word = grouped_word
return _postag.tag(
text,
corpus=corpus,
grouped_word=grouped_word
)
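
A brief usage sketch of this module (the sentence is illustrative; outputs are
omitted because they depend on the downloaded model). ``grouped_word=True``
passes ``grouped_entities=True`` to the underlying pipeline, merging
consecutive pieces that share a tag, and the module keeps a cached
``PosTagTransformers`` instance that is rebuilt when that flag changes::

    from pythainlp.wangchanberta.postag import PosTagTransformers, pos_tag

    pos_tag("ฉันรักประเทศไทย")                     # (word, POS tag) per subword
    pos_tag("ฉันรักประเทศไทย", grouped_word=True)  # consecutive same-tag pieces merged

    # The class can also be instantiated directly, bypassing the cached instance.
    tagger = PosTagTransformers(corpus="lst20", grouped_word=False)
    tagger.tag("ฉันรักประเทศไทย")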