From 43d2e5503628c0a2ba266351a303b084da81f87c Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 16:41:41 +0700
Subject: [PATCH 01/34] Add wangchanberta

---
 pythainlp/tag/wangchanberta.py | 70 ++++++++++++++++++++++++++++++++++
 setup.py                       |  3 +-
 2 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 pythainlp/tag/wangchanberta.py
diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py
new file mode 100644
index 000000000..0feb32443
--- /dev/null
+++ b/pythainlp/tag/wangchanberta.py
@@ -0,0 +1,70 @@
+from typing import Dict, List, Tuple, Union
+import re
+from transformers import (
+    CamembertTokenizer,
+    AutoTokenizer,
+    pipeline,
+)
+
+class ThaiNameTagger:
+    def __init__(self,
+                dataset_name: str = "thainer",
+                model_name:str = "wangchanberta-base-att-spm-uncased"
+                ) -> None:
+        self.model_name = model_name
+        self.tokenizer = CamembertTokenizer.from_pretrained(
+                                    f'airesearch/{self.model_name}',
+                                    revision='main')
+        if self.model_name == "wangchanberta-base-att-spm-uncased":
+            self.tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
+        self.classify_tokens = pipeline(
+            task='ner',
+            tokenizer=self.tokenizer,
+            model = f'airesearch/{self.model_name}',
+            revision = f'finetuned@{dataset_name}-ner',
+            ignore_labels=[], 
+            grouped_entities=True
+        )
+
+    def get_ner(
+        self, text: str, tag: bool = False
+    ) -> List[Tuple[str, str]]:
+        """
+        This function tags named-entitiy from text in IOB format.
+
+        Powered by wangchanberta from VISTEC-depa AI Research Institute of Thailand
+        :param str text: text in Thai to be tagged
+        :param bool tag: output like html tag.
+        :return: a list of tuple associated with tokenized word group, NER tag,
+                 and output like html tag (if the parameter `tag` is
+                 specified as `True`).
+                 Otherwise, return a list of tuple associated with tokenized
+                 word and NER tag
+        :rtype: Union[list[tuple[str, str]]], str
+        """
+        text = re.sub(" ", "<_>", text)
+        self.json_ner = self.classify_tokens(text)
+        self.output = ""
+        self.sent_ner = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_ner]
+        if tag:
+            temp = ""
+            sent = ""
+            for idx, (word, ner) in enumerate(self.sent_ner):
+                if ner.startswith("B-") and temp != "":
+                    sent += "</" + temp + ">"
+                    temp = ner[2:]
+                    sent += "<" + temp + ">"
+                elif ner.startswith("B-"):
+                    temp = ner[2:]
+                    sent += "<" + temp + ">"
+                elif ner == "O" and temp != "":
+                    sent += "</" + temp + ">"
+                    temp = ""
+                sent += word
+
+                if idx == len(self.sent_ner) - 1 and temp != "":
+                    sent += "</" + temp + ">"
+
+            return sent
+        
+        return self.sent_ner
diff --git a/setup.py b/setup.py
index 208fa9f55..cec6f9a8b 100644
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@
         "sentencepiece>=0.1.91",
         "torch>=1.0.0",
     ],
+    "transformers": ["transformers"],
     "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"],
     "wordnet": ["nltk>=3.3.*"],
     "full": [
@@ -70,7 +71,7 @@
         "sentencepiece>=0.1.91",
         "ssg>=0.0.6",
         "torch>=1.0.0",
-        "transformers>=4.1.1"
+        "transformers>=4.1.1",
     ],
 }
 

From 8dd0f21f5530493eabfb7026203342be1e62eb19 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 16:46:19 +0700
Subject: [PATCH 02/34] Reformat ThaiNameTagger.__init__ signature

---
 pythainlp/tag/wangchanberta.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py
index 0feb32443..00e622578 100644
--- a/pythainlp/tag/wangchanberta.py
+++ b/pythainlp/tag/wangchanberta.py
@@ -6,11 +6,11 @@
     pipeline,
 )
 
+
 class ThaiNameTagger:
     def __init__(self,
                 dataset_name: str = "thainer",
-                model_name:str = "wangchanberta-base-att-spm-uncased"
-                ) -> None:
+                model_name: str = "wangchanberta-base-att-spm-uncased"):
         self.model_name = model_name
         self.tokenizer = CamembertTokenizer.from_pretrained(
                                     f'airesearch/{self.model_name}',

From fc20b4b8544a2018e47e147be46852f2ce33ab8c Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 17:02:57 +0700
Subject: [PATCH 03/34] Prefix grouped NER labels with B- via new IOB() helper

---
 pythainlp/tag/wangchanberta.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py
index 00e622578..bed608557 100644
--- a/pythainlp/tag/wangchanberta.py
+++ b/pythainlp/tag/wangchanberta.py
@@ -25,6 +25,11 @@ def __init__(self,
             ignore_labels=[], 
             grouped_entities=True
         )
+    
+    def IOB(self, tag):
+        if tag != "O":
+            return "B-"+tag
+        return "O"
 
     def get_ner(
         self, text: str, tag: bool = False
@@ -45,7 +50,7 @@ def get_ner(
         text = re.sub(" ", "<_>", text)
         self.json_ner = self.classify_tokens(text)
         self.output = ""
-        self.sent_ner = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_ner]
+        self.sent_ner = [(i['word'].replace("<_>", " "),self.IOB(i['entity_group'])) for i in self.json_ner]
         if tag:
             temp = ""
             sent = ""

From 78377251f5fec2ce4b961b37697feafc2395ddd2 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 17:13:48 +0700
Subject: [PATCH 04/34] Add grouped_entities option to ThaiNameTagger

fixed grouped_entities
---
 pythainlp/tag/wangchanberta.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py
index bed608557..53cfa8572 100644
--- a/pythainlp/tag/wangchanberta.py
+++ b/pythainlp/tag/wangchanberta.py
@@ -10,8 +10,11 @@
 class ThaiNameTagger:
     def __init__(self,
                 dataset_name: str = "thainer",
-                model_name: str = "wangchanberta-base-att-spm-uncased"):
+                model_name: str = "wangchanberta-base-att-spm-uncased",
+                grouped_entities: bool = True):
         self.model_name = model_name
+        self.dataset_name = dataset_name
+        self.grouped_entities = grouped_entities
         self.tokenizer = CamembertTokenizer.from_pretrained(
                                     f'airesearch/{self.model_name}',
                                     revision='main')
@@ -21,9 +24,9 @@ def __init__(self,
             task='ner',
             tokenizer=self.tokenizer,
             model = f'airesearch/{self.model_name}',
-            revision = f'finetuned@{dataset_name}-ner',
+            revision = f'finetuned@{self.dataset_name}-ner',
             ignore_labels=[], 
-            grouped_entities=True
+            grouped_entities=self.grouped_entities
         )
     
     def IOB(self, tag):
@@ -50,7 +53,10 @@ def get_ner(
         text = re.sub(" ", "<_>", text)
         self.json_ner = self.classify_tokens(text)
         self.output = ""
-        self.sent_ner = [(i['word'].replace("<_>", " "),self.IOB(i['entity_group'])) for i in self.json_ner]
+        if self.grouped_entities:
+            self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'])) for i in self.json_ner]
+        else:
+            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner]
         if tag:
             temp = ""
             sent = ""

From f1548f67665d9dfda2576925775449a4c4ef2c70 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 17:15:52 +0700
Subject: [PATCH 05/34] Filter out bare U+2581 tokens from ungrouped NER output

---
 pythainlp/tag/wangchanberta.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py
index 53cfa8572..bf80154ce 100644
--- a/pythainlp/tag/wangchanberta.py
+++ b/pythainlp/tag/wangchanberta.py
@@ -56,7 +56,7 @@ def get_ner(
         if self.grouped_entities:
             self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'])) for i in self.json_ner]
         else:
-            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner]
+            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁']
         if tag:
             temp = ""
             sent = ""

From ca551d2c42b73a9383ba06accf3fad45f43bb8d8 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 19:36:07 +0700
Subject: [PATCH 06/34] Add wangchanberta.PosTagTransformers

---
 pythainlp/tag/pos_tag.py       |  4 +++
 pythainlp/tag/wangchanberta.py | 52 ++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index da7865a68..87dd353b9 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -12,6 +12,7 @@ def pos_tag(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
         * *unigram* - unigram tagger
+        * *wangchanberta* - wangchanberta model (support lst20 corpus only)
     :param str corpus:
         the corpus that used to create the language model for tagger
         * *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
@@ -88,6 +89,9 @@ def pos_tag(
 
     if engine == "perceptron":
         from pythainlp.tag.perceptron import tag as tag_
+    elif engine == "wangchanberta" and corpus == "lst20":
+        from pythainlp.tag.wangchanberta import wangchanberta_pos_tag as tag_
+        words = ''.join(words)
     else:  # default, use "unigram" ("old") engine
         from pythainlp.tag.unigram import tag as tag_
 
diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py
index bf80154ce..87f574084 100644
--- a/pythainlp/tag/wangchanberta.py
+++ b/pythainlp/tag/wangchanberta.py
@@ -79,3 +79,55 @@ def get_ner(
             return sent
         
         return self.sent_ner
+
+
+class PosTagTransformers:
+    def __init__(self,
+                corpus: str = "lst20",
+                model_name:str = "wangchanberta-base-att-spm-uncased",
+                grouped_word: bool = False
+                ) -> None:
+        self.model_name = model_name
+        self.corpus = corpus
+        self.grouped_word = grouped_word
+        self.tokenizer = CamembertTokenizer.from_pretrained(
+                                    f'airesearch/{self.model_name}',
+                                    revision='main')
+        if self.model_name == "wangchanberta-base-att-spm-uncased":
+            self.tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
+        self.load()
+
+    def load(self):
+        self.classify_tokens = pipeline(
+            task='ner',
+            tokenizer=self.tokenizer,
+            model = f'airesearch/{self.model_name}',
+            revision = f'finetuned@{self.corpus}-pos',
+            ignore_labels=[], 
+            grouped_entities=self.grouped_word
+        )
+
+    def tag(
+        self, text: str, corpus: str = False, grouped_word: bool = False
+    ) -> List[Tuple[str, str]]:
+        if (corpus != self.corpus and corpus in ['lst20']) or grouped_word != self.grouped_word:
+            self.grouped_word = grouped_word
+            self.corpus = corpus
+            self.load()
+        text = re.sub(" ", "<_>", text)
+        self.json_pos = self.classify_tokens(text)
+        self.output = ""
+        if grouped_word:
+            self.sent_pos = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_pos]
+        else:
+            self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁']
+        return self.sent_pos
+
+
+def wangchanberta_pos_tag(
+    text: str, corpus: str = "lst20", grouped_word = False
+) -> List[Tuple[str, str]]:
+    if corpus not in ["lst20"]:
+        raise NotImplementedError()
+    _tag = PosTagTransformers(corpus=corpus, grouped_word = grouped_word)
+    return _tag.tag(text)
\ No newline at end of file

From 266d8f6bbedd4db0157f76c9eeddd9cbd9972a48 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 20:13:03 +0700
Subject: [PATCH 07/34] Move file to pythainlp.wangchanberta

---
 pythainlp/tag/pos_tag.py                      |  2 +-
 pythainlp/tokenize/core.py                    |  3 +
 pythainlp/wangchanberta/__init__.py           |  8 +++
 .../core.py}                                  | 39 ++++-------
 pythainlp/wangchanberta/postag.py             | 65 +++++++++++++++++++
 5 files changed, 90 insertions(+), 27 deletions(-)
 create mode 100644 pythainlp/wangchanberta/__init__.py
 rename pythainlp/{tag/wangchanberta.py => wangchanberta/core.py} (73%)
 create mode 100644 pythainlp/wangchanberta/postag.py

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index 87dd353b9..8ca0eee2f 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -90,7 +90,7 @@ def pos_tag(
     if engine == "perceptron":
         from pythainlp.tag.perceptron import tag as tag_
     elif engine == "wangchanberta" and corpus == "lst20":
-        from pythainlp.tag.wangchanberta import wangchanberta_pos_tag as tag_
+        import pythainlp.wangchanberta.pos_tag as tag_
         words = ''.join(words)
     else:  # default, use "unigram" ("old") engine
         from pythainlp.tag.unigram import tag as tag_
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index b615deaac..5304cd8b8 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -301,6 +301,7 @@ def subword_tokenize(
     **Options for engine**
         * *tcc* (default) -  Thai Character Cluster (Theeramunkong et al. 2000)
         * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
+        * *wangchanberta* - SentencePiece from wangchanberta model.
 
     :Example:
 
@@ -338,6 +339,8 @@ def subword_tokenize(
         from pythainlp.tokenize.tcc import segment
     elif engine == "etcc":
         from pythainlp.tokenize.etcc import segment
+    elif engine == "wangchanberta":
+        from pythainlp.wangchanberta import segment
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
diff --git a/pythainlp/wangchanberta/__init__.py b/pythainlp/wangchanberta/__init__.py
new file mode 100644
index 000000000..048d2b6e5
--- /dev/null
+++ b/pythainlp/wangchanberta/__init__.py
@@ -0,0 +1,8 @@
+__all__ = [
+    "ThaiNameTagger",
+    "pos_tag",
+    "segment",
+]
+
+from pythainlp.wangchanberta.core import ThaiNameTagger, segment
+from pythainlp.wangchanberta.postag import pos_tag
\ No newline at end of file
diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/wangchanberta/core.py
similarity index 73%
rename from pythainlp/tag/wangchanberta.py
rename to pythainlp/wangchanberta/core.py
index 87f574084..512fdf6e8 100644
--- a/pythainlp/tag/wangchanberta.py
+++ b/pythainlp/wangchanberta/core.py
@@ -6,24 +6,24 @@
     pipeline,
 )
 
+_model_name = "wangchanberta-base-att-spm-uncased"
+_tokenizer = CamembertTokenizer.from_pretrained(
+        f'airesearch/{_model_name}',
+        revision='main')
+if _model_name == "wangchanberta-base-att-spm-uncased":
+    _tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
+
 
 class ThaiNameTagger:
     def __init__(self,
                 dataset_name: str = "thainer",
-                model_name: str = "wangchanberta-base-att-spm-uncased",
                 grouped_entities: bool = True):
-        self.model_name = model_name
         self.dataset_name = dataset_name
         self.grouped_entities = grouped_entities
-        self.tokenizer = CamembertTokenizer.from_pretrained(
-                                    f'airesearch/{self.model_name}',
-                                    revision='main')
-        if self.model_name == "wangchanberta-base-att-spm-uncased":
-            self.tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
         self.classify_tokens = pipeline(
             task='ner',
-            tokenizer=self.tokenizer,
-            model = f'airesearch/{self.model_name}',
+            tokenizer=_tokenizer,
+            model = f'airesearch/{_model_name}',
             revision = f'finetuned@{self.dataset_name}-ner',
             ignore_labels=[], 
             grouped_entities=self.grouped_entities
@@ -84,24 +84,17 @@ def get_ner(
 class PosTagTransformers:
     def __init__(self,
                 corpus: str = "lst20",
-                model_name:str = "wangchanberta-base-att-spm-uncased",
                 grouped_word: bool = False
                 ) -> None:
-        self.model_name = model_name
         self.corpus = corpus
         self.grouped_word = grouped_word
-        self.tokenizer = CamembertTokenizer.from_pretrained(
-                                    f'airesearch/{self.model_name}',
-                                    revision='main')
-        if self.model_name == "wangchanberta-base-att-spm-uncased":
-            self.tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
         self.load()
 
     def load(self):
         self.classify_tokens = pipeline(
             task='ner',
-            tokenizer=self.tokenizer,
-            model = f'airesearch/{self.model_name}',
+            tokenizer=_tokenizer,
+            model = f'airesearch/{_model_name}',
             revision = f'finetuned@{self.corpus}-pos',
             ignore_labels=[], 
             grouped_entities=self.grouped_word
@@ -123,11 +116,5 @@ def tag(
             self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁']
         return self.sent_pos
 
-
-def wangchanberta_pos_tag(
-    text: str, corpus: str = "lst20", grouped_word = False
-) -> List[Tuple[str, str]]:
-    if corpus not in ["lst20"]:
-        raise NotImplementedError()
-    _tag = PosTagTransformers(corpus=corpus, grouped_word = grouped_word)
-    return _tag.tag(text)
\ No newline at end of file
+def segment(text):
+    return _tokenizer.tokenize(text)
\ No newline at end of file
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
new file mode 100644
index 000000000..6cc01604e
--- /dev/null
+++ b/pythainlp/wangchanberta/postag.py
@@ -0,0 +1,65 @@
+from typing import Dict, List, Tuple, Union
+import re
+from transformers import (
+    CamembertTokenizer,
+    AutoTokenizer,
+    pipeline,
+)
+
+_model_name = "wangchanberta-base-att-spm-uncased"
+_tokenizer = CamembertTokenizer.from_pretrained(
+        f'airesearch/{_model_name}',
+        revision='main')
+if _model_name == "wangchanberta-base-att-spm-uncased":
+    _tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
+
+
+class PosTagTransformers:
+    def __init__(self,
+                corpus: str = "lst20",
+                grouped_word: bool = False
+                ) -> None:
+        self.corpus = corpus
+        self.grouped_word = grouped_word
+        self.load()
+
+    def load(self):
+        self.classify_tokens = pipeline(
+            task='ner',
+            tokenizer=_tokenizer,
+            model = f'airesearch/{_model_name}',
+            revision = f'finetuned@{self.corpus}-pos',
+            ignore_labels=[], 
+            grouped_entities=self.grouped_word
+        )
+
+    def tag(
+        self, text: str, corpus: str = "lst20", grouped_word: bool = False
+    ) -> List[Tuple[str, str]]:
+        if (corpus != self.corpus and corpus in ['lst20']) or grouped_word != self.grouped_word:
+            self.grouped_word = grouped_word
+            self.corpus = corpus
+            self.load()
+        text = re.sub(" ", "<_>", text)
+        self.json_pos = self.classify_tokens(text)
+        self.output = ""
+        if grouped_word:
+            self.sent_pos = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_pos]
+        else:
+            self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁']
+        return self.sent_pos
+
+_corpus = "lst20"
+_grouped_word = False
+_postag = PosTagTransformers(corpus=_corpus, grouped_word = _grouped_word)
+
+def pos_tag(
+    text: str, corpus: str = "lst20", grouped_word = False
+) -> List[Tuple[str, str]]:
+    global _grouped_word,_postag
+    if corpus not in ["lst20"]:
+        raise NotImplementedError()
+    if _grouped_word != grouped_word:
+        _postag = PosTagTransformers(corpus=corpus, grouped_word = grouped_word)
+        _grouped_word = grouped_word
+    return _postag.tag(text)
\ No newline at end of file

From df20fd2f5d0d490c981cde080246bac8366b3187 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 20:25:03 +0700
Subject: [PATCH 08/34] Update wangchanberta requirements

---
 pythainlp/wangchanberta/__init__.py |  2 +-
 pythainlp/wangchanberta/core.py     |  2 +-
 pythainlp/wangchanberta/postag.py   | 13 ++++++++-----
 setup.py                            |  2 +-
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/pythainlp/wangchanberta/__init__.py b/pythainlp/wangchanberta/__init__.py
index 048d2b6e5..bef6eca56 100644
--- a/pythainlp/wangchanberta/__init__.py
+++ b/pythainlp/wangchanberta/__init__.py
@@ -5,4 +5,4 @@
 ]
 
 from pythainlp.wangchanberta.core import ThaiNameTagger, segment
-from pythainlp.wangchanberta.postag import pos_tag
\ No newline at end of file
+from pythainlp.wangchanberta.postag import pos_tag
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 512fdf6e8..d475fbe2e 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -117,4 +117,4 @@ def tag(
         return self.sent_pos
 
 def segment(text):
-    return _tokenizer.tokenize(text)
\ No newline at end of file
+    return _tokenizer.tokenize(text)
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
index 6cc01604e..0df0eb038 100644
--- a/pythainlp/wangchanberta/postag.py
+++ b/pythainlp/wangchanberta/postag.py
@@ -44,7 +44,7 @@ def tag(
         self.json_pos = self.classify_tokens(text)
         self.output = ""
         if grouped_word:
-            self.sent_pos = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_pos]
+            self.sent_pos = [(i['word'].replace("<_>", " "), i['entity_group']) for i in self.json_pos]
         else:
             self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁']
         return self.sent_pos
@@ -54,12 +54,15 @@ def tag(
 _postag = PosTagTransformers(corpus=_corpus, grouped_word = _grouped_word)
 
 def pos_tag(
-    text: str, corpus: str = "lst20", grouped_word = False
+    text: str, corpus: str = "lst20", grouped_word: bool = False
 ) -> List[Tuple[str, str]]:
-    global _grouped_word,_postag
+    global _grouped_word, _postag
     if corpus not in ["lst20"]:
         raise NotImplementedError()
     if _grouped_word != grouped_word:
-        _postag = PosTagTransformers(corpus=corpus, grouped_word = grouped_word)
+        _postag = PosTagTransformers(
+            corpus=corpus,
+            grouped_word = grouped_word
+        )
         _grouped_word = grouped_word
-    return _postag.tag(text)
\ No newline at end of file
+    return _postag.tag(text)
diff --git a/setup.py b/setup.py
index cec6f9a8b..cad25696e 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7 @@
         "sentencepiece>=0.1.91",
         "torch>=1.0.0",
     ],
-    "transformers": ["transformers"],
+    "wangchanberta": ["transformers", "sentencepiece"],
     "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"],
     "wordnet": ["nltk>=3.3.*"],
     "full": [

From c94f24160e1d9189f13ea0029cccb815fb68d3da Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 20:30:55 +0700
Subject: [PATCH 09/34] Forward corpus and grouped_word to PosTagTransformers.tag

---
 pythainlp/wangchanberta/postag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
index 0df0eb038..1a6697ba3 100644
--- a/pythainlp/wangchanberta/postag.py
+++ b/pythainlp/wangchanberta/postag.py
@@ -65,4 +65,4 @@ def pos_tag(
             grouped_word = grouped_word
         )
         _grouped_word = grouped_word
-    return _postag.tag(text)
+    return _postag.tag(text, corpus = corpus,grouped_word = grouped_word)

From e4f7ba1d7bfb69474bb1b58d4a36f230bc31566d Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 20:42:29 +0700
Subject: [PATCH 10/34] Handle lst20 dataset labels in ThaiNameTagger.get_ner

---
 pythainlp/wangchanberta/core.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index d475fbe2e..8a4e70407 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -53,10 +53,15 @@ def get_ner(
         text = re.sub(" ", "<_>", text)
         self.json_ner = self.classify_tokens(text)
         self.output = ""
-        if self.grouped_entities:
+        if self.grouped_entities and self.dataset_name == "thainer":
             self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'])) for i in self.json_ner]
-        else:
+        elif self.dataset_name == "thainer":
             self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁']
+        elif self.grouped_entities and self.dataset_name == "lst20":
+            self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'].replace('_','-').replace('E-','I-'))) for i in self.json_ner]
+        else:
+            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner]
+
         if tag:
             temp = ""
             sent = ""

From 79f0b833782aa5d9c5c0868a578fb8470d2df8ec Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 20:43:29 +0700
Subject: [PATCH 11/34] Stop applying IOB() to grouped lst20 labels

---
 pythainlp/wangchanberta/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 8a4e70407..74c055c43 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -58,7 +58,7 @@ def get_ner(
         elif self.dataset_name == "thainer":
             self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁']
         elif self.grouped_entities and self.dataset_name == "lst20":
-            self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'].replace('_','-').replace('E-','I-'))) for i in self.json_ner]
+            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner]
         else:
             self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner]
 

From ca81865d932d704a508b79ae8ef7f15bb75c8df7 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 20:47:09 +0700
Subject: [PATCH 12/34] Read 'entity' instead of 'entity_group' in get_ner fallback branch

---
 pythainlp/wangchanberta/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 74c055c43..05150bf70 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -60,7 +60,7 @@ def get_ner(
         elif self.grouped_entities and self.dataset_name == "lst20":
             self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner]
         else:
-            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner]
+            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity'].replace('_','-').replace('E-','I-')) for i in self.json_ner]
 
         if tag:
             temp = ""

From c4c1c4c524f5d6c6e5709b12f942e519c5fffeeb Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 21:39:54 +0700
Subject: [PATCH 13/34] Add wangchanberta tests, fix pos_tag import, guard empty input

---
 pythainlp/tag/pos_tag.py          |  2 +-
 pythainlp/wangchanberta/core.py   |  5 ++++-
 pythainlp/wangchanberta/postag.py |  4 ++++
 tests/test_tag.py                 |  9 +++++++++
 tests/test_tokenize.py            |  9 +++++++++
 tests/test_wangchanberta.py       | 18 ++++++++++++++++++
 6 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_wangchanberta.py

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index 8ca0eee2f..c8f1ae99d 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -90,7 +90,7 @@ def pos_tag(
     if engine == "perceptron":
         from pythainlp.tag.perceptron import tag as tag_
     elif engine == "wangchanberta" and corpus == "lst20":
-        import pythainlp.wangchanberta.pos_tag as tag_
+        from pythainlp.wangchanberta.postag import pos_tag as tag_
         words = ''.join(words)
     else:  # default, use "unigram" ("old") engine
         from pythainlp.tag.unigram import tag as tag_
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 05150bf70..65c4c150e 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -121,5 +121,8 @@ def tag(
             self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁']
         return self.sent_pos
 
-def segment(text):
+def segment(text: str) -> List[str]:
+    if not text or not isinstance(text, str):
+        return []
+
     return _tokenizer.tokenize(text)
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
index 1a6697ba3..d69b45c27 100644
--- a/pythainlp/wangchanberta/postag.py
+++ b/pythainlp/wangchanberta/postag.py
@@ -57,6 +57,10 @@ def pos_tag(
     text: str, corpus: str = "lst20", grouped_word: bool = False
 ) -> List[Tuple[str, str]]:
     global _grouped_word, _postag
+    if isinstance(text, list):
+        text = ''.join(text)
+    elif not text or not isinstance(text, str):
+        return []
     if corpus not in ["lst20"]:
         raise NotImplementedError()
     if _grouped_word != grouped_word:
diff --git a/tests/test_tag.py b/tests/test_tag.py
index f4ffef759..5999a29ea 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -93,6 +93,15 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag(tokens, engine="perceptron", corpus="lst20_ud")
         )
+        self.assertEqual(
+            pos_tag([], engine="wangchanberta", corpus="lst20"), []
+        )
+        self.assertIsNotNone(
+            pos_tag(tokens, engine="wangchanberta", corpus="lst20")
+        )
+        self.assertIsNotNone(
+            pos_tag(tokens, engine="wangchanberta", corpus="lst20_ud")
+        )
 
         self.assertEqual(pos_tag_sents(None), [])
         self.assertEqual(pos_tag_sents([]), [])
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 7b54def4f..f234d3c20 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -286,6 +286,15 @@ def test_subword_tokenize(self):
             "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc")
         )
         self.assertIsInstance(subword_tokenize("โควิด19", engine="etcc"), list)
+        self.assertEqual(subword_tokenize(None, engine="wangchanberta"), [])
+        self.assertEqual(subword_tokenize("", engine="wangchanberta"), [])
+        self.assertIsInstance(
+            subword_tokenize("สวัสดิีดาวอังคาร", engine="wangchanberta"), list
+        )
+        self.assertFalse(
+            "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="wangchanberta")
+        )
+        self.assertIsInstance(subword_tokenize("โควิด19", engine="wangchanberta"), list)
         self.assertFalse(
             " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
         )
diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py
new file mode 100644
index 000000000..21b6eafcd
--- /dev/null
+++ b/tests/test_wangchanberta.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+
+from pythainlp.wangchanberta import ThaiNameTagger, pos_tag
+
+
+class TestWangchanberta(unittest.TestCase):
+    def test_thainer_wangchanberta(self):
+        ner = ThaiNameTagger()
+        self.assertIsNotNone(
+            ner.get_ner("I คิด therefore I am ผ็ฎ์")
+        )
+    def test_lst20_ner_wangchanberta(self):
+        ner = ThaiNameTagger(dataset_name="lst20")
+        self.assertIsNotNone(
+            ner.get_ner("I คิด therefore I am ผ็ฎ์")
+        )

From a57bd4ac9854670a0a35031ed21ed6b0ccebaccf Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 21:58:50 +0700
Subject: [PATCH 14/34] Update test

---
 pythainlp/wangchanberta/core.py | 35 ---------------------------------
 tests/test_wangchanberta.py     | 18 ++++++++++++++++-
 2 files changed, 17 insertions(+), 36 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 65c4c150e..76dd806a0 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -86,41 +86,6 @@ def get_ner(
         return self.sent_ner
 
 
-class PosTagTransformers:
-    def __init__(self,
-                corpus: str = "lst20",
-                grouped_word: bool = False
-                ) -> None:
-        self.corpus = corpus
-        self.grouped_word = grouped_word
-        self.load()
-
-    def load(self):
-        self.classify_tokens = pipeline(
-            task='ner',
-            tokenizer=_tokenizer,
-            model = f'airesearch/{_model_name}',
-            revision = f'finetuned@{self.corpus}-pos',
-            ignore_labels=[], 
-            grouped_entities=self.grouped_word
-        )
-
-    def tag(
-        self, text: str, corpus: str = False, grouped_word: bool = False
-    ) -> List[Tuple[str, str]]:
-        if (corpus != self.corpus and corpus in ['lst20']) or grouped_word != self.grouped_word:
-            self.grouped_word = grouped_word
-            self.corpus = corpus
-            self.load()
-        text = re.sub(" ", "<_>", text)
-        self.json_pos = self.classify_tokens(text)
-        self.output = ""
-        if grouped_word:
-            self.sent_pos = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_pos]
-        else:
-            self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁']
-        return self.sent_pos
-
 def segment(text: str) -> List[str]:
     if not text or not isinstance(text, str):
         return []
diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py
index 21b6eafcd..c67f694cf 100644
--- a/tests/test_wangchanberta.py
+++ b/tests/test_wangchanberta.py
@@ -2,7 +2,7 @@
 
 import unittest
 
-from pythainlp.wangchanberta import ThaiNameTagger, pos_tag
+from pythainlp.wangchanberta import ThaiNameTagger, pos_tag, segment
 
 
 class TestWangchanberta(unittest.TestCase):
@@ -11,8 +11,24 @@ def test_thainer_wangchanberta(self):
         self.assertIsNotNone(
             ner.get_ner("I คิด therefore I am ผ็ฎ์")
         )
+        ner = ThaiNameTagger()
+        self.assertIsNotNone(
+            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
+        )
+
     def test_lst20_ner_wangchanberta(self):
         ner = ThaiNameTagger(dataset_name="lst20")
         self.assertIsNotNone(
             ner.get_ner("I คิด therefore I am ผ็ฎ์")
         )
+        self.assertIsNotNone(
+            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
+        )
+
+    def test_segment_wangchanberta(self):
+        self.assertIsNotNone(
+            segment("I คิด therefore I am ผ็ฎ์")
+        )
+        self.assertIsNotNone(
+            segment([])
+        )
\ No newline at end of file

From 68bf0503e2cf9810072fddc7547f6bbe1c9c146a Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 26 Feb 2021 22:55:32 +0700
Subject: [PATCH 15/34] Update test_wangchanberta.py

---
 tests/test_wangchanberta.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py
index c67f694cf..bf607dbe1 100644
--- a/tests/test_wangchanberta.py
+++ b/tests/test_wangchanberta.py
@@ -31,4 +31,4 @@ def test_segment_wangchanberta(self):
         )
         self.assertIsNotNone(
             segment([])
-        )
\ No newline at end of file
+        )

From 757f9ac1453b78666fa0a12a928f027ef0b23284 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 27 Feb 2021 00:34:44 +0700
Subject: [PATCH 16/34] Add pythainlp.wangchanberta docs

---
 docs/api/ulmfit.rst             |  2 ++
 docs/api/wangchanberta.rst      | 22 ++++++++++++++++++++++
 pythainlp/tokenize/core.py      | 13 ++++++++++++-
 pythainlp/wangchanberta/core.py |  7 +++++++
 4 files changed, 43 insertions(+), 1 deletion(-)
 create mode 100644 docs/api/wangchanberta.rst

diff --git a/docs/api/ulmfit.rst b/docs/api/ulmfit.rst
index c6756a6b4..1f9aa002a 100644
--- a/docs/api/ulmfit.rst
+++ b/docs/api/ulmfit.rst
@@ -3,6 +3,8 @@
 pythainlp.ulmfit
 ====================================
 
+Universal Language Model Fine-tuning for Text Classification (ULMFiT).
+
 Modules
 -------
 .. autoclass:: ThaiTokenizer
diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst
new file mode 100644
index 000000000..7f495aba8
--- /dev/null
+++ b/docs/api/wangchanberta.rst
@@ -0,0 +1,22 @@
+.. currentmodule:: pythainlp.wangchanberta
+
+pythainlp.wangchanberta
+=======================
+
+WangchanBERTa base model: wangchanberta-base-att-spm-uncased [#Lowphansirikul_2021]_
+
+We use WangchanBERTa for Thai named-entity tagging, part-of-speech tagging, and subword tokenization.
+
+Modules
+-------
+.. autoclass:: ThaiNameTagger
+.. autofunction:: pos_tag
+.. autofunction:: segment
+
+References
+----------
+
+.. [#Lowphansirikul_2021] Lowphansirikul L, Polpanumas C, Jantrakulchai N, Nutanong S.
+            WangchanBERTa: Pretraining transformer-based Thai Language Models.
+            arXiv:2101.09635 [cs] [Internet]. 2021 Jan 23 [cited 2021 Feb 27];
+            Available from: http://arxiv.org/abs/2101.09635
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 5304cd8b8..3fdd66e52 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -321,7 +321,7 @@ def subword_tokenize(
         # output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก',
         'และ', 'พัฒ','นา', 'กา', 'ร']
 
-    Tokenize text into subword based on *etcc* **(Work In Progress)**::
+    Tokenize text into subword based on *etcc*::
 
         text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
         text_2 = "ความแปลกแยกและพัฒนาการ"
@@ -331,6 +331,17 @@ def subword_tokenize(
 
         subword_tokenize(text_2, engine='etcc')
         # output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']
+
+    Tokenize text into subword based on *wangchanberta*::
+
+        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
+        text_2 = "ความแปลกแยกและพัฒนาการ"
+
+        subword_tokenize(text_1, engine='wangchanberta')
+        # output: ['▁', 'ยุค', 'เริ่มแรก', 'ของ', '▁', 'ราชวงศ์', 'หมิง']
+
+        subword_tokenize(text_2, engine='wangchanberta')
+        # output: ['▁ความ', 'แปลก', 'แยก', 'และ', 'พัฒนาการ']
     """
     if not text or not isinstance(text, str):
         return []
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 76dd806a0..256bd6cc3 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -87,6 +87,13 @@ def get_ner(
 
 
 def segment(text: str) -> List[str]:
+    """
+    Tokenize text into subwords with the wangchanberta SentencePiece model.
+
+    :param str text: text to be tokenized
+    :return: list of subwords
+    :rtype: list[str]
+    """
     if not text or not isinstance(text, str):
         return []
 

From 628cf50ccf1e6ae47c72b5b438c036823a198f9f Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 27 Feb 2021 14:45:30 +0700
Subject: [PATCH 17/34] Update tokenize.rst

---
 docs/api/tokenize.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index 8b8b08c14..f5ceb13a3 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -79,3 +79,5 @@ tcc
 etcc
 ++++
 .. automodule:: pythainlp.tokenize.etcc
+
+.. autofunction:: pythainlp.tokenize.etcc.segment
\ No newline at end of file

From 8abe1b4caa46da4f80640039b09566db1a64ed07 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Mon, 1 Mar 2021 12:30:10 +0700
Subject: [PATCH 18/34] Fixed PEP8

---
 pythainlp/wangchanberta/core.py   |  7 +++----
 pythainlp/wangchanberta/postag.py | 32 +++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 256bd6cc3..4f737ee6e 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -25,7 +25,7 @@ def __init__(self,
             tokenizer=_tokenizer,
             model = f'airesearch/{_model_name}',
             revision = f'finetuned@{self.dataset_name}-ner',
-            ignore_labels=[], 
+            ignore_labels=[],
             grouped_entities=self.grouped_entities
         )
     
@@ -58,9 +58,9 @@ def get_ner(
         elif self.dataset_name == "thainer":
             self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁']
         elif self.grouped_entities and self.dataset_name == "lst20":
-            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner]
+            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_', '-').replace('E-', 'I-')) for i in self.json_ner]
         else:
-            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity'].replace('_','-').replace('E-','I-')) for i in self.json_ner]
+            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity'].replace('_', '-').replace('E-', 'I-')) for i in self.json_ner]
 
         if tag:
             temp = ""
@@ -82,7 +82,6 @@ def get_ner(
                     sent += "</" + temp + ">"
 
             return sent
-        
         return self.sent_ner
 
 
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
index d69b45c27..6705a9417 100644
--- a/pythainlp/wangchanberta/postag.py
+++ b/pythainlp/wangchanberta/postag.py
@@ -17,8 +17,7 @@
 class PosTagTransformers:
     def __init__(self,
                 corpus: str = "lst20",
-                grouped_word: bool = False
-                ) -> None:
+                grouped_word: bool = False) -> None:
         self.corpus = corpus
         self.grouped_word = grouped_word
         self.load()
@@ -27,9 +26,9 @@ def load(self):
         self.classify_tokens = pipeline(
             task='ner',
             tokenizer=_tokenizer,
-            model = f'airesearch/{_model_name}',
-            revision = f'finetuned@{self.corpus}-pos',
-            ignore_labels=[], 
+            model=f'airesearch/{_model_name}',
+            revision=f'finetuned@{self.corpus}-pos',
+            ignore_labels=[],
             grouped_entities=self.grouped_word
         )
 
@@ -44,14 +43,23 @@ def tag(
         self.json_pos = self.classify_tokens(text)
         self.output = ""
         if grouped_word:
-            self.sent_pos = [(i['word'].replace("<_>", " "), i['entity_group']) for i in self.json_pos]
+            self.sent_pos = [
+                (i['word'].replace("<_>", " "),
+                i['entity_group']) for i in self.json_pos
+            ]
         else:
-            self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁']
+            self.sent_pos = [
+                (i['word'].replace("<_>", " ").replace('▁',''),
+                i['entity'])
+                for i in self.json_pos if i['word'] != '▁'
+            ]
         return self.sent_pos
 
+
 _corpus = "lst20"
 _grouped_word = False
-_postag = PosTagTransformers(corpus=_corpus, grouped_word = _grouped_word)
+_postag = PosTagTransformers(corpus=_corpus, grouped_word=_grouped_word)
+
 
 def pos_tag(
     text: str, corpus: str = "lst20", grouped_word: bool = False
@@ -66,7 +74,11 @@ def pos_tag(
     if _grouped_word != grouped_word:
         _postag = PosTagTransformers(
             corpus=corpus,
-            grouped_word = grouped_word
+            grouped_word=grouped_word
         )
         _grouped_word = grouped_word
-    return _postag.tag(text, corpus = corpus,grouped_word = grouped_word)
+    return _postag.tag(
+        text,
+        corpus=corpus,
+        grouped_word=grouped_word
+    )

From 8d1cbdbaddeb5a4675e2167ad7e81fad9f15b436 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Mon, 1 Mar 2021 23:08:56 +0700
Subject: [PATCH 19/34] Update test_wangchanberta.py

---
 tests/test_wangchanberta.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py
index bf607dbe1..05f78ca02 100644
--- a/tests/test_wangchanberta.py
+++ b/tests/test_wangchanberta.py
@@ -16,6 +16,11 @@ def test_thainer_wangchanberta(self):
             ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
         )
 
+        ner = ThaiNameTagger(grouped_entities=False)
+        self.assertIsNotNone(
+            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
+        )
+
     def test_lst20_ner_wangchanberta(self):
         ner = ThaiNameTagger(dataset_name="lst20")
         self.assertIsNotNone(
@@ -25,6 +30,14 @@ def test_lst20_ner_wangchanberta(self):
             ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
         )
 
+        ner = ThaiNameTagger(
+            dataset_name="lst20",
+            grouped_entities=False
+        )
+        self.assertIsNotNone(
+            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
+        )
+
     def test_segment_wangchanberta(self):
         self.assertIsNotNone(
             segment("I คิด therefore I am ผ็ฎ์")
@@ -32,3 +45,20 @@ def test_segment_wangchanberta(self):
         self.assertIsNotNone(
             segment([])
         )
+
+    def test_pos_tag_wangchanberta(self):
+        self.assertIsNotNone(
+            pos_tag("I คิด therefore I am ผ็ฎ์")
+        )
+        self.assertIsNotNone(
+            pos_tag(['I',' ','คิด',' ','therefore',' ','I',' ','am',' ','ผ็ฎ์'])
+        )
+        self.assertIsNotNone(
+            pos_tag(None)
+        )
+        self.assertIsNotNone(
+            pos_tag("I คิด therefore I am ผ็ฎ์",grouped_word=True)
+        )
+        self.assertIsNotNone(
+            pos_tag("ทดสอบระบบ",grouped_word=False)
+        )

From 4827c7ddfae4b8915f8dd8c96131a6c9c17dfbb5 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Mon, 1 Mar 2021 23:12:23 +0700
Subject: [PATCH 20/34] Update tests

---
 tests/test_tokenize.py      |  4 +++-
 tests/test_wangchanberta.py | 16 +++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index f234d3c20..d163238ce 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -294,7 +294,9 @@ def test_subword_tokenize(self):
         self.assertFalse(
             "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="wangchanberta")
         )
-        self.assertIsInstance(subword_tokenize("โควิด19", engine="wangchanberta"), list)
+        self.assertIsInstance(
+            subword_tokenize("โควิด19", engine="wangchanberta"), list
+        )
         self.assertFalse(
             " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
         )
diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py
index 05f78ca02..c7e2f1609 100644
--- a/tests/test_wangchanberta.py
+++ b/tests/test_wangchanberta.py
@@ -51,7 +51,21 @@ def test_pos_tag_wangchanberta(self):
             pos_tag("I คิด therefore I am ผ็ฎ์")
         )
         self.assertIsNotNone(
-            pos_tag(['I',' ','คิด',' ','therefore',' ','I',' ','am',' ','ผ็ฎ์'])
+            pos_tag(
+                [
+                    'I',
+                    ' ',
+                    'คิด',
+                    ' ',
+                    'therefore',
+                    ' ',
+                    'I',
+                    ' ',
+                    'am',
+                    ' ',
+                    'ผ็ฎ์'
+                ]
+            )
         )
         self.assertIsNotNone(
             pos_tag(None)

From 1e4a0d4d8ed7dcf71acd7a21b4ae5454a367ac12 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 5 Mar 2021 17:03:18 +0700
Subject: [PATCH 21/34] Fixed PEP8

---
 pythainlp/wangchanberta/core.py   |  2 +-
 pythainlp/wangchanberta/postag.py |  6 ++++--
 tests/test_wangchanberta.py       | 12 ++++++------
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 4f737ee6e..6040df631 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -28,7 +28,7 @@ def __init__(self,
             ignore_labels=[],
             grouped_entities=self.grouped_entities
         )
-    
+
     def IOB(self, tag):
         if tag != "O":
             return "B-"+tag
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
index 6705a9417..7153c61b2 100644
--- a/pythainlp/wangchanberta/postag.py
+++ b/pythainlp/wangchanberta/postag.py
@@ -49,8 +49,10 @@ def tag(
             ]
         else:
             self.sent_pos = [
-                (i['word'].replace("<_>", " ").replace('▁',''),
-                i['entity'])
+                (
+                    i['word'].replace("<_>", " ").replace('▁', ''),
+                    i['entity']
+                )
                 for i in self.json_pos if i['word'] != '▁'
             ]
         return self.sent_pos
diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py
index c7e2f1609..e26138572 100644
--- a/tests/test_wangchanberta.py
+++ b/tests/test_wangchanberta.py
@@ -13,12 +13,12 @@ def test_thainer_wangchanberta(self):
         )
         ner = ThaiNameTagger()
         self.assertIsNotNone(
-            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
+            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True)
         )
 
         ner = ThaiNameTagger(grouped_entities=False)
         self.assertIsNotNone(
-            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
+            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True)
         )
 
     def test_lst20_ner_wangchanberta(self):
@@ -27,7 +27,7 @@ def test_lst20_ner_wangchanberta(self):
             ner.get_ner("I คิด therefore I am ผ็ฎ์")
         )
         self.assertIsNotNone(
-            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
+            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True)
         )
 
         ner = ThaiNameTagger(
@@ -35,7 +35,7 @@ def test_lst20_ner_wangchanberta(self):
             grouped_entities=False
         )
         self.assertIsNotNone(
-            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True)
+            ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True)
         )
 
     def test_segment_wangchanberta(self):
@@ -71,8 +71,8 @@ def test_pos_tag_wangchanberta(self):
             pos_tag(None)
         )
         self.assertIsNotNone(
-            pos_tag("I คิด therefore I am ผ็ฎ์",grouped_word=True)
+            pos_tag("I คิด therefore I am ผ็ฎ์", grouped_word=True)
         )
         self.assertIsNotNone(
-            pos_tag("ทดสอบระบบ",grouped_word=False)
+            pos_tag("ทดสอบระบบ", grouped_word=False)
         )

From 8de00f77ab6941eacc29e92766d3f732ed691125 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Fri, 5 Mar 2021 17:13:08 +0700
Subject: [PATCH 22/34] Fixed PEP8

---
 pythainlp/wangchanberta/core.py   | 44 ++++++++++++++++++++++---------
 pythainlp/wangchanberta/postag.py | 17 +++++++-----
 2 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 6040df631..5f14b957d 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -15,19 +15,20 @@
 
 
 class ThaiNameTagger:
-    def __init__(self,
-                dataset_name: str = "thainer",
-                grouped_entities: bool = True):
+    def __init__(
+        self,
+        dataset_name: str = "thainer",
+        grouped_entities: bool = True
+    ):
         self.dataset_name = dataset_name
         self.grouped_entities = grouped_entities
         self.classify_tokens = pipeline(
             task='ner',
             tokenizer=_tokenizer,
-            model = f'airesearch/{_model_name}',
-            revision = f'finetuned@{self.dataset_name}-ner',
+            model=f'airesearch/{_model_name}',
+            revision=f'finetuned@{self.dataset_name}-ner',
             ignore_labels=[],
-            grouped_entities=self.grouped_entities
-        )
+            grouped_entities=self.grouped_entities)
 
     def IOB(self, tag):
         if tag != "O":
@@ -40,7 +41,8 @@ def get_ner(
         """
         This function tags named-entitiy from text in IOB format.
 
-        Powered by wangchanberta from VISTEC-depa AI Research Institute of Thailand
+        Powered by wangchanberta from VISTEC-depa\
+             AI Research Institute of Thailand
         :param str text: text in Thai to be tagged
         :param bool tag: output like html tag.
         :return: a list of tuple associated with tokenized word group, NER tag,
@@ -54,13 +56,31 @@ def get_ner(
         self.json_ner = self.classify_tokens(text)
         self.output = ""
         if self.grouped_entities and self.dataset_name == "thainer":
-            self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'])) for i in self.json_ner]
+            self.sent_ner = [
+                (
+                    i['word'].replace("<_>", " "), self.IOB(i['entity_group'])
+                ) for i in self.json_ner
+            ]
         elif self.dataset_name == "thainer":
-            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁']
+            self.sent_ner = [
+                (
+                    i['word'].replace("<_>", " "), i['entity']
+                ) for i in self.json_ner if i['word'] != '▁'
+            ]
         elif self.grouped_entities and self.dataset_name == "lst20":
-            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_', '-').replace('E-', 'I-')) for i in self.json_ner]
+            self.sent_ner = [
+                (
+                    i['word'].replace("<_>", " "),
+                    i['entity_group'].replace('_', '-').replace('E-', 'I-')
+                ) for i in self.json_ner
+            ]
         else:
-            self.sent_ner = [(i['word'].replace("<_>", " "), i['entity'].replace('_', '-').replace('E-', 'I-')) for i in self.json_ner]
+            self.sent_ner = [
+                (
+                    i['word'].replace("<_>", " "),
+                    i['entity'].replace('_', '-').replace('E-', 'I-')
+                ) for i in self.json_ner
+            ]
 
         if tag:
             temp = ""
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
index 7153c61b2..cca9e8789 100644
--- a/pythainlp/wangchanberta/postag.py
+++ b/pythainlp/wangchanberta/postag.py
@@ -15,9 +15,11 @@
 
 
 class PosTagTransformers:
-    def __init__(self,
-                corpus: str = "lst20",
-                grouped_word: bool = False) -> None:
+    def __init__(
+        self,
+        corpus: str = "lst20",
+        grouped_word: bool = False
+    ) -> None:
         self.corpus = corpus
         self.grouped_word = grouped_word
         self.load()
@@ -35,7 +37,9 @@ def load(self):
     def tag(
         self, text: str, corpus: str = "lst20", grouped_word: bool = False
     ) -> List[Tuple[str, str]]:
-        if (corpus != self.corpus and corpus in ['lst20']) or grouped_word != self.grouped_word:
+        if (
+            corpus != self.corpus and corpus in ['lst20']
+        ) or grouped_word != self.grouped_word:
             self.grouped_word = grouped_word
             self.corpus = corpus
             self.load()
@@ -44,8 +48,9 @@ def tag(
         self.output = ""
         if grouped_word:
             self.sent_pos = [
-                (i['word'].replace("<_>", " "),
-                i['entity_group']) for i in self.json_pos
+                (
+                    i['word'].replace("<_>", " "), i['entity_group']
+                ) for i in self.json_pos
             ]
         else:
             self.sent_pos = [

From f1b0a0e2ab7ee51a05401ecfd913b8af3db7388a Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 6 Mar 2021 15:48:24 +0700
Subject: [PATCH 23/34] Update core.py

---
 pythainlp/wangchanberta/core.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 5f14b957d..fb2d8ac30 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -58,26 +58,26 @@ def get_ner(
         if self.grouped_entities and self.dataset_name == "thainer":
             self.sent_ner = [
                 (
-                    i['word'].replace("<_>", " "), self.IOB(i['entity_group'])
+                    i['word'].replace("<_>", " ").replace('▁', ''), self.IOB(i['entity_group'])
                 ) for i in self.json_ner
             ]
         elif self.dataset_name == "thainer":
             self.sent_ner = [
                 (
-                    i['word'].replace("<_>", " "), i['entity']
+                    i['word'].replace("<_>", " ").replace('▁', ''), i['entity']
                 ) for i in self.json_ner if i['word'] != '▁'
             ]
         elif self.grouped_entities and self.dataset_name == "lst20":
             self.sent_ner = [
                 (
-                    i['word'].replace("<_>", " "),
+                    i['word'].replace("<_>", " ").replace('▁', ''),
                     i['entity_group'].replace('_', '-').replace('E-', 'I-')
                 ) for i in self.json_ner
             ]
         else:
             self.sent_ner = [
                 (
-                    i['word'].replace("<_>", " "),
+                    i['word'].replace("<_>", " ").replace('▁', ''),
                     i['entity'].replace('_', '-').replace('E-', 'I-')
                 ) for i in self.json_ner
             ]

From f8a0efa3b6f048c3e9987fb1aabfcbfcfc93cf27 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 6 Mar 2021 15:49:04 +0700
Subject: [PATCH 24/34] Update core.py

---
 pythainlp/wangchanberta/core.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index fb2d8ac30..e3a075028 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -58,7 +58,8 @@ def get_ner(
         if self.grouped_entities and self.dataset_name == "thainer":
             self.sent_ner = [
                 (
-                    i['word'].replace("<_>", " ").replace('▁', ''), self.IOB(i['entity_group'])
+                    i['word'].replace("<_>", " ").replace('▁', ''),
+                    self.IOB(i['entity_group'])
                 ) for i in self.json_ner
             ]
         elif self.dataset_name == "thainer":

From f5ae3ad9c3779894cecd8b2da480c5701bf646cf Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Tue, 9 Mar 2021 19:34:53 +0700
Subject: [PATCH 25/34] Update core

---
 pythainlp/wangchanberta/core.py | 5 ++++-
 tests/test_wangchanberta.py     | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index e3a075028..8dd39de1e 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -103,7 +103,10 @@ def get_ner(
                     sent += "</" + temp + ">"
 
             return sent
-        return self.sent_ner
+        if self.sent_ner[0][0] == '' and len(self.sent_ner)>1:
+            return self.sent_ner[1:]
+        else:
+            return self.sent_ner
 
 
 def segment(text: str) -> List[str]:
diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py
index e26138572..341c2119e 100644
--- a/tests/test_wangchanberta.py
+++ b/tests/test_wangchanberta.py
@@ -15,6 +15,9 @@ def test_thainer_wangchanberta(self):
         self.assertIsNotNone(
             ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True)
         )
+        self.assertIsNotNone(
+            ner.get_ner("โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", tag=True)
+        )
 
         ner = ThaiNameTagger(grouped_entities=False)
         self.assertIsNotNone(
@@ -29,6 +32,9 @@ def test_lst20_ner_wangchanberta(self):
         self.assertIsNotNone(
             ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True)
         )
+        self.assertIsNotNone(
+            ner.get_ner("โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", tag=True)
+        )
 
         ner = ThaiNameTagger(
             dataset_name="lst20",

From 00b27532e55db3ef98106cd57446d93dbf4f10c4 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Thu, 11 Mar 2021 15:06:23 +0700
Subject: [PATCH 26/34] Fixed PEP8

---
 pythainlp/wangchanberta/core.py |  2 +-
 tests/test_wangchanberta.py     | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 8dd39de1e..e85d842fd 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -103,7 +103,7 @@ def get_ner(
                     sent += "</" + temp + ">"
 
             return sent
-        if self.sent_ner[0][0] == '' and len(self.sent_ner)>1:
+        if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1:
             return self.sent_ner[1:]
         else:
             return self.sent_ner
diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py
index 341c2119e..85a722b0b 100644
--- a/tests/test_wangchanberta.py
+++ b/tests/test_wangchanberta.py
@@ -16,7 +16,10 @@ def test_thainer_wangchanberta(self):
             ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True)
         )
         self.assertIsNotNone(
-            ner.get_ner("โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", tag=True)
+            ner.get_ner(
+                "โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ",
+                tag=True
+            )
         )
 
         ner = ThaiNameTagger(grouped_entities=False)
@@ -33,7 +36,10 @@ def test_lst20_ner_wangchanberta(self):
             ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True)
         )
         self.assertIsNotNone(
-            ner.get_ner("โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", tag=True)
+            ner.get_ner(
+                "โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ",
+                tag=True
+            )
         )
 
         ner = ThaiNameTagger(

From c637c8a07582317ca39b8e06b2647a38cde83c08 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Thu, 11 Mar 2021 16:06:33 +0700
Subject: [PATCH 27/34] Update core.py

---
 pythainlp/wangchanberta/core.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index e85d842fd..3be87b537 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -82,7 +82,12 @@ def get_ner(
                     i['entity'].replace('_', '-').replace('E-', 'I-')
                 ) for i in self.json_ner
             ]
-
+        if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1:
+            self.sent_ner = self.sent_ner[1:]
+        for idx, (word, ner) in enumerate(self.sent_ner):
+            if idx > 0 and ner.startswith("B-"):
+                if ner.replace('B-', '') == self.sent_ner[idx-1][1].replace('B-', '').replace('I-', ''):
+                    self.sent_ner[idx] = (word,ner.replace('B-', 'I-'))
         if tag:
             temp = ""
             sent = ""
@@ -103,8 +108,7 @@ def get_ner(
                     sent += "</" + temp + ">"
 
             return sent
-        if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1:
-            return self.sent_ner[1:]
+        
         else:
             return self.sent_ner
 

From f8d438a3112d4ed64013b9e68c2fabeff1f20214 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Thu, 11 Mar 2021 16:13:02 +0700
Subject: [PATCH 28/34] Update core.py

---
 pythainlp/wangchanberta/core.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index 3be87b537..df74c1803 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -35,6 +35,9 @@ def IOB(self, tag):
             return "B-"+tag
         return "O"
 
+    def clear_tag(self, tag):
+        return tag.replace('B-', '').replace('I-', '')
+
     def get_ner(
         self, text: str, tag: bool = False
     ) -> List[Tuple[str, str]]:
@@ -86,7 +89,9 @@ def get_ner(
             self.sent_ner = self.sent_ner[1:]
         for idx, (word, ner) in enumerate(self.sent_ner):
             if idx > 0 and ner.startswith("B-"):
-                if ner.replace('B-', '') == self.sent_ner[idx-1][1].replace('B-', '').replace('I-', ''):
+                if (
+                    self.clear_tag(ner) == self.clear_tag(self.sent_ner[idx-1][1])
+                ):
                     self.sent_ner[idx] = (word,ner.replace('B-', 'I-'))
         if tag:
             temp = ""
@@ -108,7 +113,6 @@ def get_ner(
                     sent += "</" + temp + ">"
 
             return sent
-        
         else:
             return self.sent_ner
 

From 87b6119e172d3c33d476a1de59ce93f0b792317c Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Thu, 11 Mar 2021 16:14:24 +0700
Subject: [PATCH 29/34] Update core.py

---
 pythainlp/wangchanberta/core.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index df74c1803..e59603273 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -90,9 +90,11 @@ def get_ner(
         for idx, (word, ner) in enumerate(self.sent_ner):
             if idx > 0 and ner.startswith("B-"):
                 if (
-                    self.clear_tag(ner) == self.clear_tag(self.sent_ner[idx-1][1])
+                    self.clear_tag(ner) == self.clear_tag(
+                        self.sent_ner[idx-1][1]
+                    )
                 ):
-                    self.sent_ner[idx] = (word,ner.replace('B-', 'I-'))
+                    self.sent_ner[idx] = (word, ner.replace('B-', 'I-'))
         if tag:
             temp = ""
             sent = ""

From 9e04a18708632b407508d285735fb4d7b3ed6463 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Thu, 11 Mar 2021 19:14:56 +0700
Subject: [PATCH 30/34] Update wangchanberta.rst

---
 docs/api/wangchanberta.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst
index 7f495aba8..37080421c 100644
--- a/docs/api/wangchanberta.rst
+++ b/docs/api/wangchanberta.rst
@@ -3,7 +3,7 @@
 pythainlp.wangchanberta
 =======================
 
-WangchanBERTa base model: wangchanberta-base-att-spm-uncased[#Lowphansirikul_2021]_
+WangchanBERTa base model: wangchanberta-base-att-spm-uncased [#Lowphansirikul_2021]_
 
 We used WangchanBERTa for Thai name tagger task, part-of-speech and subword tokenizer.
 

From 34a034ae8f4590cb3750c5eefd557803898f5f4d Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 13 Mar 2021 20:12:54 +0700
Subject: [PATCH 31/34] Update pos_tag docs

---
 pythainlp/tag/pos_tag.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index c8f1ae99d..a51ecb3ae 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -12,12 +12,14 @@ def pos_tag(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
         * *unigram* - unigram tagger
-        * *wangchanberta* - wangchanberta model (support lst20 corpus only)
+        * *wangchanberta* - wangchanberta model (support lst20 corpus only \
+            and it supports a string only. if you input a list of word, it will \
+            convert list word to a string.
     :param str corpus:
         the corpus that used to create the language model for tagger
         * *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \
             by National Electronics and Computer Technology Center, Thailand
-        * *lst20_ud* - LST20 text, with tags mapped to Universal POS tags \
+        * *lst20_ud* - LST20 text, with tags mapped to Universal POS tag \
             from `Universal Dependencies <https://universaldependencies.org/>`
         * *orchid* - `ORCHID \
             <https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \

From 5bbbebe4172c02a4f03704bd23c88511f28553ca Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sat, 13 Mar 2021 22:18:46 +0700
Subject: [PATCH 32/34] Update pos_tag.py

---
 pythainlp/tag/pos_tag.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index a51ecb3ae..97f1a6d70 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -13,8 +13,8 @@ def pos_tag(
         * *perceptron* - perceptron tagger (default)
         * *unigram* - unigram tagger
         * *wangchanberta* - wangchanberta model (support lst20 corpus only \
-            and it supports a string only. if you input a list of word, it will \
-            convert list word to a string.
+            and it supports a string only. If you input a list of words, \
+            it will convert the list of words to a string.)
     :param str corpus:
         the corpus that used to create the language model for tagger
         * *lst20* - `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus \

From e4cf8dac124074d6986cccf04c46e47ddaa05aa5 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Mon, 15 Mar 2021 22:12:24 +0700
Subject: [PATCH 33/34] Add pythainlp.wangchanberta Speed Benchmark

---
 docs/api/wangchanberta.rst | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst
index 37080421c..74eb18b1d 100644
--- a/docs/api/wangchanberta.rst
+++ b/docs/api/wangchanberta.rst
@@ -7,6 +7,33 @@ WangchanBERTa base model: wangchanberta-base-att-spm-uncased [#Lowphansirikul_20
 
 We used WangchanBERTa for Thai name tagger task, part-of-speech and subword tokenizer.
 
+**Speed Benchmark**
+
++-------------------------+-------------------------+----------------+
+| Function                | Named Entity            | Part of Speech |
+|                         | Recognition             |                |
++=========================+=========================+================+
+| PyThaiNLP basic         | 89.7 ms                 | 312 ms         |
+| function (CRF for NER   |                         |                |
+| and perceptron model    |                         |                |
+| for POS)                |                         |                |
++-------------------------+-------------------------+----------------+
+| pythainlp.wangchanberta | 9.64 s                  | 9.65 s         |
+| (CPU)                   |                         |                |
++-------------------------+-------------------------+----------------+
+| pythainlp.wangchanberta | 8.02 s                  | 8 s            |
+| (GPU)                   |                         |                |
++-------------------------+-------------------------+----------------+
+
+Notebook:
+
+-  `PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google
+   Colab`_
+-  `pythainlp.wangchanberta GPU`_
+
+.. _PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google Colab: https://colab.research.google.com/drive/1ymTVB1UESXAyZlSpjknCb72xpdcZ86Db?usp=sharing
+.. _pythainlp.wangchanberta GPU: https://colab.research.google.com/drive/1AtkFT1HMGL2GO7O2tM_hi_7mExKwmhMw?usp=sharing
+
 Modules
 -------
 .. autoclass:: ThaiNameTagger

From ff6d300379d79bb92d2d8a7a58303cf110106a95 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Mon, 15 Mar 2021 22:55:54 +0700
Subject: [PATCH 34/34] Update docs

---
 docs/api/wangchanberta.rst        | 23 ++++++++---------------
 pythainlp/wangchanberta/core.py   | 21 ++++++++++++++++-----
 pythainlp/wangchanberta/postag.py | 10 ++++++++++
 3 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst
index 74eb18b1d..8e4124177 100644
--- a/docs/api/wangchanberta.rst
+++ b/docs/api/wangchanberta.rst
@@ -9,21 +9,13 @@ We used WangchanBERTa for Thai name tagger task, part-of-speech and subword toke
 
 **Speed Benchmark**
 
-+-------------------------+-------------------------+----------------+
-| Function                | Named Entity            | Part of Speech |
-|                         | Recognition             |                |
-+=========================+=========================+================+
-| PyThaiNLP basic         | 89.7 ms                 | 312 ms         |
-| function (CRF for NER   |                         |                |
-| and perceptron model    |                         |                |
-| for POS)                |                         |                |
-+-------------------------+-------------------------+----------------+
-| pythainlp.wangchanberta | 9.64 s                  | 9.65 s         |
-| (CPU)                   |                         |                |
-+-------------------------+-------------------------+----------------+
-| pythainlp.wangchanberta | 8.02 s                  | 8 s            |
-| (GPU)                   |                         |                |
-+-------------------------+-------------------------+----------------+
+============================= ======================== ==============
+Function                      Named Entity Recognition Part of Speech
+============================= ======================== ==============
+PyThaiNLP basic function      89.7 ms                  312 ms
+pythainlp.wangchanberta (CPU) 9.64 s                   9.65 s
+pythainlp.wangchanberta (GPU) 8.02 s                   8 s
+============================= ======================== ==============
 
 Notebook:
 
@@ -37,6 +29,7 @@ Notebook:
 Modules
 -------
 .. autoclass:: ThaiNameTagger
+    :members:
 .. autofunction:: pos_tag
 .. autofunction:: segment
 
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
index e59603273..c82bf12c0 100644
--- a/pythainlp/wangchanberta/core.py
+++ b/pythainlp/wangchanberta/core.py
@@ -20,6 +20,17 @@ def __init__(
         dataset_name: str = "thainer",
         grouped_entities: bool = True
     ):
+        """
+        This function tags named entities from text in IOB format.
+
+        Powered by wangchanberta from VISTEC-depa\
+             AI Research Institute of Thailand
+
+        :param str dataset_name:
+            * *thainer* - ThaiNER dataset
+            * *lst20* - LST20 Corpus
+        :param bool grouped_entities: grouped entities
+        """
         self.dataset_name = dataset_name
         self.grouped_entities = grouped_entities
         self.classify_tokens = pipeline(
@@ -30,17 +41,17 @@ def __init__(
             ignore_labels=[],
             grouped_entities=self.grouped_entities)
 
-    def IOB(self, tag):
+    def _IOB(self, tag):
         if tag != "O":
             return "B-"+tag
         return "O"
 
-    def clear_tag(self, tag):
+    def _clear_tag(self, tag):
         return tag.replace('B-', '').replace('I-', '')
 
     def get_ner(
         self, text: str, tag: bool = False
-    ) -> List[Tuple[str, str]]:
+    ) -> Union[List[Tuple[str, str]], str]:
         """
         This function tags named-entitiy from text in IOB format.
 
@@ -62,7 +73,7 @@ def get_ner(
             self.sent_ner = [
                 (
                     i['word'].replace("<_>", " ").replace('▁', ''),
-                    self.IOB(i['entity_group'])
+                    self._IOB(i['entity_group'])
                 ) for i in self.json_ner
             ]
         elif self.dataset_name == "thainer":
@@ -90,7 +101,7 @@ def get_ner(
         for idx, (word, ner) in enumerate(self.sent_ner):
             if idx > 0 and ner.startswith("B-"):
                 if (
-                    self.clear_tag(ner) == self.clear_tag(
+                    self._clear_tag(ner) == self._clear_tag(
                         self.sent_ner[idx-1][1]
                     )
                 ):
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
index cca9e8789..df0e9b7ea 100644
--- a/pythainlp/wangchanberta/postag.py
+++ b/pythainlp/wangchanberta/postag.py
@@ -71,6 +71,16 @@ def tag(
 def pos_tag(
     text: str, corpus: str = "lst20", grouped_word: bool = False
 ) -> List[Tuple[str, str]]:
+    """
+    Marks words with part-of-speech (POS) tags.
+
+    :param str text: Thai text
+    :param str corpus:
+        * *lst20* - an LST20 tagger (default)
+    :param bool grouped_word: grouped word (default is False)
+    :return: a list of tuples (word, POS tag)
+    :rtype: list[tuple[str, str]]
+    """
     global _grouped_word, _postag
     if isinstance(text, list):
         text = ''.join(text)