diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index 8b8b08c14..f5ceb13a3 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -79,3 +79,5 @@ tcc
 etcc
 ++++
 .. automodule:: pythainlp.tokenize.etcc
+
+.. autofunction:: pythainlp.tokenize.etcc.segment
\ No newline at end of file
diff --git a/docs/api/ulmfit.rst b/docs/api/ulmfit.rst
index c6756a6b4..1f9aa002a 100644
--- a/docs/api/ulmfit.rst
+++ b/docs/api/ulmfit.rst
@@ -3,6 +3,8 @@
 pythainlp.ulmfit
 ====================================
 
+Universal Language Model Fine-tuning for Text Classification (ULMFiT).
+
 Modules
 -------
 .. autoclass:: ThaiTokenizer
diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst
new file mode 100644
index 000000000..8e4124177
--- /dev/null
+++ b/docs/api/wangchanberta.rst
@@ -0,0 +1,42 @@
+.. currentmodule:: pythainlp.wangchanberta
+
+pythainlp.wangchanberta
+=======================
+
+WangchanBERTa base model: wangchanberta-base-att-spm-uncased [#Lowphansirikul_2021]_
+
+We use WangchanBERTa for Thai named-entity recognition, part-of-speech tagging, and subword tokenization.
+
+**Speed Benchmark**
+
+============================= ======================== ==============
+Function                      Named Entity Recognition Part of Speech
+============================= ======================== ==============
+PyThaiNLP basic function      89.7 ms                  312 ms
+pythainlp.wangchanberta (CPU) 9.64 s                   9.65 s
+pythainlp.wangchanberta (GPU) 8.02 s                   8 s
+============================= ======================== ==============
+
+Notebooks:
+
+- `PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google
+  Colab`_
+- `pythainlp.wangchanberta GPU`_
+
+.. _PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google Colab: https://colab.research.google.com/drive/1ymTVB1UESXAyZlSpjknCb72xpdcZ86Db?usp=sharing
+.. _pythainlp.wangchanberta GPU: https://colab.research.google.com/drive/1AtkFT1HMGL2GO7O2tM_hi_7mExKwmhMw?usp=sharing
+
+Modules
+-------
+.. autoclass:: ThaiNameTagger
+    :members:
+.. autofunction:: pos_tag
+.. autofunction:: segment
+
+References
+----------
+
+.. [#Lowphansirikul_2021] Lowphansirikul L, Polpanumas C, Jantrakulchai N, Nutanong S.
+    WangchanBERTa: Pretraining transformer-based Thai Language Models.
+    arXiv:2101.09635 [cs] [Internet]. 2021 Jan 23 [cited 2021 Feb 27];
+    Available from: http://arxiv.org/abs/2101.09635
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index da7865a68..97f1a6d70 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -12,11 +12,14 @@ def pos_tag(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger
+        * *wangchanberta* - wangchanberta model (supports the lst20 corpus only \
+            and accepts only a string; if you pass in a list of words, \
+            it will be joined into a single string).
    :param str corpus: the corpus that used to create the language model for tagger
        * *lst20* - `LST20 `_ corpus \
            by National Electronics and Computer Technology Center, Thailand
-        * *lst20_ud* - LST20 text, with tags mapped to Universal POS tags \
+        * *lst20_ud* - LST20 text, with tags mapped to Universal POS tag \
            from `Universal Dependencies `
        * *orchid* - `ORCHID \
            `_ corpus, \
@@ -88,6 +91,9 @@ def pos_tag(
 
     if engine == "perceptron":
         from pythainlp.tag.perceptron import tag as tag_
+    elif engine == "wangchanberta" and corpus == "lst20":
+        from pythainlp.wangchanberta.postag import pos_tag as tag_
+        words = ''.join(words)
     else:  # default, use "unigram" ("old") engine
         from pythainlp.tag.unigram import tag as tag_
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index b615deaac..3fdd66e52 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -301,6 +301,7 @@ def subword_tokenize(
     **Options for engine**
         * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
         * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
+        * *wangchanberta* - SentencePiece subword tokenizer from the wangchanberta model.
 
     :Example:
@@ -320,7 +321,7 @@
        # output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก', 'และ', 'พัฒ','นา', 'กา', 'ร']
 
-    Tokenize text into subword based on *etcc* **(Work In Progress)**::
+    Tokenize text into subword based on *etcc*::
 
        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
        text_2 = "ความแปลกแยกและพัฒนาการ"
 
        subword_tokenize(text_1, engine='etcc')
        # output: ['ยุคเริ่มแรกของ ราชวงศ์หมิง']
 
        subword_tokenize(text_2, engine='etcc')
        # output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']
+
+    Tokenize text into subword based on *wangchanberta*::
+
+        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
+        text_2 = "ความแปลกแยกและพัฒนาการ"
+
+        subword_tokenize(text_1, engine='wangchanberta')
+        # output: ['▁', 'ยุค', 'เริ่มแรก', 'ของ', '▁', 'ราชวงศ์', 'หมิง']
+
+        subword_tokenize(text_2, engine='wangchanberta')
+        # output: ['▁ความ', 'แปลก', 'แยก', 'และ', 'พัฒนาการ']
     """
     if not text or not isinstance(text, str):
         return []
@@ -338,6 +350,8 @@
         from pythainlp.tokenize.tcc import segment
     elif engine == "etcc":
         from pythainlp.tokenize.etcc import segment
+    elif engine == "wangchanberta":
+        from pythainlp.wangchanberta import segment
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
diff --git a/pythainlp/wangchanberta/__init__.py b/pythainlp/wangchanberta/__init__.py
new file mode 100644
index 000000000..bef6eca56
--- /dev/null
+++ b/pythainlp/wangchanberta/__init__.py
@@ -0,0 +1,8 @@
+__all__ = [
+    "ThaiNameTagger",
+    "pos_tag",
+    "segment",
+]
+
+from pythainlp.wangchanberta.core import ThaiNameTagger, segment
+from pythainlp.wangchanberta.postag import pos_tag
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
new file mode 100644
index 000000000..c82bf12c0
--- /dev/null
+++ b/pythainlp/wangchanberta/core.py
@@ -0,0 +1,144 @@
+from typing import Dict, List, Tuple, Union
+import re
+from transformers import (
+    CamembertTokenizer,
+    AutoTokenizer,
+    pipeline,
+)
+
+_model_name = "wangchanberta-base-att-spm-uncased"
+_tokenizer = CamembertTokenizer.from_pretrained(
+    f'airesearch/{_model_name}',
+    revision='main')
+if _model_name == "wangchanberta-base-att-spm-uncased":
+    _tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
+
+
+class ThaiNameTagger:
+    def __init__(
+        self,
+        dataset_name: str = "thainer",
+        grouped_entities: bool = True
+    ):
+        """
+        This class tags named entities in text, in IOB format.
+
+        Powered by wangchanberta from VISTEC-depa\
+            AI Research Institute of Thailand
+
+        :param str dataset_name:
+            * *thainer* - ThaiNER dataset
+            * *lst20* - LST20 Corpus
+        :param bool grouped_entities: if True, group tokens of the same entity into a single entry (default: True)
+        """
+        self.dataset_name = dataset_name
+        self.grouped_entities = grouped_entities
+        self.classify_tokens = pipeline(
+            task='ner',
+            tokenizer=_tokenizer,
+            model=f'airesearch/{_model_name}',
+            revision=f'finetuned@{self.dataset_name}-ner',
+            ignore_labels=[],
+            grouped_entities=self.grouped_entities)
+
+    def _IOB(self, tag):
+        if tag != "O":
+            return "B-"+tag
+        return "O"
+
+    def _clear_tag(self, tag):
+        return tag.replace('B-', '').replace('I-', '')
+
+    def get_ner(
+        self, text: str, tag: bool = False
+    ) -> Union[List[Tuple[str, str]], str]:
+        """
+        This function tags named entities in text, in IOB format.
+
+        Powered by wangchanberta from VISTEC-depa\
+            AI Research Institute of Thailand
+        :param str text: text in Thai to be tagged
+        :param bool tag: if True, return the output as HTML-like tags.
+        :return: a list of tuples associating each tokenized word
+                 (or word group, when `grouped_entities` is `True`)
+                 with its NER tag.
+                 If the parameter `tag` is specified as `True`,
+                 return a string with HTML-like tags instead
+        :rtype: Union[List[Tuple[str, str]], str]
+        """
+        text = re.sub(" ", "<_>", text)
+        self.json_ner = self.classify_tokens(text)
+        self.output = ""
+        if self.grouped_entities and self.dataset_name == "thainer":
+            self.sent_ner = [
+                (
+                    i['word'].replace("<_>", " ").replace('▁', ''),
+                    self._IOB(i['entity_group'])
+                ) for i in self.json_ner
+            ]
+        elif self.dataset_name == "thainer":
+            self.sent_ner = [
+                (
+                    i['word'].replace("<_>", " ").replace('▁', ''), i['entity']
+                ) for i in self.json_ner if i['word'] != '▁'
+            ]
+        elif self.grouped_entities and self.dataset_name == "lst20":
+            self.sent_ner = [
+                (
+                    i['word'].replace("<_>", " ").replace('▁', ''),
+                    i['entity_group'].replace('_', '-').replace('E-', 'I-')
+                ) for i in self.json_ner
+            ]
+        else:
+            self.sent_ner = [
+                (
+                    i['word'].replace("<_>", " ").replace('▁', ''),
+                    i['entity'].replace('_', '-').replace('E-', 'I-')
+                ) for i in self.json_ner
+            ]
+        if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1:
+            self.sent_ner = self.sent_ner[1:]
+        for idx, (word, ner) in enumerate(self.sent_ner):
+            if idx > 0 and ner.startswith("B-"):
+                if (
+                    self._clear_tag(ner) == self._clear_tag(
+                        self.sent_ner[idx-1][1]
+                    )
+                ):
+                    self.sent_ner[idx] = (word, ner.replace('B-', 'I-'))
+        if tag:
+            temp = ""
+            sent = ""
+            for idx, (word, ner) in enumerate(self.sent_ner):
+                if ner.startswith("B-") and temp != "":
+                    sent += "</" + temp + ">"
+                    temp = ner[2:]
+                    sent += "<" + temp + ">"
+                elif ner.startswith("B-"):
+                    temp = ner[2:]
+                    sent += "<" + temp + ">"
+                elif ner == "O" and temp != "":
+                    sent += "</" + temp + ">"
+                    temp = ""
+                sent += word
+
+                if idx == len(self.sent_ner) - 1 and temp != "":
+                    sent += "</" + temp + ">"
+
+            return sent
+        else:
+            return self.sent_ner
+
+
+def segment(text: str) -> List[str]:
+    """
+    Subword tokenization with the SentencePiece tokenizer from the wangchanberta model.
+
+    :param str text: text to be tokenized
+    :return: list of subwords
+    :rtype: list[str]
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    return _tokenizer.tokenize(text)
diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py
new file mode 100644
index 000000000..df0e9b7ea
--- /dev/null
+++ b/pythainlp/wangchanberta/postag.py
@@ -0,0 +1,101 @@
+from typing import Dict, List, Tuple, Union
+import re
+from transformers import (
+    CamembertTokenizer,
+    AutoTokenizer,
+    pipeline,
+)
+
+_model_name = "wangchanberta-base-att-spm-uncased"
+_tokenizer = CamembertTokenizer.from_pretrained(
+    f'airesearch/{_model_name}',
+    revision='main')
+if _model_name == "wangchanberta-base-att-spm-uncased":
+    _tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']
+
+
+class PosTagTransformers:
+    def __init__(
+        self,
+        corpus: str = "lst20",
+        grouped_word: bool = False
+    ) -> None:
+        self.corpus = corpus
+        self.grouped_word = grouped_word
+        self.load()
+
+    def load(self):
+        self.classify_tokens = pipeline(
+            task='ner',
+            tokenizer=_tokenizer,
+            model=f'airesearch/{_model_name}',
+            revision=f'finetuned@{self.corpus}-pos',
+            ignore_labels=[],
+            grouped_entities=self.grouped_word
+        )
+
+    def tag(
+        self, text: str, corpus: str = "lst20", grouped_word: bool = False
+    ) -> List[Tuple[str, str]]:
+        if (
+            corpus != self.corpus and corpus in ['lst20']
+        ) or grouped_word != self.grouped_word:
+            self.grouped_word = grouped_word
+            self.corpus = corpus
+            self.load()
+        text = re.sub(" ", "<_>", text)
+        self.json_pos = self.classify_tokens(text)
+        self.output = ""
+        if grouped_word:
+            self.sent_pos = [
+                (
+                    i['word'].replace("<_>", " "), i['entity_group']
+                ) for i in self.json_pos
+            ]
+        else:
+            self.sent_pos = [
+                (
+                    i['word'].replace("<_>", " ").replace('▁', ''),
+                    i['entity']
+                )
+                for i in self.json_pos if i['word'] != '▁'
+            ]
+        return self.sent_pos
+
+
+_corpus = "lst20"
+_grouped_word = False
+_postag = PosTagTransformers(corpus=_corpus, grouped_word=_grouped_word)
+
+
+def pos_tag(
+    text: str, corpus: str = "lst20", grouped_word: bool = False
+) -> List[Tuple[str, str]]:
+    """
+    Marks words with part-of-speech (POS) tags.
+ + :param str text: thai text + :param str corpus: + * *lst20* - a LST20 tagger (default) + :param bool grouped_word: grouped word (default is False) + :return: a list of tuples (word, POS tag) + :rtype: list[tuple[str, str]] + """ + global _grouped_word, _postag + if isinstance(text, list): + text = ''.join(text) + elif not text or not isinstance(text, str): + return [] + if corpus not in ["lst20"]: + raise NotImplementedError() + if _grouped_word != grouped_word: + _postag = PosTagTransformers( + corpus=corpus, + grouped_word=grouped_word + ) + _grouped_word = grouped_word + return _postag.tag( + text, + corpus=corpus, + grouped_word=grouped_word + ) diff --git a/setup.py b/setup.py index 208fa9f55..cad25696e 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ "sentencepiece>=0.1.91", "torch>=1.0.0", ], + "wangchanberta": ["transformers", "sentencepiece"], "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], "full": [ @@ -70,7 +71,7 @@ "sentencepiece>=0.1.91", "ssg>=0.0.6", "torch>=1.0.0", - "transformers>=4.1.1" + "transformers>=4.1.1", ], } diff --git a/tests/test_tag.py b/tests/test_tag.py index f4ffef759..5999a29ea 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -93,6 +93,15 @@ def test_pos_tag(self): self.assertIsNotNone( pos_tag(tokens, engine="perceptron", corpus="lst20_ud") ) + self.assertEqual( + pos_tag([], engine="wangchanberta", corpus="lst20"), [] + ) + self.assertIsNotNone( + pos_tag(tokens, engine="wangchanberta", corpus="lst20") + ) + self.assertIsNotNone( + pos_tag(tokens, engine="wangchanberta", corpus="lst20_ud") + ) self.assertEqual(pos_tag_sents(None), []) self.assertEqual(pos_tag_sents([]), []) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 7b54def4f..d163238ce 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -286,6 +286,17 @@ def test_subword_tokenize(self): "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc") ) self.assertIsInstance(subword_tokenize("โควิด19", engine="etcc"), list) + self.assertEqual(subword_tokenize(None, engine="wangchanberta"), []) + self.assertEqual(subword_tokenize("", engine="wangchanberta"), []) + self.assertIsInstance( + subword_tokenize("สวัสดิีดาวอังคาร", engine="wangchanberta"), list + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="wangchanberta") + ) + self.assertIsInstance( + subword_tokenize("โควิด19", engine="wangchanberta"), list + ) self.assertFalse( " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) ) diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py new file mode 100644 index 000000000..85a722b0b --- /dev/null +++ b/tests/test_wangchanberta.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +import unittest + +from pythainlp.wangchanberta import ThaiNameTagger, pos_tag, segment + + +class TestWangchanberta(unittest.TestCase): + def test_thainer_wangchanberta(self): + ner = ThaiNameTagger() + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์") + ) + ner = ThaiNameTagger() + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) + ) + self.assertIsNotNone( + ner.get_ner( + "โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", + tag=True + ) + ) + + ner = ThaiNameTagger(grouped_entities=False) + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) + ) + + def test_lst20_ner_wangchanberta(self): + ner = ThaiNameTagger(dataset_name="lst20") + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์") + ) + 
self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) + ) + self.assertIsNotNone( + ner.get_ner( + "โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", + tag=True + ) + ) + + ner = ThaiNameTagger( + dataset_name="lst20", + grouped_entities=False + ) + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) + ) + + def test_segment_wangchanberta(self): + self.assertIsNotNone( + segment("I คิด therefore I am ผ็ฎ์") + ) + self.assertIsNotNone( + segment([]) + ) + + def test_pos_tag_wangchanberta(self): + self.assertIsNotNone( + pos_tag("I คิด therefore I am ผ็ฎ์") + ) + self.assertIsNotNone( + pos_tag( + [ + 'I', + ' ', + 'คิด', + ' ', + 'therefore', + ' ', + 'I', + ' ', + 'am', + ' ', + 'ผ็ฎ์' + ] + ) + ) + self.assertIsNotNone( + pos_tag(None) + ) + self.assertIsNotNone( + pos_tag("I คิด therefore I am ผ็ฎ์", grouped_word=True) + ) + self.assertIsNotNone( + pos_tag("ทดสอบระบบ", grouped_word=False) + )
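
For reference, a minimal usage sketch of the API added in this changeset, assuming PyThaiNLP is installed with the new ``wangchanberta`` extra from ``setup.py`` (``transformers`` and ``sentencepiece``); the tokenizer and models are downloaded from the Hugging Face hub on first use, so a network connection is needed::

    from pythainlp.tag import pos_tag
    from pythainlp.tokenize import subword_tokenize
    from pythainlp.wangchanberta import ThaiNameTagger, segment

    text = "ความแปลกแยกและพัฒนาการ"

    # subword tokenization with the wangchanberta SentencePiece tokenizer
    print(subword_tokenize(text, engine="wangchanberta"))
    print(segment(text))  # same tokenizer, called directly

    # part-of-speech tagging with the LST20-finetuned model;
    # the wangchanberta engine expects a string, not a token list
    print(pos_tag(text, engine="wangchanberta", corpus="lst20"))

    # named-entity recognition; tag=True returns HTML-like tags
    ner = ThaiNameTagger(dataset_name="thainer")
    print(ner.get_ner(text))
    print(ner.get_ner(text, tag=True))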