From 43d2e5503628c0a2ba266351a303b084da81f87c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 16:41:41 +0700 Subject: [PATCH 01/34] Add wangchanberta --- pythainlp/tag/wangchanberta.py | 70 ++++++++++++++++++++++++++++++++++ setup.py | 3 +- 2 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 pythainlp/tag/wangchanberta.py diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py new file mode 100644 index 000000000..0feb32443 --- /dev/null +++ b/pythainlp/tag/wangchanberta.py @@ -0,0 +1,70 @@ +from typing import Dict, List, Tuple, Union +import re +from transformers import ( + CamembertTokenizer, + AutoTokenizer, + pipeline, +) + +class ThaiNameTagger: + def __init__(self, + dataset_name: str = "thainer", + model_name:str = "wangchanberta-base-att-spm-uncased" + ) -> None: + self.model_name = model_name + self.tokenizer = CamembertTokenizer.from_pretrained( + f'airesearch/{self.model_name}', + revision='main') + if self.model_name == "wangchanberta-base-att-spm-uncased": + self.tokenizer.additional_special_tokens = ['NOTUSED', 'NOTUSED', '<_>'] + self.classify_tokens = pipeline( + task='ner', + tokenizer=self.tokenizer, + model = f'airesearch/{self.model_name}', + revision = f'finetuned@{dataset_name}-ner', + ignore_labels=[], + grouped_entities=True + ) + + def get_ner( + self, text: str, tag: bool = False + ) -> List[Tuple[str, str]]: + """ + This function tags named-entitiy from text in IOB format. + + Powered by wangchanberta from VISTEC-depa AI Research Institute of Thailand + :param str text: text in Thai to be tagged + :param bool tag: output like html tag. + :return: a list of tuple associated with tokenized word group, NER tag, + and output like html tag (if the parameter `tag` is + specified as `True`). + Otherwise, return a list of tuple associated with tokenized + word and NER tag + :rtype: Union[list[tuple[str, str]]], str + """ + text = re.sub(" ", "<_>", text) + self.json_ner = self.classify_tokens(text) + self.output = "" + self.sent_ner = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_ner] + if tag: + temp = "" + sent = "" + for idx, (word, ner) in enumerate(self.sent_ner): + if ner.startswith("B-") and temp != "": + sent += "" + temp = ner[2:] + sent += "<" + temp + ">" + elif ner.startswith("B-"): + temp = ner[2:] + sent += "<" + temp + ">" + elif ner == "O" and temp != "": + sent += "" + temp = "" + sent += word + + if idx == len(self.sent_ner) - 1 and temp != "": + sent += "" + + return sent + + return self.sent_ner diff --git a/setup.py b/setup.py index 208fa9f55..cec6f9a8b 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ "sentencepiece>=0.1.91", "torch>=1.0.0", ], + "transformers": ["transformers"], "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], "full": [ @@ -70,7 +71,7 @@ "sentencepiece>=0.1.91", "ssg>=0.0.6", "torch>=1.0.0", - "transformers>=4.1.1" + "transformers>=4.1.1", ], } From 8dd0f21f5530493eabfb7026203342be1e62eb19 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 16:46:19 +0700 Subject: [PATCH 02/34] Update wangchanberta.py --- pythainlp/tag/wangchanberta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py index 0feb32443..00e622578 100644 --- a/pythainlp/tag/wangchanberta.py +++ b/pythainlp/tag/wangchanberta.py @@ -6,11 +6,11 @@ pipeline, ) + class ThaiNameTagger: def __init__(self, dataset_name: str = "thainer", - model_name:str = "wangchanberta-base-att-spm-uncased" - ) -> None: + model_name: str = "wangchanberta-base-att-spm-uncased"): self.model_name = model_name self.tokenizer = CamembertTokenizer.from_pretrained( f'airesearch/{self.model_name}', From fc20b4b8544a2018e47e147be46852f2ce33ab8c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 17:02:57 +0700 Subject: [PATCH 03/34] fixed IOB --- pythainlp/tag/wangchanberta.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py index 00e622578..bed608557 100644 --- a/pythainlp/tag/wangchanberta.py +++ b/pythainlp/tag/wangchanberta.py @@ -25,6 +25,11 @@ def __init__(self, ignore_labels=[], grouped_entities=True ) + + def IOB(self, tag): + if tag != "O": + return "B-"+tag + return "O" def get_ner( self, text: str, tag: bool = False @@ -45,7 +50,7 @@ def get_ner( text = re.sub(" ", "<_>", text) self.json_ner = self.classify_tokens(text) self.output = "" - self.sent_ner = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_ner] + self.sent_ner = [(i['word'].replace("<_>", " "),self.IOB(i['entity_group'])) for i in self.json_ner] if tag: temp = "" sent = "" From 78377251f5fec2ce4b961b37697feafc2395ddd2 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 17:13:48 +0700 Subject: [PATCH 04/34] Update wangchanberta.py fixed grouped_entities --- pythainlp/tag/wangchanberta.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py index bed608557..53cfa8572 100644 --- a/pythainlp/tag/wangchanberta.py +++ b/pythainlp/tag/wangchanberta.py @@ -10,8 +10,11 @@ class ThaiNameTagger: def __init__(self, dataset_name: str = "thainer", - model_name: str = "wangchanberta-base-att-spm-uncased"): + model_name: str = "wangchanberta-base-att-spm-uncased", + grouped_entities: bool = True): self.model_name = model_name + self.dataset_name = dataset_name + self.grouped_entities = grouped_entities self.tokenizer = CamembertTokenizer.from_pretrained( f'airesearch/{self.model_name}', revision='main') @@ -21,9 +24,9 @@ def __init__(self, task='ner', tokenizer=self.tokenizer, model = f'airesearch/{self.model_name}', - revision = f'finetuned@{dataset_name}-ner', + revision = f'finetuned@{self.dataset_name}-ner', ignore_labels=[], - grouped_entities=True + grouped_entities=self.grouped_entities ) def IOB(self, tag): @@ -50,7 +53,10 @@ def get_ner( text = re.sub(" ", "<_>", text) self.json_ner = self.classify_tokens(text) self.output = "" - self.sent_ner = [(i['word'].replace("<_>", " "),self.IOB(i['entity_group'])) for i in self.json_ner] + if self.grouped_entities: + self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'])) for i in self.json_ner] + else: + self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner] if tag: temp = "" sent = "" From f1548f67665d9dfda2576925775449a4c4ef2c70 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 17:15:52 +0700 Subject: [PATCH 05/34] Update wangchanberta.py --- pythainlp/tag/wangchanberta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py index 53cfa8572..bf80154ce 100644 --- a/pythainlp/tag/wangchanberta.py +++ b/pythainlp/tag/wangchanberta.py @@ -56,7 +56,7 @@ def get_ner( if self.grouped_entities: self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'])) for i in self.json_ner] else: - self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner] + self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁'] if tag: temp = "" sent = "" From ca551d2c42b73a9383ba06accf3fad45f43bb8d8 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 19:36:07 +0700 Subject: [PATCH 06/34] Add wangchanberta.PosTagTransformers --- pythainlp/tag/pos_tag.py | 4 +++ pythainlp/tag/wangchanberta.py | 52 ++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index da7865a68..87dd353b9 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -12,6 +12,7 @@ def pos_tag( :param str engine: * *perceptron* - perceptron tagger (default) * *unigram* - unigram tagger + * *wangchanberta* - wangchanberta model (support lst20 corpus only) :param str corpus: the corpus that used to create the language model for tagger * *lst20* - `LST20 `_ corpus \ @@ -88,6 +89,9 @@ def pos_tag( if engine == "perceptron": from pythainlp.tag.perceptron import tag as tag_ + elif engine == "wangchanberta" and corpus == "lst20": + from pythainlp.tag.wangchanberta import wangchanberta_pos_tag as tag_ + words = ''.join(words) else: # default, use "unigram" ("old") engine from pythainlp.tag.unigram import tag as tag_ diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/tag/wangchanberta.py index bf80154ce..87f574084 100644 --- a/pythainlp/tag/wangchanberta.py +++ b/pythainlp/tag/wangchanberta.py @@ -79,3 +79,55 @@ def get_ner( return sent return self.sent_ner + + +class PosTagTransformers: + def __init__(self, + corpus: str = "lst20", + model_name:str = "wangchanberta-base-att-spm-uncased", + grouped_word: bool = False + ) -> None: + self.model_name = model_name + self.corpus = corpus + self.grouped_word = grouped_word + self.tokenizer = CamembertTokenizer.from_pretrained( + f'airesearch/{self.model_name}', + revision='main') + if self.model_name == "wangchanberta-base-att-spm-uncased": + self.tokenizer.additional_special_tokens = ['NOTUSED', 'NOTUSED', '<_>'] + self.load() + + def load(self): + self.classify_tokens = pipeline( + task='ner', + tokenizer=self.tokenizer, + model = f'airesearch/{self.model_name}', + revision = f'finetuned@{self.corpus}-pos', + ignore_labels=[], + grouped_entities=self.grouped_word + ) + + def tag( + self, text: str, corpus: str = False, grouped_word: bool = False + ) -> List[Tuple[str, str]]: + if (corpus != self.corpus and corpus in ['lst20']) or grouped_word != self.grouped_word: + self.grouped_word = grouped_word + self.corpus = corpus + self.load() + text = re.sub(" ", "<_>", text) + self.json_pos = self.classify_tokens(text) + self.output = "" + if grouped_word: + self.sent_pos = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_pos] + else: + self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁'] + return self.sent_pos + + +def wangchanberta_pos_tag( + text: str, corpus: str = "lst20", grouped_word = False +) -> List[Tuple[str, str]]: + if corpus not in ["lst20"]: + raise NotImplementedError() + _tag = PosTagTransformers(corpus=corpus, grouped_word = grouped_word) + return _tag.tag(text) \ No newline at end of file From 266d8f6bbedd4db0157f76c9eeddd9cbd9972a48 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 20:13:03 +0700 Subject: [PATCH 07/34] Move file to pythainlp.wangchanberta --- pythainlp/tag/pos_tag.py | 2 +- pythainlp/tokenize/core.py | 3 + pythainlp/wangchanberta/__init__.py | 8 +++ .../core.py} | 39 ++++------- pythainlp/wangchanberta/postag.py | 65 +++++++++++++++++++ 5 files changed, 90 insertions(+), 27 deletions(-) create mode 100644 pythainlp/wangchanberta/__init__.py rename pythainlp/{tag/wangchanberta.py => wangchanberta/core.py} (73%) create mode 100644 pythainlp/wangchanberta/postag.py diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index 87dd353b9..8ca0eee2f 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -90,7 +90,7 @@ def pos_tag( if engine == "perceptron": from pythainlp.tag.perceptron import tag as tag_ elif engine == "wangchanberta" and corpus == "lst20": - from pythainlp.tag.wangchanberta import wangchanberta_pos_tag as tag_ + import pythainlp.wangchanberta.pos_tag as tag_ words = ''.join(words) else: # default, use "unigram" ("old") engine from pythainlp.tag.unigram import tag as tag_ diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index b615deaac..5304cd8b8 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -301,6 +301,7 @@ def subword_tokenize( **Options for engine** * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) + * *wangchanberta* - SentencePiece from wangchanberta model. :Example: @@ -338,6 +339,8 @@ def subword_tokenize( from pythainlp.tokenize.tcc import segment elif engine == "etcc": from pythainlp.tokenize.etcc import segment + elif engine == "wangchanberta": + from pythainlp.wangchanberta import segment else: raise ValueError( f"""Tokenizer \"{engine}\" not found. diff --git a/pythainlp/wangchanberta/__init__.py b/pythainlp/wangchanberta/__init__.py new file mode 100644 index 000000000..048d2b6e5 --- /dev/null +++ b/pythainlp/wangchanberta/__init__.py @@ -0,0 +1,8 @@ +__all__ = [ + "ThaiNameTagger", + "pos_tag", + "segment", +] + +from pythainlp.wangchanberta.core import ThaiNameTagger, segment +from pythainlp.wangchanberta.postag import pos_tag \ No newline at end of file diff --git a/pythainlp/tag/wangchanberta.py b/pythainlp/wangchanberta/core.py similarity index 73% rename from pythainlp/tag/wangchanberta.py rename to pythainlp/wangchanberta/core.py index 87f574084..512fdf6e8 100644 --- a/pythainlp/tag/wangchanberta.py +++ b/pythainlp/wangchanberta/core.py @@ -6,24 +6,24 @@ pipeline, ) +_model_name = "wangchanberta-base-att-spm-uncased" +_tokenizer = CamembertTokenizer.from_pretrained( + f'airesearch/{_model_name}', + revision='main') +if _model_name == "wangchanberta-base-att-spm-uncased": + _tokenizer.additional_special_tokens = ['NOTUSED', 'NOTUSED', '<_>'] + class ThaiNameTagger: def __init__(self, dataset_name: str = "thainer", - model_name: str = "wangchanberta-base-att-spm-uncased", grouped_entities: bool = True): - self.model_name = model_name self.dataset_name = dataset_name self.grouped_entities = grouped_entities - self.tokenizer = CamembertTokenizer.from_pretrained( - f'airesearch/{self.model_name}', - revision='main') - if self.model_name == "wangchanberta-base-att-spm-uncased": - self.tokenizer.additional_special_tokens = ['NOTUSED', 'NOTUSED', '<_>'] self.classify_tokens = pipeline( task='ner', - tokenizer=self.tokenizer, - model = f'airesearch/{self.model_name}', + tokenizer=_tokenizer, + model = f'airesearch/{_model_name}', revision = f'finetuned@{self.dataset_name}-ner', ignore_labels=[], grouped_entities=self.grouped_entities @@ -84,24 +84,17 @@ def get_ner( class PosTagTransformers: def __init__(self, corpus: str = "lst20", - model_name:str = "wangchanberta-base-att-spm-uncased", grouped_word: bool = False ) -> None: - self.model_name = model_name self.corpus = corpus self.grouped_word = grouped_word - self.tokenizer = CamembertTokenizer.from_pretrained( - f'airesearch/{self.model_name}', - revision='main') - if self.model_name == "wangchanberta-base-att-spm-uncased": - self.tokenizer.additional_special_tokens = ['NOTUSED', 'NOTUSED', '<_>'] self.load() def load(self): self.classify_tokens = pipeline( task='ner', - tokenizer=self.tokenizer, - model = f'airesearch/{self.model_name}', + tokenizer=_tokenizer, + model = f'airesearch/{_model_name}', revision = f'finetuned@{self.corpus}-pos', ignore_labels=[], grouped_entities=self.grouped_word @@ -123,11 +116,5 @@ def tag( self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁'] return self.sent_pos - -def wangchanberta_pos_tag( - text: str, corpus: str = "lst20", grouped_word = False -) -> List[Tuple[str, str]]: - if corpus not in ["lst20"]: - raise NotImplementedError() - _tag = PosTagTransformers(corpus=corpus, grouped_word = grouped_word) - return _tag.tag(text) \ No newline at end of file +def segment(text): + return _tokenizer.tokenize(text) \ No newline at end of file diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py new file mode 100644 index 000000000..6cc01604e --- /dev/null +++ b/pythainlp/wangchanberta/postag.py @@ -0,0 +1,65 @@ +from typing import Dict, List, Tuple, Union +import re +from transformers import ( + CamembertTokenizer, + AutoTokenizer, + pipeline, +) + +_model_name = "wangchanberta-base-att-spm-uncased" +_tokenizer = CamembertTokenizer.from_pretrained( + f'airesearch/{_model_name}', + revision='main') +if _model_name == "wangchanberta-base-att-spm-uncased": + _tokenizer.additional_special_tokens = ['NOTUSED', 'NOTUSED', '<_>'] + + +class PosTagTransformers: + def __init__(self, + corpus: str = "lst20", + grouped_word: bool = False + ) -> None: + self.corpus = corpus + self.grouped_word = grouped_word + self.load() + + def load(self): + self.classify_tokens = pipeline( + task='ner', + tokenizer=_tokenizer, + model = f'airesearch/{_model_name}', + revision = f'finetuned@{self.corpus}-pos', + ignore_labels=[], + grouped_entities=self.grouped_word + ) + + def tag( + self, text: str, corpus: str = "lst20", grouped_word: bool = False + ) -> List[Tuple[str, str]]: + if (corpus != self.corpus and corpus in ['lst20']) or grouped_word != self.grouped_word: + self.grouped_word = grouped_word + self.corpus = corpus + self.load() + text = re.sub(" ", "<_>", text) + self.json_pos = self.classify_tokens(text) + self.output = "" + if grouped_word: + self.sent_pos = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_pos] + else: + self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁'] + return self.sent_pos + +_corpus = "lst20" +_grouped_word = False +_postag = PosTagTransformers(corpus=_corpus, grouped_word = _grouped_word) + +def pos_tag( + text: str, corpus: str = "lst20", grouped_word = False +) -> List[Tuple[str, str]]: + global _grouped_word,_postag + if corpus not in ["lst20"]: + raise NotImplementedError() + if _grouped_word != grouped_word: + _postag = PosTagTransformers(corpus=corpus, grouped_word = grouped_word) + _grouped_word = grouped_word + return _postag.tag(text) \ No newline at end of file From df20fd2f5d0d490c981cde080246bac8366b3187 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 20:25:03 +0700 Subject: [PATCH 08/34] Update wangchanberta requirements --- pythainlp/wangchanberta/__init__.py | 2 +- pythainlp/wangchanberta/core.py | 2 +- pythainlp/wangchanberta/postag.py | 13 ++++++++----- setup.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pythainlp/wangchanberta/__init__.py b/pythainlp/wangchanberta/__init__.py index 048d2b6e5..bef6eca56 100644 --- a/pythainlp/wangchanberta/__init__.py +++ b/pythainlp/wangchanberta/__init__.py @@ -5,4 +5,4 @@ ] from pythainlp.wangchanberta.core import ThaiNameTagger, segment -from pythainlp.wangchanberta.postag import pos_tag \ No newline at end of file +from pythainlp.wangchanberta.postag import pos_tag diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 512fdf6e8..d475fbe2e 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -117,4 +117,4 @@ def tag( return self.sent_pos def segment(text): - return _tokenizer.tokenize(text) \ No newline at end of file + return _tokenizer.tokenize(text) diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py index 6cc01604e..0df0eb038 100644 --- a/pythainlp/wangchanberta/postag.py +++ b/pythainlp/wangchanberta/postag.py @@ -44,7 +44,7 @@ def tag( self.json_pos = self.classify_tokens(text) self.output = "" if grouped_word: - self.sent_pos = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_pos] + self.sent_pos = [(i['word'].replace("<_>", " "), i['entity_group']) for i in self.json_pos] else: self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁'] return self.sent_pos @@ -54,12 +54,15 @@ def tag( _postag = PosTagTransformers(corpus=_corpus, grouped_word = _grouped_word) def pos_tag( - text: str, corpus: str = "lst20", grouped_word = False + text: str, corpus: str = "lst20", grouped_word: bool = False ) -> List[Tuple[str, str]]: - global _grouped_word,_postag + global _grouped_word, _postag if corpus not in ["lst20"]: raise NotImplementedError() if _grouped_word != grouped_word: - _postag = PosTagTransformers(corpus=corpus, grouped_word = grouped_word) + _postag = PosTagTransformers( + corpus=corpus, + grouped_word = grouped_word + ) _grouped_word = grouped_word - return _postag.tag(text) \ No newline at end of file + return _postag.tag(text) diff --git a/setup.py b/setup.py index cec6f9a8b..cad25696e 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "sentencepiece>=0.1.91", "torch>=1.0.0", ], - "transformers": ["transformers"], + "wangchanberta": ["transformers", "sentencepiece"], "mt5": ["transformers>=4.1.1", "sentencepiece>=0.1.91"], "wordnet": ["nltk>=3.3.*"], "full": [ From c94f24160e1d9189f13ea0029cccb815fb68d3da Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 20:30:55 +0700 Subject: [PATCH 09/34] Update postag.py --- pythainlp/wangchanberta/postag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py index 0df0eb038..1a6697ba3 100644 --- a/pythainlp/wangchanberta/postag.py +++ b/pythainlp/wangchanberta/postag.py @@ -65,4 +65,4 @@ def pos_tag( grouped_word = grouped_word ) _grouped_word = grouped_word - return _postag.tag(text) + return _postag.tag(text, corpus = corpus,grouped_word = grouped_word) From e4f7ba1d7bfb69474bb1b58d4a36f230bc31566d Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 20:42:29 +0700 Subject: [PATCH 10/34] Update core.py --- pythainlp/wangchanberta/core.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index d475fbe2e..8a4e70407 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -53,10 +53,15 @@ def get_ner( text = re.sub(" ", "<_>", text) self.json_ner = self.classify_tokens(text) self.output = "" - if self.grouped_entities: + if self.grouped_entities and self.dataset_name == "thainer": self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'])) for i in self.json_ner] - else: + elif self.dataset_name == "thainer": self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁'] + elif self.grouped_entities and self.dataset_name == "lst20": + self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'].replace('_','-').replace('E-','I-'))) for i in self.json_ner] + else: + self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner] + if tag: temp = "" sent = "" From 79f0b833782aa5d9c5c0868a578fb8470d2df8ec Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 20:43:29 +0700 Subject: [PATCH 11/34] Update core.py --- pythainlp/wangchanberta/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 8a4e70407..74c055c43 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -58,7 +58,7 @@ def get_ner( elif self.dataset_name == "thainer": self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁'] elif self.grouped_entities and self.dataset_name == "lst20": - self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'].replace('_','-').replace('E-','I-'))) for i in self.json_ner] + self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner] else: self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner] From ca81865d932d704a508b79ae8ef7f15bb75c8df7 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 20:47:09 +0700 Subject: [PATCH 12/34] Update core.py --- pythainlp/wangchanberta/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 74c055c43..05150bf70 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -60,7 +60,7 @@ def get_ner( elif self.grouped_entities and self.dataset_name == "lst20": self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner] else: - self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner] + self.sent_ner = [(i['word'].replace("<_>", " "), i['entity'].replace('_','-').replace('E-','I-')) for i in self.json_ner] if tag: temp = "" From c4c1c4c524f5d6c6e5709b12f942e519c5fffeeb Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 21:39:54 +0700 Subject: [PATCH 13/34] Add test --- pythainlp/tag/pos_tag.py | 2 +- pythainlp/wangchanberta/core.py | 5 ++++- pythainlp/wangchanberta/postag.py | 4 ++++ tests/test_tag.py | 9 +++++++++ tests/test_tokenize.py | 9 +++++++++ tests/test_wangchanberta.py | 18 ++++++++++++++++++ 6 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 tests/test_wangchanberta.py diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index 8ca0eee2f..c8f1ae99d 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -90,7 +90,7 @@ def pos_tag( if engine == "perceptron": from pythainlp.tag.perceptron import tag as tag_ elif engine == "wangchanberta" and corpus == "lst20": - import pythainlp.wangchanberta.pos_tag as tag_ + from pythainlp.wangchanberta.postag import pos_tag as tag_ words = ''.join(words) else: # default, use "unigram" ("old") engine from pythainlp.tag.unigram import tag as tag_ diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 05150bf70..65c4c150e 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -121,5 +121,8 @@ def tag( self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁'] return self.sent_pos -def segment(text): +def segment(text: str) -> List[str]: + if not text or not isinstance(text, str): + return [] + return _tokenizer.tokenize(text) diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py index 1a6697ba3..d69b45c27 100644 --- a/pythainlp/wangchanberta/postag.py +++ b/pythainlp/wangchanberta/postag.py @@ -57,6 +57,10 @@ def pos_tag( text: str, corpus: str = "lst20", grouped_word: bool = False ) -> List[Tuple[str, str]]: global _grouped_word, _postag + if isinstance(text, list): + text = ''.join(text) + elif not text or not isinstance(text, str): + return [] if corpus not in ["lst20"]: raise NotImplementedError() if _grouped_word != grouped_word: diff --git a/tests/test_tag.py b/tests/test_tag.py index f4ffef759..5999a29ea 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -93,6 +93,15 @@ def test_pos_tag(self): self.assertIsNotNone( pos_tag(tokens, engine="perceptron", corpus="lst20_ud") ) + self.assertEqual( + pos_tag([], engine="wangchanberta", corpus="lst20"), [] + ) + self.assertIsNotNone( + pos_tag(tokens, engine="wangchanberta", corpus="lst20") + ) + self.assertIsNotNone( + pos_tag(tokens, engine="wangchanberta", corpus="lst20_ud") + ) self.assertEqual(pos_tag_sents(None), []) self.assertEqual(pos_tag_sents([]), []) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 7b54def4f..f234d3c20 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -286,6 +286,15 @@ def test_subword_tokenize(self): "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc") ) self.assertIsInstance(subword_tokenize("โควิด19", engine="etcc"), list) + self.assertEqual(subword_tokenize(None, engine="wangchanberta"), []) + self.assertEqual(subword_tokenize("", engine="wangchanberta"), []) + self.assertIsInstance( + subword_tokenize("สวัสดิีดาวอังคาร", engine="wangchanberta"), list + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="wangchanberta") + ) + self.assertIsInstance(subword_tokenize("โควิด19", engine="wangchanberta"), list) self.assertFalse( " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) ) diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py new file mode 100644 index 000000000..21b6eafcd --- /dev/null +++ b/tests/test_wangchanberta.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +import unittest + +from pythainlp.wangchanberta import ThaiNameTagger, pos_tag + + +class TestWangchanberta(unittest.TestCase): + def test_thainer_wangchanberta(self): + ner = ThaiNameTagger() + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์") + ) + def test_lst20_ner_wangchanberta(self): + ner = ThaiNameTagger(dataset_name="lst20") + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์") + ) From a57bd4ac9854670a0a35031ed21ed6b0ccebaccf Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 21:58:50 +0700 Subject: [PATCH 14/34] Update test --- pythainlp/wangchanberta/core.py | 35 --------------------------------- tests/test_wangchanberta.py | 18 ++++++++++++++++- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 65c4c150e..76dd806a0 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -86,41 +86,6 @@ def get_ner( return self.sent_ner -class PosTagTransformers: - def __init__(self, - corpus: str = "lst20", - grouped_word: bool = False - ) -> None: - self.corpus = corpus - self.grouped_word = grouped_word - self.load() - - def load(self): - self.classify_tokens = pipeline( - task='ner', - tokenizer=_tokenizer, - model = f'airesearch/{_model_name}', - revision = f'finetuned@{self.corpus}-pos', - ignore_labels=[], - grouped_entities=self.grouped_word - ) - - def tag( - self, text: str, corpus: str = False, grouped_word: bool = False - ) -> List[Tuple[str, str]]: - if (corpus != self.corpus and corpus in ['lst20']) or grouped_word != self.grouped_word: - self.grouped_word = grouped_word - self.corpus = corpus - self.load() - text = re.sub(" ", "<_>", text) - self.json_pos = self.classify_tokens(text) - self.output = "" - if grouped_word: - self.sent_pos = [(i['word'].replace("<_>", " "),i['entity_group']) for i in self.json_pos] - else: - self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁'] - return self.sent_pos - def segment(text: str) -> List[str]: if not text or not isinstance(text, str): return [] diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py index 21b6eafcd..c67f694cf 100644 --- a/tests/test_wangchanberta.py +++ b/tests/test_wangchanberta.py @@ -2,7 +2,7 @@ import unittest -from pythainlp.wangchanberta import ThaiNameTagger, pos_tag +from pythainlp.wangchanberta import ThaiNameTagger, pos_tag, segment class TestWangchanberta(unittest.TestCase): @@ -11,8 +11,24 @@ def test_thainer_wangchanberta(self): self.assertIsNotNone( ner.get_ner("I คิด therefore I am ผ็ฎ์") ) + ner = ThaiNameTagger() + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) + ) + def test_lst20_ner_wangchanberta(self): ner = ThaiNameTagger(dataset_name="lst20") self.assertIsNotNone( ner.get_ner("I คิด therefore I am ผ็ฎ์") ) + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) + ) + + def test_segment_wangchanberta(self): + self.assertIsNotNone( + segment("I คิด therefore I am ผ็ฎ์") + ) + self.assertIsNotNone( + segment([]) + ) \ No newline at end of file From 68bf0503e2cf9810072fddc7547f6bbe1c9c146a Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 26 Feb 2021 22:55:32 +0700 Subject: [PATCH 15/34] Update test_wangchanberta.py --- tests/test_wangchanberta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py index c67f694cf..bf607dbe1 100644 --- a/tests/test_wangchanberta.py +++ b/tests/test_wangchanberta.py @@ -31,4 +31,4 @@ def test_segment_wangchanberta(self): ) self.assertIsNotNone( segment([]) - ) \ No newline at end of file + ) From 757f9ac1453b78666fa0a12a928f027ef0b23284 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 27 Feb 2021 00:34:44 +0700 Subject: [PATCH 16/34] Add pythainlp.wangchanberta docs --- docs/api/ulmfit.rst | 2 ++ docs/api/wangchanberta.rst | 22 ++++++++++++++++++++++ pythainlp/tokenize/core.py | 13 ++++++++++++- pythainlp/wangchanberta/core.py | 7 +++++++ 4 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 docs/api/wangchanberta.rst diff --git a/docs/api/ulmfit.rst b/docs/api/ulmfit.rst index c6756a6b4..1f9aa002a 100644 --- a/docs/api/ulmfit.rst +++ b/docs/api/ulmfit.rst @@ -3,6 +3,8 @@ pythainlp.ulmfit ==================================== +Universal Language Model Fine-tuning for Text Classification (ULMFiT). + Modules ------- .. autoclass:: ThaiTokenizer diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst new file mode 100644 index 000000000..7f495aba8 --- /dev/null +++ b/docs/api/wangchanberta.rst @@ -0,0 +1,22 @@ +.. currentmodule:: pythainlp.wangchanberta + +pythainlp.wangchanberta +======================= + +WangchanBERTa base model: wangchanberta-base-att-spm-uncased[#Lowphansirikul_2021]_ + +We used WangchanBERTa for Thai name tagger task, part-of-speech and subword tokenizer. + +Modules +------- +.. autoclass:: ThaiNameTagger +.. autofunction:: pos_tag +.. autofunction:: segment + +References +---------- + +.. [#Lowphansirikul_2021] Lowphansirikul L, Polpanumas C, Jantrakulchai N, Nutanong S. + WangchanBERTa: Pretraining transformer-based Thai Language Models. + arXiv:210109635 [cs] [Internet]. 2021 Jan 23 [cited 2021 Feb 27]; + Available from: http://arxiv.org/abs/2101.09635 diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 5304cd8b8..3fdd66e52 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -321,7 +321,7 @@ def subword_tokenize( # output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก', 'และ', 'พัฒ','นา', 'กา', 'ร'] - Tokenize text into subword based on *etcc* **(Work In Progress)**:: + Tokenize text into subword based on *etcc*:: text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" text_2 = "ความแปลกแยกและพัฒนาการ" @@ -331,6 +331,17 @@ def subword_tokenize( subword_tokenize(text_2, engine='etcc') # output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ'] + + Tokenize text into subword based on *wangchanberta*:: + + text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" + text_2 = "ความแปลกแยกและพัฒนาการ" + + subword_tokenize(text_1, engine='wangchanberta') + # output: ['▁', 'ยุค', 'เริ่มแรก', 'ของ', '▁', 'ราชวงศ์', 'หมิง'] + + subword_tokenize(text_2, engine='wangchanberta') + # output: ['▁ความ', 'แปลก', 'แยก', 'และ', 'พัฒนาการ'] """ if not text or not isinstance(text, str): return [] diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 76dd806a0..256bd6cc3 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -87,6 +87,13 @@ def get_ner( def segment(text: str) -> List[str]: + """ + Subword tokenize. SentencePiece from wangchanberta model. + + :param str text: text to be tokenized + :return: list of subwords + :rtype: list[str] + """ if not text or not isinstance(text, str): return [] From 628cf50ccf1e6ae47c72b5b438c036823a198f9f Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 27 Feb 2021 14:45:30 +0700 Subject: [PATCH 17/34] Update tokenize.rst --- docs/api/tokenize.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index 8b8b08c14..f5ceb13a3 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -79,3 +79,5 @@ tcc etcc ++++ .. automodule:: pythainlp.tokenize.etcc + +.. autofunction:: pythainlp.tokenize.etcc.segment \ No newline at end of file From 8abe1b4caa46da4f80640039b09566db1a64ed07 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 1 Mar 2021 12:30:10 +0700 Subject: [PATCH 18/34] Fixed PEP8 --- pythainlp/wangchanberta/core.py | 7 +++---- pythainlp/wangchanberta/postag.py | 32 +++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 256bd6cc3..4f737ee6e 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -25,7 +25,7 @@ def __init__(self, tokenizer=_tokenizer, model = f'airesearch/{_model_name}', revision = f'finetuned@{self.dataset_name}-ner', - ignore_labels=[], + ignore_labels=[], grouped_entities=self.grouped_entities ) @@ -58,9 +58,9 @@ def get_ner( elif self.dataset_name == "thainer": self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁'] elif self.grouped_entities and self.dataset_name == "lst20": - self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_','-').replace('E-','I-')) for i in self.json_ner] + self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_', '-').replace('E-', 'I-')) for i in self.json_ner] else: - self.sent_ner = [(i['word'].replace("<_>", " "), i['entity'].replace('_','-').replace('E-','I-')) for i in self.json_ner] + self.sent_ner = [(i['word'].replace("<_>", " "), i['entity'].replace('_', '-').replace('E-', 'I-')) for i in self.json_ner] if tag: temp = "" @@ -82,7 +82,6 @@ def get_ner( sent += "" return sent - return self.sent_ner diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py index d69b45c27..6705a9417 100644 --- a/pythainlp/wangchanberta/postag.py +++ b/pythainlp/wangchanberta/postag.py @@ -17,8 +17,7 @@ class PosTagTransformers: def __init__(self, corpus: str = "lst20", - grouped_word: bool = False - ) -> None: + grouped_word: bool = False) -> None: self.corpus = corpus self.grouped_word = grouped_word self.load() @@ -27,9 +26,9 @@ def load(self): self.classify_tokens = pipeline( task='ner', tokenizer=_tokenizer, - model = f'airesearch/{_model_name}', - revision = f'finetuned@{self.corpus}-pos', - ignore_labels=[], + model=f'airesearch/{_model_name}', + revision=f'finetuned@{self.corpus}-pos', + ignore_labels=[], grouped_entities=self.grouped_word ) @@ -44,14 +43,23 @@ def tag( self.json_pos = self.classify_tokens(text) self.output = "" if grouped_word: - self.sent_pos = [(i['word'].replace("<_>", " "), i['entity_group']) for i in self.json_pos] + self.sent_pos = [ + (i['word'].replace("<_>", " "), + i['entity_group']) for i in self.json_pos + ] else: - self.sent_pos = [(i['word'].replace("<_>", " ").replace('▁',''), i['entity']) for i in self.json_pos if i['word'] != '▁'] + self.sent_pos = [ + (i['word'].replace("<_>", " ").replace('▁',''), + i['entity']) + for i in self.json_pos if i['word'] != '▁' + ] return self.sent_pos + _corpus = "lst20" _grouped_word = False -_postag = PosTagTransformers(corpus=_corpus, grouped_word = _grouped_word) +_postag = PosTagTransformers(corpus=_corpus, grouped_word=_grouped_word) + def pos_tag( text: str, corpus: str = "lst20", grouped_word: bool = False @@ -66,7 +74,11 @@ def pos_tag( if _grouped_word != grouped_word: _postag = PosTagTransformers( corpus=corpus, - grouped_word = grouped_word + grouped_word=grouped_word ) _grouped_word = grouped_word - return _postag.tag(text, corpus = corpus,grouped_word = grouped_word) + return _postag.tag( + text, + corpus=corpus, + grouped_word=grouped_word + ) From 8d1cbdbaddeb5a4675e2167ad7e81fad9f15b436 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 1 Mar 2021 23:08:56 +0700 Subject: [PATCH 19/34] Update test_wangchanberta.py --- tests/test_wangchanberta.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py index bf607dbe1..05f78ca02 100644 --- a/tests/test_wangchanberta.py +++ b/tests/test_wangchanberta.py @@ -16,6 +16,11 @@ def test_thainer_wangchanberta(self): ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) ) + ner = ThaiNameTagger(grouped_entities=False) + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) + ) + def test_lst20_ner_wangchanberta(self): ner = ThaiNameTagger(dataset_name="lst20") self.assertIsNotNone( @@ -25,6 +30,14 @@ def test_lst20_ner_wangchanberta(self): ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) ) + ner = ThaiNameTagger( + dataset_name="lst20", + grouped_entities=False + ) + self.assertIsNotNone( + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) + ) + def test_segment_wangchanberta(self): self.assertIsNotNone( segment("I คิด therefore I am ผ็ฎ์") @@ -32,3 +45,20 @@ def test_segment_wangchanberta(self): self.assertIsNotNone( segment([]) ) + + def test_pos_tag_wangchanberta(self): + self.assertIsNotNone( + pos_tag("I คิด therefore I am ผ็ฎ์") + ) + self.assertIsNotNone( + pos_tag(['I',' ','คิด',' ','therefore',' ','I',' ','am',' ','ผ็ฎ์']) + ) + self.assertIsNotNone( + pos_tag(None) + ) + self.assertIsNotNone( + pos_tag("I คิด therefore I am ผ็ฎ์",grouped_word=True) + ) + self.assertIsNotNone( + pos_tag("ทดสอบระบบ",grouped_word=False) + ) From 4827c7ddfae4b8915f8dd8c96131a6c9c17dfbb5 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 1 Mar 2021 23:12:23 +0700 Subject: [PATCH 20/34] Update tests --- tests/test_tokenize.py | 4 +++- tests/test_wangchanberta.py | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index f234d3c20..d163238ce 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -294,7 +294,9 @@ def test_subword_tokenize(self): self.assertFalse( "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="wangchanberta") ) - self.assertIsInstance(subword_tokenize("โควิด19", engine="wangchanberta"), list) + self.assertIsInstance( + subword_tokenize("โควิด19", engine="wangchanberta"), list + ) self.assertFalse( " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) ) diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py index 05f78ca02..c7e2f1609 100644 --- a/tests/test_wangchanberta.py +++ b/tests/test_wangchanberta.py @@ -51,7 +51,21 @@ def test_pos_tag_wangchanberta(self): pos_tag("I คิด therefore I am ผ็ฎ์") ) self.assertIsNotNone( - pos_tag(['I',' ','คิด',' ','therefore',' ','I',' ','am',' ','ผ็ฎ์']) + pos_tag( + [ + 'I', + ' ', + 'คิด', + ' ', + 'therefore', + ' ', + 'I', + ' ', + 'am', + ' ', + 'ผ็ฎ์' + ] + ) ) self.assertIsNotNone( pos_tag(None) From 1e4a0d4d8ed7dcf71acd7a21b4ae5454a367ac12 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 5 Mar 2021 17:03:18 +0700 Subject: [PATCH 21/34] Fixed PEP8 --- pythainlp/wangchanberta/core.py | 2 +- pythainlp/wangchanberta/postag.py | 6 ++++-- tests/test_wangchanberta.py | 12 ++++++------ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 4f737ee6e..6040df631 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -28,7 +28,7 @@ def __init__(self, ignore_labels=[], grouped_entities=self.grouped_entities ) - + def IOB(self, tag): if tag != "O": return "B-"+tag diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py index 6705a9417..7153c61b2 100644 --- a/pythainlp/wangchanberta/postag.py +++ b/pythainlp/wangchanberta/postag.py @@ -49,8 +49,10 @@ def tag( ] else: self.sent_pos = [ - (i['word'].replace("<_>", " ").replace('▁',''), - i['entity']) + ( + i['word'].replace("<_>", " ").replace('▁', ''), + i['entity'] + ) for i in self.json_pos if i['word'] != '▁' ] return self.sent_pos diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py index c7e2f1609..e26138572 100644 --- a/tests/test_wangchanberta.py +++ b/tests/test_wangchanberta.py @@ -13,12 +13,12 @@ def test_thainer_wangchanberta(self): ) ner = ThaiNameTagger() self.assertIsNotNone( - ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) ) ner = ThaiNameTagger(grouped_entities=False) self.assertIsNotNone( - ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) ) def test_lst20_ner_wangchanberta(self): @@ -27,7 +27,7 @@ def test_lst20_ner_wangchanberta(self): ner.get_ner("I คิด therefore I am ผ็ฎ์") ) self.assertIsNotNone( - ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) ) ner = ThaiNameTagger( @@ -35,7 +35,7 @@ def test_lst20_ner_wangchanberta(self): grouped_entities=False ) self.assertIsNotNone( - ner.get_ner("I คิด therefore I am ผ็ฎ์", tag = True) + ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) ) def test_segment_wangchanberta(self): @@ -71,8 +71,8 @@ def test_pos_tag_wangchanberta(self): pos_tag(None) ) self.assertIsNotNone( - pos_tag("I คิด therefore I am ผ็ฎ์",grouped_word=True) + pos_tag("I คิด therefore I am ผ็ฎ์", grouped_word=True) ) self.assertIsNotNone( - pos_tag("ทดสอบระบบ",grouped_word=False) + pos_tag("ทดสอบระบบ", grouped_word=False) ) From 8de00f77ab6941eacc29e92766d3f732ed691125 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 5 Mar 2021 17:13:08 +0700 Subject: [PATCH 22/34] Fixed PEP8 --- pythainlp/wangchanberta/core.py | 44 ++++++++++++++++++++++--------- pythainlp/wangchanberta/postag.py | 17 +++++++----- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 6040df631..5f14b957d 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -15,19 +15,20 @@ class ThaiNameTagger: - def __init__(self, - dataset_name: str = "thainer", - grouped_entities: bool = True): + def __init__( + self, + dataset_name: str = "thainer", + grouped_entities: bool = True + ): self.dataset_name = dataset_name self.grouped_entities = grouped_entities self.classify_tokens = pipeline( task='ner', tokenizer=_tokenizer, - model = f'airesearch/{_model_name}', - revision = f'finetuned@{self.dataset_name}-ner', + model=f'airesearch/{_model_name}', + revision=f'finetuned@{self.dataset_name}-ner', ignore_labels=[], - grouped_entities=self.grouped_entities - ) + grouped_entities=self.grouped_entities) def IOB(self, tag): if tag != "O": @@ -40,7 +41,8 @@ def get_ner( """ This function tags named-entitiy from text in IOB format. - Powered by wangchanberta from VISTEC-depa AI Research Institute of Thailand + Powered by wangchanberta from VISTEC-depa\ + AI Research Institute of Thailand :param str text: text in Thai to be tagged :param bool tag: output like html tag. :return: a list of tuple associated with tokenized word group, NER tag, @@ -54,13 +56,31 @@ def get_ner( self.json_ner = self.classify_tokens(text) self.output = "" if self.grouped_entities and self.dataset_name == "thainer": - self.sent_ner = [(i['word'].replace("<_>", " "), self.IOB(i['entity_group'])) for i in self.json_ner] + self.sent_ner = [ + ( + i['word'].replace("<_>", " "), self.IOB(i['entity_group']) + ) for i in self.json_ner + ] elif self.dataset_name == "thainer": - self.sent_ner = [(i['word'].replace("<_>", " "), i['entity']) for i in self.json_ner if i['word'] != '▁'] + self.sent_ner = [ + ( + i['word'].replace("<_>", " "), i['entity'] + ) for i in self.json_ner if i['word'] != '▁' + ] elif self.grouped_entities and self.dataset_name == "lst20": - self.sent_ner = [(i['word'].replace("<_>", " "), i['entity_group'].replace('_', '-').replace('E-', 'I-')) for i in self.json_ner] + self.sent_ner = [ + ( + i['word'].replace("<_>", " "), + i['entity_group'].replace('_', '-').replace('E-', 'I-') + ) for i in self.json_ner + ] else: - self.sent_ner = [(i['word'].replace("<_>", " "), i['entity'].replace('_', '-').replace('E-', 'I-')) for i in self.json_ner] + self.sent_ner = [ + ( + i['word'].replace("<_>", " "), + i['entity'].replace('_', '-').replace('E-', 'I-') + ) for i in self.json_ner + ] if tag: temp = "" diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py index 7153c61b2..cca9e8789 100644 --- a/pythainlp/wangchanberta/postag.py +++ b/pythainlp/wangchanberta/postag.py @@ -15,9 +15,11 @@ class PosTagTransformers: - def __init__(self, - corpus: str = "lst20", - grouped_word: bool = False) -> None: + def __init__( + self, + corpus: str = "lst20", + grouped_word: bool = False + ) -> None: self.corpus = corpus self.grouped_word = grouped_word self.load() @@ -35,7 +37,9 @@ def load(self): def tag( self, text: str, corpus: str = "lst20", grouped_word: bool = False ) -> List[Tuple[str, str]]: - if (corpus != self.corpus and corpus in ['lst20']) or grouped_word != self.grouped_word: + if ( + corpus != self.corpus and corpus in ['lst20'] + ) or grouped_word != self.grouped_word: self.grouped_word = grouped_word self.corpus = corpus self.load() @@ -44,8 +48,9 @@ def tag( self.output = "" if grouped_word: self.sent_pos = [ - (i['word'].replace("<_>", " "), - i['entity_group']) for i in self.json_pos + ( + i['word'].replace("<_>", " "), i['entity_group'] + ) for i in self.json_pos ] else: self.sent_pos = [ From f1b0a0e2ab7ee51a05401ecfd913b8af3db7388a Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 6 Mar 2021 15:48:24 +0700 Subject: [PATCH 23/34] Update core.py --- pythainlp/wangchanberta/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 5f14b957d..fb2d8ac30 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -58,26 +58,26 @@ def get_ner( if self.grouped_entities and self.dataset_name == "thainer": self.sent_ner = [ ( - i['word'].replace("<_>", " "), self.IOB(i['entity_group']) + i['word'].replace("<_>", " ").replace('▁', ''), self.IOB(i['entity_group']) ) for i in self.json_ner ] elif self.dataset_name == "thainer": self.sent_ner = [ ( - i['word'].replace("<_>", " "), i['entity'] + i['word'].replace("<_>", " ").replace('▁', ''), i['entity'] ) for i in self.json_ner if i['word'] != '▁' ] elif self.grouped_entities and self.dataset_name == "lst20": self.sent_ner = [ ( - i['word'].replace("<_>", " "), + i['word'].replace("<_>", " ").replace('▁', ''), i['entity_group'].replace('_', '-').replace('E-', 'I-') ) for i in self.json_ner ] else: self.sent_ner = [ ( - i['word'].replace("<_>", " "), + i['word'].replace("<_>", " ").replace('▁', ''), i['entity'].replace('_', '-').replace('E-', 'I-') ) for i in self.json_ner ] From f8a0efa3b6f048c3e9987fb1aabfcbfcfc93cf27 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 6 Mar 2021 15:49:04 +0700 Subject: [PATCH 24/34] Update core.py --- pythainlp/wangchanberta/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index fb2d8ac30..e3a075028 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -58,7 +58,8 @@ def get_ner( if self.grouped_entities and self.dataset_name == "thainer": self.sent_ner = [ ( - i['word'].replace("<_>", " ").replace('▁', ''), self.IOB(i['entity_group']) + i['word'].replace("<_>", " ").replace('▁', ''), + self.IOB(i['entity_group']) ) for i in self.json_ner ] elif self.dataset_name == "thainer": From f5ae3ad9c3779894cecd8b2da480c5701bf646cf Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 9 Mar 2021 19:34:53 +0700 Subject: [PATCH 25/34] Update core --- pythainlp/wangchanberta/core.py | 5 ++++- tests/test_wangchanberta.py | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index e3a075028..8dd39de1e 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -103,7 +103,10 @@ def get_ner( sent += "" return sent - return self.sent_ner + if self.sent_ner[0][0] == '' and len(self.sent_ner)>1: + return self.sent_ner[1:] + else: + return self.sent_ner def segment(text: str) -> List[str]: diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py index e26138572..341c2119e 100644 --- a/tests/test_wangchanberta.py +++ b/tests/test_wangchanberta.py @@ -15,6 +15,9 @@ def test_thainer_wangchanberta(self): self.assertIsNotNone( ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) ) + self.assertIsNotNone( + ner.get_ner("โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", tag=True) + ) ner = ThaiNameTagger(grouped_entities=False) self.assertIsNotNone( @@ -29,6 +32,9 @@ def test_lst20_ner_wangchanberta(self): self.assertIsNotNone( ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) ) + self.assertIsNotNone( + ner.get_ner("โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", tag=True) + ) ner = ThaiNameTagger( dataset_name="lst20", From 00b27532e55db3ef98106cd57446d93dbf4f10c4 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 11 Mar 2021 15:06:23 +0700 Subject: [PATCH 26/34] Fixed PEP8 --- pythainlp/wangchanberta/core.py | 2 +- tests/test_wangchanberta.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 8dd39de1e..e85d842fd 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -103,7 +103,7 @@ def get_ner( sent += "" return sent - if self.sent_ner[0][0] == '' and len(self.sent_ner)>1: + if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1: return self.sent_ner[1:] else: return self.sent_ner diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py index 341c2119e..85a722b0b 100644 --- a/tests/test_wangchanberta.py +++ b/tests/test_wangchanberta.py @@ -16,7 +16,10 @@ def test_thainer_wangchanberta(self): ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) ) self.assertIsNotNone( - ner.get_ner("โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", tag=True) + ner.get_ner( + "โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", + tag=True + ) ) ner = ThaiNameTagger(grouped_entities=False) @@ -33,7 +36,10 @@ def test_lst20_ner_wangchanberta(self): ner.get_ner("I คิด therefore I am ผ็ฎ์", tag=True) ) self.assertIsNotNone( - ner.get_ner("โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", tag=True) + ner.get_ner( + "โรงเรียนสวนกุหลาบเป็นโรงเรียนที่ดี แต่ไม่มีสวนกุหลาบ", + tag=True + ) ) ner = ThaiNameTagger( From c637c8a07582317ca39b8e06b2647a38cde83c08 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 11 Mar 2021 16:06:33 +0700 Subject: [PATCH 27/34] Update core.py --- pythainlp/wangchanberta/core.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index e85d842fd..3be87b537 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -82,7 +82,12 @@ def get_ner( i['entity'].replace('_', '-').replace('E-', 'I-') ) for i in self.json_ner ] - + if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1: + self.sent_ner = self.sent_ner[1:] + for idx, (word, ner) in enumerate(self.sent_ner): + if idx > 0 and ner.startswith("B-"): + if ner.replace('B-', '') == self.sent_ner[idx-1][1].replace('B-', '').replace('I-', ''): + self.sent_ner[idx] = (word,ner.replace('B-', 'I-')) if tag: temp = "" sent = "" @@ -103,8 +108,7 @@ def get_ner( sent += "" return sent - if self.sent_ner[0][0] == '' and len(self.sent_ner) > 1: - return self.sent_ner[1:] + else: return self.sent_ner From f8d438a3112d4ed64013b9e68c2fabeff1f20214 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 11 Mar 2021 16:13:02 +0700 Subject: [PATCH 28/34] Update core.py --- pythainlp/wangchanberta/core.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 3be87b537..df74c1803 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -35,6 +35,9 @@ def IOB(self, tag): return "B-"+tag return "O" + def clear_tag(self, tag): + return tag.replace('B-', '').replace('I-', '') + def get_ner( self, text: str, tag: bool = False ) -> List[Tuple[str, str]]: @@ -86,7 +89,9 @@ def get_ner( self.sent_ner = self.sent_ner[1:] for idx, (word, ner) in enumerate(self.sent_ner): if idx > 0 and ner.startswith("B-"): - if ner.replace('B-', '') == self.sent_ner[idx-1][1].replace('B-', '').replace('I-', ''): + if ( + self.clear_tag(ner) == self.clear_tag(self.sent_ner[idx-1][1]) + ): self.sent_ner[idx] = (word,ner.replace('B-', 'I-')) if tag: temp = "" @@ -108,7 +113,6 @@ def get_ner( sent += "" return sent - else: return self.sent_ner From 87b6119e172d3c33d476a1de59ce93f0b792317c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 11 Mar 2021 16:14:24 +0700 Subject: [PATCH 29/34] Update core.py --- pythainlp/wangchanberta/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index df74c1803..e59603273 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -90,9 +90,11 @@ def get_ner( for idx, (word, ner) in enumerate(self.sent_ner): if idx > 0 and ner.startswith("B-"): if ( - self.clear_tag(ner) == self.clear_tag(self.sent_ner[idx-1][1]) + self.clear_tag(ner) == self.clear_tag( + self.sent_ner[idx-1][1] + ) ): - self.sent_ner[idx] = (word,ner.replace('B-', 'I-')) + self.sent_ner[idx] = (word, ner.replace('B-', 'I-')) if tag: temp = "" sent = "" From 9e04a18708632b407508d285735fb4d7b3ed6463 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 11 Mar 2021 19:14:56 +0700 Subject: [PATCH 30/34] Update wangchanberta.rst --- docs/api/wangchanberta.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst index 7f495aba8..37080421c 100644 --- a/docs/api/wangchanberta.rst +++ b/docs/api/wangchanberta.rst @@ -3,7 +3,7 @@ pythainlp.wangchanberta ======================= -WangchanBERTa base model: wangchanberta-base-att-spm-uncased[#Lowphansirikul_2021]_ +WangchanBERTa base model: wangchanberta-base-att-spm-uncased [#Lowphansirikul_2021]_ We used WangchanBERTa for Thai name tagger task, part-of-speech and subword tokenizer. From 34a034ae8f4590cb3750c5eefd557803898f5f4d Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 13 Mar 2021 20:12:54 +0700 Subject: [PATCH 31/34] Update pos_tag docs --- pythainlp/tag/pos_tag.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index c8f1ae99d..a51ecb3ae 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -12,12 +12,14 @@ def pos_tag( :param str engine: * *perceptron* - perceptron tagger (default) * *unigram* - unigram tagger - * *wangchanberta* - wangchanberta model (support lst20 corpus only) + * *wangchanberta* - wangchanberta model (support lst20 corpus only \ + and it supports a string only. if you input a list of word, it will \ + convert list word to a string. :param str corpus: the corpus that used to create the language model for tagger * *lst20* - `LST20 `_ corpus \ by National Electronics and Computer Technology Center, Thailand - * *lst20_ud* - LST20 text, with tags mapped to Universal POS tags \ + * *lst20_ud* - LST20 text, with tags mapped to Universal POS tag \ from `Universal Dependencies ` * *orchid* - `ORCHID \ `_ corpus, \ From 5bbbebe4172c02a4f03704bd23c88511f28553ca Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 13 Mar 2021 22:18:46 +0700 Subject: [PATCH 32/34] Update pos_tag.py --- pythainlp/tag/pos_tag.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index a51ecb3ae..97f1a6d70 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -13,8 +13,8 @@ def pos_tag( * *perceptron* - perceptron tagger (default) * *unigram* - unigram tagger * *wangchanberta* - wangchanberta model (support lst20 corpus only \ - and it supports a string only. if you input a list of word, it will \ - convert list word to a string. + and it supports a string only. if you input a list of word, \ + it will convert list word to a string. :param str corpus: the corpus that used to create the language model for tagger * *lst20* - `LST20 `_ corpus \ From e4cf8dac124074d6986cccf04c46e47ddaa05aa5 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 15 Mar 2021 22:12:24 +0700 Subject: [PATCH 33/34] Add pythainlp.wangchanberta Speed Benchmark --- docs/api/wangchanberta.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst index 37080421c..74eb18b1d 100644 --- a/docs/api/wangchanberta.rst +++ b/docs/api/wangchanberta.rst @@ -7,6 +7,33 @@ WangchanBERTa base model: wangchanberta-base-att-spm-uncased [#Lowphansirikul_20 We used WangchanBERTa for Thai name tagger task, part-of-speech and subword tokenizer. +**Speed Benchmark** + ++-------------------------+-------------------------+----------------+ +| Function | Named Entity | Part of Speech | +| | Recognition | | ++=========================+=========================+================+ +| PyThaiNLP basic | 89.7 ms | 312 ms | +| function (CRF for NER | | | +| and perceptron model | | | +| for POS) | | | ++-------------------------+-------------------------+----------------+ +| pythainlp.wangchanberta | 9.64 s | 9.65 s | +| (CPU) | | | ++-------------------------+-------------------------+----------------+ +| pythainlp.wangchanberta | 8.02 s | 8 s | +| (GPU) | | | ++-------------------------+-------------------------+----------------+ + +Notebook: + +- `PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google + Colab`_ +- `pythainlp.wangchanberta GPU`_ + +.. _PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google Colab: https://colab.research.google.com/drive/1ymTVB1UESXAyZlSpjknCb72xpdcZ86Db?usp=sharing +.. _pythainlp.wangchanberta GPU: https://colab.research.google.com/drive/1AtkFT1HMGL2GO7O2tM_hi_7mExKwmhMw?usp=sharing + Modules ------- .. autoclass:: ThaiNameTagger From ff6d300379d79bb92d2d8a7a58303cf110106a95 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 15 Mar 2021 22:55:54 +0700 Subject: [PATCH 34/34] Update docs --- docs/api/wangchanberta.rst | 23 ++++++++--------------- pythainlp/wangchanberta/core.py | 21 ++++++++++++++++----- pythainlp/wangchanberta/postag.py | 10 ++++++++++ 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst index 74eb18b1d..8e4124177 100644 --- a/docs/api/wangchanberta.rst +++ b/docs/api/wangchanberta.rst @@ -9,21 +9,13 @@ We used WangchanBERTa for Thai name tagger task, part-of-speech and subword toke **Speed Benchmark** -+-------------------------+-------------------------+----------------+ -| Function | Named Entity | Part of Speech | -| | Recognition | | -+=========================+=========================+================+ -| PyThaiNLP basic | 89.7 ms | 312 ms | -| function (CRF for NER | | | -| and perceptron model | | | -| for POS) | | | -+-------------------------+-------------------------+----------------+ -| pythainlp.wangchanberta | 9.64 s | 9.65 s | -| (CPU) | | | -+-------------------------+-------------------------+----------------+ -| pythainlp.wangchanberta | 8.02 s | 8 s | -| (GPU) | | | -+-------------------------+-------------------------+----------------+ +============================= ======================== ============== +Function Named Entity Recognition Part of Speech +============================= ======================== ============== +PyThaiNLP basic function 89.7 ms 312 ms +pythainlp.wangchanberta (CPU) 9.64 s 9.65 s +pythainlp.wangchanberta (GPU) 8.02 s 8 s +============================= ======================== ============== Notebook: @@ -37,6 +29,7 @@ Notebook: Modules ------- .. autoclass:: ThaiNameTagger + :members: .. autofunction:: pos_tag .. autofunction:: segment diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index e59603273..c82bf12c0 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -20,6 +20,17 @@ def __init__( dataset_name: str = "thainer", grouped_entities: bool = True ): + """ + This function tags named-entitiy from text in IOB format. + + Powered by wangchanberta from VISTEC-depa\ + AI Research Institute of Thailand + + :param str dataset_name: + * *thainer* - ThaiNER dataset + * *lst20* - LST20 Corpus + :param bool grouped_entities: grouped entities + """ self.dataset_name = dataset_name self.grouped_entities = grouped_entities self.classify_tokens = pipeline( @@ -30,17 +41,17 @@ def __init__( ignore_labels=[], grouped_entities=self.grouped_entities) - def IOB(self, tag): + def _IOB(self, tag): if tag != "O": return "B-"+tag return "O" - def clear_tag(self, tag): + def _clear_tag(self, tag): return tag.replace('B-', '').replace('I-', '') def get_ner( self, text: str, tag: bool = False - ) -> List[Tuple[str, str]]: + ) -> Union[List[Tuple[str, str]], str]: """ This function tags named-entitiy from text in IOB format. @@ -62,7 +73,7 @@ def get_ner( self.sent_ner = [ ( i['word'].replace("<_>", " ").replace('▁', ''), - self.IOB(i['entity_group']) + self._IOB(i['entity_group']) ) for i in self.json_ner ] elif self.dataset_name == "thainer": @@ -90,7 +101,7 @@ def get_ner( for idx, (word, ner) in enumerate(self.sent_ner): if idx > 0 and ner.startswith("B-"): if ( - self.clear_tag(ner) == self.clear_tag( + self._clear_tag(ner) == self._clear_tag( self.sent_ner[idx-1][1] ) ): diff --git a/pythainlp/wangchanberta/postag.py b/pythainlp/wangchanberta/postag.py index cca9e8789..df0e9b7ea 100644 --- a/pythainlp/wangchanberta/postag.py +++ b/pythainlp/wangchanberta/postag.py @@ -71,6 +71,16 @@ def tag( def pos_tag( text: str, corpus: str = "lst20", grouped_word: bool = False ) -> List[Tuple[str, str]]: + """ + Marks words with part-of-speech (POS) tags. + + :param str text: thai text + :param str corpus: + * *lst20* - a LST20 tagger (default) + :param bool grouped_word: grouped word (default is False) + :return: a list of tuples (word, POS tag) + :rtype: list[tuple[str, str]] + """ global _grouped_word, _postag if isinstance(text, list): text = ''.join(text)