diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
index 93d57e3c0..790810a62 100644
--- a/pythainlp/tag/__init__.py
+++ b/pythainlp/tag/__init__.py
@@ -27,10 +27,11 @@
     "chunk_parse",
     "NER",
     "NNER",
+    "pos_tag_transformers",
 ]
 
 from pythainlp.tag.locations import tag_provinces
-from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
+from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
 from pythainlp.tag._tag_perceptron import PerceptronTagger
 from pythainlp.tag.chunk import chunk_parse
 from pythainlp.tag.named_entity import NER, NNER
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index c2e9bbf7f..abdfe5fc2 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -176,3 +176,73 @@ def pos_tag_sents(
         return []
 
     return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
+
+
+# Hugging Face Hub repository backing each supported ``engine`` name.
+_POS_TRANSFORMERS_MODELS = {
+    "wangchanberta-ud-thai-pud-upos": (
+        "Pavarissy/wangchanberta-ud-thai-pud-upos"
+    ),
+    "mdeberta-v3-ud-thai-pud-upos": "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
+    "bert-base-th-cased-blackboard": "lunarlist/pos_thai",
+}
+
+# Pipelines already built in this process, keyed by engine name, so that
+# repeated calls do not reload the model and tokenizer every time.
+_POS_TRANSFORMERS_PIPELINES = {}
+
+
+def pos_tag_transformers(
+    words: str, engine: str = "bert-base-th-cased-blackboard"
+) -> List[dict]:
+    """
+    Tag text with a transformers token-classification pipeline.
+
+    :param str words: text to be tagged; it is passed to the pipeline as-is
+    :param str engine: name of the model to use:
+        * *bert-base-th-cased-blackboard* (default)
+        * *mdeberta-v3-ud-thai-pud-upos*
+        * *wangchanberta-ud-thai-pud-upos*
+    :return: the pipeline output for *words* (per the transformers
+        documentation, one dict per aggregated token group), or an
+        empty list if *words* is empty
+    :rtype: List[dict]
+    :raises ImportError: if the transformers package cannot be imported
+    :raises ValueError: if *engine* is not one of the supported names
+    """
+    try:
+        from transformers import (
+            AutoModelForTokenClassification,
+            AutoTokenizer,
+            TokenClassificationPipeline,
+        )
+    except ImportError as err:
+        raise ImportError(
+            "Not found transformers! "
+            "Please install transformers by pip install transformers"
+        ) from err
+
+    if not words:
+        return []
+
+    if engine not in _POS_TRANSFORMERS_MODELS:
+        raise ValueError(
+            "pos_tag_transformers not support {0} engine.".format(
+                engine
+            )
+        )
+
+    tagger = _POS_TRANSFORMERS_PIPELINES.get(engine)
+    if tagger is None:
+        model_name = _POS_TRANSFORMERS_MODELS[engine]
+        tagger = TokenClassificationPipeline(
+            model=AutoModelForTokenClassification.from_pretrained(
+                model_name
+            ),
+            tokenizer=AutoTokenizer.from_pretrained(model_name),
+            # "simple" is the non-deprecated spelling of the former
+            # grouped_entities=True setting.
+            aggregation_strategy="simple",
+        )
+        _POS_TRANSFORMERS_PIPELINES[engine] = tagger
+
+    return tagger(words)
diff --git a/tests/test_tag.py b/tests/test_tag.py
index eae51bf30..8d1755b18 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -9,10 +9,11 @@
     perceptron,
     pos_tag,
     pos_tag_sents,
+    pos_tag_transformers,
     unigram,
     tltk,
     NER,
     NNER,
 )
 from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.thainer import ThaiNameTagger
@@ -362,3 +363,19 @@ def test_NER_class(self):
     def test_NNER_class(self):
         nner = NNER()
         self.assertIsNotNone(nner.tag("แมวทำอะไรตอนห้าโมงเช้า"))
+
+    def test_pos_tag_transformers(self):
+        text = "แมวทำอะไรตอนห้าโมงเช้า"
+        engines = (
+            "bert-base-th-cased-blackboard",
+            "mdeberta-v3-ud-thai-pud-upos",
+            "wangchanberta-ud-thai-pud-upos",
+        )
+        for engine in engines:
+            with self.subTest(engine=engine):
+                self.assertIsNotNone(
+                    pos_tag_transformers(words=text, engine=engine)
+                )
+        self.assertEqual(pos_tag_transformers(words=""), [])
+        with self.assertRaises(ValueError):
+            pos_tag_transformers(words=text, engine="non-existing-engine")