Merge pull request #3 from PyThaiNLP/dev

konbraphat51 · web-flow · commit e39b62262855 · 2023-11-09T23:55:55.000+09:00
update
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร
 | Version | Description | Status |
 |:------:|:--:|:------:|
 | [4.0](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/714) |
-| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 4.1  | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
+| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 5.0 | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
 
 
 ## Getting Started
diff --git a/README_TH.md b/README_TH.md
@@ -21,7 +21,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร
 | รุ่น | คำอธิบาย | สถานะ |
 |:------:|:--:|:------:|
 | [4.0](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/714) |
-| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 4.1  | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
+| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 5.0  | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
 
 ติดตามพวกเราบน [PyThaiNLP Facebook page](https://www.facebook.com/pythainlp/) เพื่อรับข่าวสารเพิ่มเติม
 
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
@@ -17,7 +17,7 @@
 #
 # URL: <https://pythainlp.github.io/>
 # For license information, see LICENSE
-__version__ = "4.1.0beta5"
+__version__ = "5.0.0dev0"
 
 thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"  # 44 chars
 
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
@@ -27,10 +27,11 @@
     "chunk_parse",
     "NER",
     "NNER",
+    "pos_tag_transformers"
 ]
 
 from pythainlp.tag.locations import tag_provinces
-from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
+from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
 from pythainlp.tag._tag_perceptron import PerceptronTagger
 from pythainlp.tag.chunk import chunk_parse
 from pythainlp.tag.named_entity import NER, NNER
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
@@ -15,6 +15,7 @@
 from typing import List, Tuple
 
 
+
 def pos_tag(
     words: List[str], engine: str = "perceptron", corpus: str = "orchid"
 ) -> List[Tuple[str, str]]:
@@ -176,3 +177,47 @@ def pos_tag_sents(
         return []
 
     return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
+
+
+def pos_tag_transformers(
+    words: str, engine: str = "bert-base-th-cased-blackboard"
+):
+    """
+    Tag ``words`` with a Hugging Face token-classification pipeline.
+    engine: "wangchanberta-ud-thai-pud-upos", "mdeberta-v3-ud-thai-pud-upos"
+    or "bert-base-th-cased-blackboard"; any other name raises ValueError.
+    Returns [] for empty ``words``, otherwise the raw pipeline output.
+    """
+
+    try:  # transformers is imported only when this function is called
+        from transformers import AutoModelForTokenClassification, \
+            AutoTokenizer, TokenClassificationPipeline
+    except ImportError:
+        raise ImportError(
+            "Not found transformers! Please install transformers by pip install transformers")
+
+    if not words:
+        return []
+
+    if engine == "wangchanberta-ud-thai-pud-upos":
+        model = AutoModelForTokenClassification.from_pretrained(
+            "Pavarissy/wangchanberta-ud-thai-pud-upos")
+        tokenizer = AutoTokenizer.from_pretrained("Pavarissy/wangchanberta-ud-thai-pud-upos")
+    elif engine == "mdeberta-v3-ud-thai-pud-upos":
+        model = AutoModelForTokenClassification.from_pretrained(
+            "Pavarissy/mdeberta-v3-ud-thai-pud-upos")
+        tokenizer = AutoTokenizer.from_pretrained("Pavarissy/mdeberta-v3-ud-thai-pud-upos")
+    elif engine == "bert-base-th-cased-blackboard":
+        model = AutoModelForTokenClassification.from_pretrained("lunarlist/pos_thai")
+        tokenizer = AutoTokenizer.from_pretrained("lunarlist/pos_thai")
+    else:
+        raise ValueError(
+            "pos_tag_transformers not support {0} engine.".format(
+                engine
+            )
+        )
+
+    pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, grouped_entities=True)
+
+    outputs = pipeline(words)
+    return outputs
diff --git a/pythainlp/tag/thainer.py b/pythainlp/tag/thainer.py
@@ -25,7 +25,7 @@
 from pythainlp.tokenize import word_tokenize
 from pythainlp.util import isthai
 
-_TOKENIZER_ENGINE = "newmm"  # should be same as that used in training data
+_TOKENIZER_ENGINE = "mm"
 
 
 def _is_stopword(word: str) -> bool:  # เช็คว่าเป็นคำฟุ่มเฟือย
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 4.1.0beta5
+current_version = 5.0.0dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
diff --git a/setup.py b/setup.py
@@ -172,7 +172,7 @@
 
 setup(
     name="pythainlp",
-    version="4.1.0beta5",
+    version="5.0.0dev0",
     description="Thai Natural Language Processing library",
     long_description=readme,
     long_description_content_type="text/markdown",
diff --git a/tests/test_tag.py b/tests/test_tag.py
@@ -9,10 +9,12 @@
     perceptron,
     pos_tag,
     pos_tag_sents,
+    pos_tag_transformers,
     unigram,
     tltk,
     NER,
     NNER,
+
 )
 from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.thainer import ThaiNameTagger
@@ -362,3 +364,13 @@ def test_NER_class(self):
     def test_NNER_class(self):
         nner = NNER()
         self.assertIsNotNone(nner.tag("แมวทำอะไรตอนห้าโมงเช้า"))
+
+    def test_pos_tag_transformers(self):  # each supported engine must return non-None
+        self.assertIsNotNone(pos_tag_transformers(
+            words="แมวทำอะไรตอนห้าโมงเช้า", engine="bert-base-th-cased-blackboard"))
+        self.assertIsNotNone(pos_tag_transformers(
+            words="แมวทำอะไรตอนห้าโมงเช้า", engine="mdeberta-v3-ud-thai-pud-upos"))
+        self.assertIsNotNone(pos_tag_transformers(
+            words="แมวทำอะไรตอนห้าโมงเช้า", engine="wangchanberta-ud-thai-pud-upos"))
+        with self.assertRaises(ValueError):  # an unrecognised engine name is rejected
+            pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine="non-existing-engine")

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`#`
`18`	`18`	`# URL: <https://pythainlp.github.io/>`
`19`	`19`	`# For license information, see LICENSE`
`20`		`-__version__ = "4.1.0beta5"`
	`20`	`+__version__ = "5.0.0dev0"`
`21`	`21`
`22`	`22`	`thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars`
`23`	`23`