Skip to content

Commit e39b622

Browse files
authored
Merge pull request #3 from PyThaiNLP/dev
update
2 parents 2e2f0cf + edb52b3 commit e39b622

File tree

9 files changed

+65
-7
lines changed

9 files changed

+65
-7
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร
2424
| Version | Description | Status |
2525
|:------:|:--:|:------:|
2626
| [4.0](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/714) |
27-
| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 4.1 | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
27+
| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 5.0 | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
2828

2929

3030
## Getting Started

README_TH.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร
2121
| รุ่น | คำอธิบาย | สถานะ |
2222
|:------:|:--:|:------:|
2323
| [4.0](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/714) |
24-
| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 4.1 | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
24+
| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 5.0 | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
2525

2626
ติดตามพวกเราบน [PyThaiNLP Facebook page](https://www.facebook.com/pythainlp/) เพื่อรับข่าวสารเพิ่มเติม
2727

pythainlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
#
1818
# URL: <https://pythainlp.github.io/>
1919
# For license information, see LICENSE
20-
__version__ = "4.1.0beta5"
20+
__version__ = "5.0.0dev0"
2121

2222
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars
2323

pythainlp/tag/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,11 @@
2727
"chunk_parse",
2828
"NER",
2929
"NNER",
30+
"pos_tag_transformers"
3031
]
3132

3233
from pythainlp.tag.locations import tag_provinces
33-
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
34+
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
3435
from pythainlp.tag._tag_perceptron import PerceptronTagger
3536
from pythainlp.tag.chunk import chunk_parse
3637
from pythainlp.tag.named_entity import NER, NNER

pythainlp/tag/pos_tag.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from typing import List, Tuple
1616

1717

18+
1819
def pos_tag(
1920
words: List[str], engine: str = "perceptron", corpus: str = "orchid"
2021
) -> List[Tuple[str, str]]:
@@ -176,3 +177,47 @@ def pos_tag_sents(
176177
return []
177178

178179
return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
180+
181+
182+
def pos_tag_transformers(
    words: str, engine: str = "bert-base-th-cased-blackboard"
):
    """
    Tag parts of speech in a text with a pretrained transformers model.

    :param str words: text to tag; it is passed unchanged to the
        token-classification pipeline
    :param str engine: short name of the model to use, one of:

        * *bert-base-th-cased-blackboard* (default)
        * *mdeberta-v3-ud-thai-pud-upos*
        * *wangchanberta-ud-thai-pud-upos*

    :return: an empty list if ``words`` is empty; otherwise whatever the
        ``TokenClassificationPipeline`` (created with
        ``grouped_entities=True``) returns for ``words``
    :raises ValueError: if ``engine`` is not one of the supported names
    :raises ImportError: if the ``transformers`` package is not installed
    """
    # Short engine name -> model identifier handed to ``from_pretrained``.
    model_ids = {
        "wangchanberta-ud-thai-pud-upos":
            "Pavarissy/wangchanberta-ud-thai-pud-upos",
        "mdeberta-v3-ud-thai-pud-upos":
            "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
        "bert-base-th-cased-blackboard": "lunarlist/pos_thai",
    }

    # Cheap checks come first so that empty input and a misspelled engine
    # are handled without needing the optional dependency or loading a model.
    if not words:
        return []

    if engine not in model_ids:
        raise ValueError(
            "pos_tag_transformers not support {0} engine.".format(
                engine
            )
        )

    try:
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
            TokenClassificationPipeline,
        )
    except ImportError as err:
        # Keep the original failure attached as the cause for debugging.
        raise ImportError(
            "Not found transformers! Please install transformers by pip install transformers"
        ) from err

    model_id = model_ids[engine]
    model = AutoModelForTokenClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    pipeline = TokenClassificationPipeline(
        model=model, tokenizer=tokenizer, grouped_entities=True
    )
    return pipeline(words)

pythainlp/tag/thainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from pythainlp.tokenize import word_tokenize
2626
from pythainlp.util import isthai
2727

28-
_TOKENIZER_ENGINE = "newmm" # should be same as that used in training data
28+
_TOKENIZER_ENGINE = "mm"
2929

3030

3131
def _is_stopword(word: str) -> bool: # เช็คว่าเป็นคำฟุ่มเฟือย

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 4.1.0beta5
2+
current_version = 5.0.0dev0
33
commit = True
44
tag = True
55
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@
172172

173173
setup(
174174
name="pythainlp",
175-
version="4.1.0beta5",
175+
version="5.0.0dev0",
176176
description="Thai Natural Language Processing library",
177177
long_description=readme,
178178
long_description_content_type="text/markdown",

tests/test_tag.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@
99
perceptron,
1010
pos_tag,
1111
pos_tag_sents,
12+
pos_tag_transformers,
1213
unigram,
1314
tltk,
1415
NER,
1516
NNER,
17+
1618
)
1719
from pythainlp.tag.locations import tag_provinces
1820
from pythainlp.tag.thainer import ThaiNameTagger
@@ -362,3 +364,13 @@ def test_NER_class(self):
362364
def test_NNER_class(self):
    """Check that NNER.tag gives a non-None result for a Thai sentence."""
    result = NNER().tag("แมวทำอะไรตอนห้าโมงเช้า")
    self.assertIsNotNone(result)
367+
368+
def test_pos_tag_transformers(self):
    """Check each supported engine returns a result and unknown ones fail."""
    text = "แมวทำอะไรตอนห้าโมงเช้า"
    # Same engines, in the same order, as the individual assertions.
    for name in (
        "bert-base-th-cased-blackboard",
        "mdeberta-v3-ud-thai-pud-upos",
        "wangchanberta-ud-thai-pud-upos",
    ):
        self.assertIsNotNone(pos_tag_transformers(words=text, engine=name))
    # An engine name outside the supported set must be rejected.
    with self.assertRaises(ValueError):
        pos_tag_transformers(words=text, engine="non-existing-engine")

0 commit comments

Comments
 (0)