1515from typing import List , Tuple
1616
1717
18+
1819def pos_tag (
1920 words : List [str ], engine : str = "perceptron" , corpus : str = "orchid"
2021) -> List [Tuple [str , str ]]:
@@ -176,3 +177,47 @@ def pos_tag_sents(
176177 return []
177178
178179 return [pos_tag (sent , engine = engine , corpus = corpus ) for sent in sentences ]
180+
181+
def pos_tag_transformers(
    words: str, engine: str = "bert-base-th-cased-blackboard"
):
    """
    Tag text with a Hugging Face ``transformers`` token-classification model.

    :param str words: input handed unchanged to the token-classification
        pipeline
    :param str engine: name of the model to use (see the options below)
    :return: an empty list when ``words`` is falsy; otherwise whatever
        ``TokenClassificationPipeline`` returns for ``words``
    :raises ValueError: if ``engine`` is not one of the supported names
    :raises ImportError: if the ``transformers`` package is not installed

    **Options for engine**
        * *bert-base-th-cased-blackboard* (default) - loads
          ``lunarlist/pos_thai``
        * *wangchanberta-ud-thai-pud-upos* - loads
          ``Pavarissy/wangchanberta-ud-thai-pud-upos``
        * *mdeberta-v3-ud-thai-pud-upos* - loads
          ``Pavarissy/mdeberta-v3-ud-thai-pud-upos``
    """
    # Cheap checks come first so that empty input and an unknown engine are
    # handled without importing transformers or loading any model.
    if not words:
        return []

    # Supported engine name -> model identifier given to from_pretrained().
    model_ids = {
        "wangchanberta-ud-thai-pud-upos":
            "Pavarissy/wangchanberta-ud-thai-pud-upos",
        "mdeberta-v3-ud-thai-pud-upos":
            "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
        "bert-base-th-cased-blackboard": "lunarlist/pos_thai",
    }
    model_id = model_ids.get(engine)
    if model_id is None:
        raise ValueError(
            "pos_tag_transformers not support {0} engine.".format(engine)
        )

    # transformers is an optional dependency, so it is imported lazily here
    # rather than at module import time.
    try:
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
            TokenClassificationPipeline,
        )
    except ImportError as err:
        raise ImportError(
            "Not found transformers! Please install transformers by pip install transformers"
        ) from err

    # The model and the tokenizer are both loaded from the same identifier.
    model = AutoModelForTokenClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # NOTE(review): recent transformers releases appear to document
    # `grouped_entities` as deprecated in favour of `aggregation_strategy`;
    # kept unchanged here so the output format does not move - confirm
    # against the transformers version this project supports.
    pipeline = TokenClassificationPipeline(
        model=model, tokenizer=tokenizer, grouped_entities=True
    )

    return pipeline(words)
0 commit comments