|
2 | 2 | """ |
3 | 3 | Named-entity recognizer |
4 | 4 | """ |
| 5 | +import warnings |
| 6 | +from typing import List, Tuple, Union |
| 7 | +from pythainlp.tag.thainer import ThaiNameTagger |
5 | 8 |
|
6 | | -__all__ = ["ThaiNameTagger"] |
7 | 9 |
|
8 | | -from typing import Dict, List, Tuple, Union |
9 | | - |
10 | | -from pycrfsuite import Tagger as CRFTagger |
11 | | -from pythainlp.corpus import get_corpus_path, thai_stopwords |
12 | | -from pythainlp.tag import pos_tag |
13 | | -from pythainlp.tokenize import word_tokenize |
14 | | -from pythainlp.util import isthai |
15 | | - |
16 | | -_CORPUS_NAME = "thainer" |
17 | | -_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data |
18 | | - |
19 | | - |
20 | | -def _is_stopword(word: str) -> bool: # เช็คว่าเป็นคำฟุ่มเฟือย |
21 | | - return word in thai_stopwords() |
22 | | - |
23 | | - |
24 | | -def _doc2features(doc, i) -> Dict: |
25 | | - word = doc[i][0] |
26 | | - postag = doc[i][1] |
27 | | - |
28 | | - # Features from current word |
29 | | - features = { |
30 | | - "word.word": word, |
31 | | - "word.stopword": _is_stopword(word), |
32 | | - "word.isthai": isthai(word), |
33 | | - "word.isspace": word.isspace(), |
34 | | - "postag": postag, |
35 | | - "word.isdigit": word.isdigit(), |
36 | | - } |
37 | | - if word.isdigit() and len(word) == 5: |
38 | | - features["word.islen5"] = True |
39 | | - |
40 | | - # Features from previous word |
41 | | - if i > 0: |
42 | | - prevword = doc[i - 1][0] |
43 | | - prevpostag = doc[i - 1][1] |
44 | | - prev_features = { |
45 | | - "word.prevword": prevword, |
46 | | - "word.previsspace": prevword.isspace(), |
47 | | - "word.previsthai": isthai(prevword), |
48 | | - "word.prevstopword": _is_stopword(prevword), |
49 | | - "word.prevpostag": prevpostag, |
50 | | - "word.prevwordisdigit": prevword.isdigit(), |
51 | | - } |
52 | | - features.update(prev_features) |
53 | | - else: |
54 | | - features["BOS"] = True # Special "Beginning of Sequence" tag |
55 | | - |
56 | | - # Features from next word |
57 | | - if i < len(doc) - 1: |
58 | | - nextword = doc[i + 1][0] |
59 | | - nextpostag = doc[i + 1][1] |
60 | | - next_features = { |
61 | | - "word.nextword": nextword, |
62 | | - "word.nextisspace": nextword.isspace(), |
63 | | - "word.nextpostag": nextpostag, |
64 | | - "word.nextisthai": isthai(nextword), |
65 | | - "word.nextstopword": _is_stopword(nextword), |
66 | | - "word.nextwordisdigit": nextword.isdigit(), |
67 | | - } |
68 | | - features.update(next_features) |
69 | | - else: |
70 | | - features["EOS"] = True # Special "End of Sequence" tag |
71 | | - |
72 | | - return features |
73 | | - |
74 | | - |
75 | | -class ThaiNameTagger: |
| 10 | +class NER: |
76 | 11 | """ |
77 | | - Thai named-entity recognizer. |
78 | | - :param str version: Thai NER version. |
79 | | - It's support Thai NER 1.4 & 1.5. |
80 | | - The defualt value is `1.5` |
81 | | - |
82 | | - :Example: |
83 | | - :: |
| 12 | + Named-entity recognizer class |
84 | 13 |
|
85 | | - from pythainlp.tag.named_entity import ThaiNameTagger |
| 14 | + :param str engine: Named-entity recognizer engine |
| 15 | + :param str corpus: corpus |
86 | 16 |
|
87 | | - thainer15 = ThaiNameTagger(version="1.5") |
88 | | - thainer15.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.") |
| 17 | + **Options for engine** |
| 18 | + * *thainer* - Thai NER engine |
| 19 | + * *wangchanberta* - wangchanberta model |
89 | 20 |
|
90 | | - thainer14 = ThaiNameTagger(version="1.4") |
91 | | - thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.") |
| 21 | + **Options for corpus** |
| 22 | + * *thainer* - Thai NER corpus |
| 23 | + * *lst20* - lst20 corpus (wangchanberta only) |
92 | 24 | """ |
93 | | - def __init__(self, version: str = "1.5") -> None: |
94 | | - """ |
95 | | - Thai named-entity recognizer. |
96 | | -
|
97 | | - :param str version: Thai NER version. |
98 | | - It's support Thai NER 1.4 & 1.5. |
99 | | - The defualt value is `1.5` |
100 | | - """ |
101 | | - self.crf = CRFTagger() |
102 | | - |
103 | | - if version == "1.4": |
104 | | - self.crf.open(get_corpus_path("thainer-1.4", version="1.4")) |
105 | | - self.pos_tag_name = "orchid_ud" |
| 25 | + def __init__(self, engine: str, corpus: str = "thainer") -> None: |
| 26 | + self.load_engine(engine=engine, corpus=corpus) |
| 27 | + |
| 28 | + def load_engine(self, engine: str, corpus: str) -> None: |
| 29 | + self.name_engine = engine |
| 30 | + self.engine = None |
| 31 | + if engine == "thainer" and corpus == "thainer": |
| 32 | + from pythainlp.tag.thainer import ThaiNameTagger |
| 33 | + self.engine = ThaiNameTagger() |
| 34 | + elif engine == "wangchanberta": |
| 35 | + from pythainlp.wangchanberta import ThaiNameTagger |
| 36 | + self.engine = ThaiNameTagger(dataset_name=corpus) |
106 | 37 | else: |
107 | | - self.crf.open(get_corpus_path(_CORPUS_NAME, version="1.5")) |
108 | | - self.pos_tag_name = "lst20" |
109 | | - |
110 | | - def get_ner( |
111 | | - self, text: str, pos: bool = True, tag: bool = False |
112 | | - ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: |
| 38 | + raise ValueError( |
| 39 | + "NER class not support {0} engine or {1} corpus.".format( |
| 40 | + engine, |
| 41 | + corpus |
| 42 | + ) |
| 43 | + ) |
| 44 | + |
| 45 | + def tag( |
| 46 | + self, |
| 47 | + text, |
| 48 | + pos=True, |
| 49 | + tag=False |
| 50 | + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: |
113 | 51 | """ |
114 | 52 | This function tags named entities in text using the IOB format. |
115 | 53 |
|
116 | 54 | :param str text: text in Thai to be tagged |
117 | | - :param bool pos: To include POS tags in the results (`True`) or |
118 | | - exclude (`False`). The defualt value is `True` |
| 55 | + :param bool pos: include part-of-speech tags in the output.\ |
| 56 | + (not supported by wangchanberta) |
119 | 57 | :param bool tag: output like html tag. |
120 | 58 | :return: a list of tuple associated with tokenized word, NER tag, |
121 | 59 | POS tag (if the parameter `pos` is specified as `True`), |
122 | 60 | and output like html tag (if the parameter `tag` is |
123 | 61 | specified as `True`). |
124 | 62 | Otherwise, return a list of tuple associated with tokenized |
125 | 63 | word and NER tag |
126 | | - :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str |
127 | | -
|
128 | | - :Note: |
129 | | - * For the POS tags to be included in the results, this function |
130 | | - uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron` |
131 | | - and corpus as orchid_ud`. |
132 | | -
|
| 64 | + :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str] |
133 | 65 | :Example: |
134 | 66 |
|
135 | | - >>> from pythainlp.tag.named_entity import ThaiNameTagger |
136 | | - >>> |
137 | | - >>> ner = ThaiNameTagger() |
138 | | - >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.") |
139 | | - [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), |
140 | | - ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'), |
141 | | - ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'), |
142 | | - ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'), |
143 | | - ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'), |
144 | | - ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), |
145 | | - ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), |
146 | | - ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'), |
147 | | - ('น.', 'NOUN', 'I-TIME')] |
| 67 | + >>> from pythainlp.tag import NER |
148 | 68 | >>> |
149 | | - >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", |
150 | | - pos=False) |
151 | | - [('วันที่', 'O'), (' ', 'O'), |
152 | | - ('15', 'B-DATE'), (' ', 'I-DATE'), |
153 | | - ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), |
154 | | - ('61', 'I-DATE'), (' ', 'O'), |
155 | | - ('ทดสอบ', 'O'), ('ระบบ', 'O'), |
156 | | - ('เวลา', 'O'), (' ', 'O'), |
157 | | - ('14', 'B-TIME'), (':', 'I-TIME'), |
158 | | - ('49', 'I-TIME'), (' ', 'I-TIME'), |
159 | | - ('น.', 'I-TIME')] |
160 | | - >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", |
161 | | - tag=True) |
162 | | - 'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>' |
| 69 | + >>> ner = NER("thainer") |
| 70 | + >>> ner.tag("ทดสอบยนายวรรณพงษ์ ภัททิยไพบูลย์") |
| 71 | + [('ทดสอบ', 'VV', 'O'), |
| 72 | + ('นาย', 'NN', 'B-PERSON'), |
| 73 | + ('วรรณ', 'NN', 'I-PERSON'), |
| 74 | + ('พงษ์', 'NN', 'I-PERSON'), |
| 75 | + (' ', 'PU', 'I-PERSON'), |
| 76 | + ('ภัททิย', 'NN', 'I-PERSON'), |
| 77 | + ('ไพบูลย์', 'NN', 'I-PERSON')] |
| 78 | + >>> ner.tag("ทดสอบยนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True) |
| 79 | + 'ทดสอบย<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>' |
163 | 80 | """ |
164 | | - tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE) |
165 | | - pos_tags = pos_tag( |
166 | | - tokens, |
167 | | - engine="perceptron", |
168 | | - corpus=self.pos_tag_name |
169 | | - ) |
170 | | - x_test = ThaiNameTagger.__extract_features(pos_tags) |
171 | | - y = self.crf.tag(x_test) |
172 | | - |
173 | | - sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)] |
174 | | - |
175 | | - if tag: |
176 | | - temp = "" |
177 | | - sent = "" |
178 | | - for idx, (word, ner) in enumerate(sent_ner): |
179 | | - if ner.startswith("B-") and temp != "": |
180 | | - sent += "</" + temp + ">" |
181 | | - temp = ner[2:] |
182 | | - sent += "<" + temp + ">" |
183 | | - elif ner.startswith("B-"): |
184 | | - temp = ner[2:] |
185 | | - sent += "<" + temp + ">" |
186 | | - elif ner == "O" and temp != "": |
187 | | - sent += "</" + temp + ">" |
188 | | - temp = "" |
189 | | - sent += word |
190 | | - |
191 | | - if idx == len(sent_ner) - 1 and temp != "": |
192 | | - sent += "</" + temp + ">" |
193 | | - |
194 | | - return sent |
195 | | - |
196 | | - if pos: |
197 | | - return [ |
198 | | - (pos_tags[i][0], pos_tags[i][1], data) |
199 | | - for i, data in enumerate(y) |
200 | | - ] |
201 | | - |
202 | | - return sent_ner |
203 | | - |
204 | | - @staticmethod |
205 | | - def __extract_features(doc): |
206 | | - return [_doc2features(doc, i) for i in range(len(doc))] |
| 81 | + if pos and self.name_engine == "wangchanberta": |
| 82 | + warnings.warn( |
| 83 | + """wangchanberta is not support part-of-speech tag. |
| 84 | + It have not part-of-speech tag in output.""" |
| 85 | + ) |
| 86 | + if self.name_engine == "wangchanberta": |
| 87 | + return self.engine.get_ner(text, tag=tag) |
| 88 | + else: |
| 89 | + return self.engine.get_ner(text, tag=tag, pos=pos) |
0 commit comments