Skip to content

Commit d897421

Browse files
authored
Merge branch 'add-ner-class' into dev
2 parents d20e864 + dca612e commit d897421

File tree

5 files changed

+302
-226
lines changed

5 files changed

+302
-226
lines changed

docs/api/tag.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,9 @@ Modules
230230
.. autofunction:: pos_tag_sents
231231
.. autofunction:: tag_provinces
232232
.. autofunction:: chunk_parse
233-
.. autoclass:: pythainlp.tag.named_entity.ThaiNameTagger
233+
.. autoclass:: pythainlp.tag.named_entity.NER
234+
:members:
235+
.. autoclass:: pythainlp.tag.thainer.ThaiNameTagger
234236
:members: get_ner
235237
.. autofunction:: pythainlp.tag.tltk.get_ner
236238

pythainlp/tag/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111
"pos_tag",
1212
"pos_tag_sents",
1313
"tag_provinces",
14+
"chunk_parse",
15+
"NER",
1416
]
1517

1618
from pythainlp.tag.locations import tag_provinces
1719
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
1820
from pythainlp.tag._tag_perceptron import PerceptronTagger
1921
from pythainlp.tag.chunk import chunk_parse
22+
from pythainlp.tag.named_entity import NER

pythainlp/tag/named_entity.py

Lines changed: 62 additions & 179 deletions
Original file line numberDiff line numberDiff line change
@@ -2,205 +2,88 @@
22
"""
33
Named-entity recognizer
44
"""
5+
import warnings
6+
from typing import List, Tuple, Union
7+
from pythainlp.tag.thainer import ThaiNameTagger
58

6-
__all__ = ["ThaiNameTagger"]
79

8-
from typing import Dict, List, Tuple, Union
9-
10-
from pycrfsuite import Tagger as CRFTagger
11-
from pythainlp.corpus import get_corpus_path, thai_stopwords
12-
from pythainlp.tag import pos_tag
13-
from pythainlp.tokenize import word_tokenize
14-
from pythainlp.util import isthai
15-
16-
_CORPUS_NAME = "thainer"
17-
_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data
18-
19-
20-
def _is_stopword(word: str) -> bool: # เช็คว่าเป็นคำฟุ่มเฟือย
21-
return word in thai_stopwords()
22-
23-
24-
def _doc2features(doc, i) -> Dict:
25-
word = doc[i][0]
26-
postag = doc[i][1]
27-
28-
# Features from current word
29-
features = {
30-
"word.word": word,
31-
"word.stopword": _is_stopword(word),
32-
"word.isthai": isthai(word),
33-
"word.isspace": word.isspace(),
34-
"postag": postag,
35-
"word.isdigit": word.isdigit(),
36-
}
37-
if word.isdigit() and len(word) == 5:
38-
features["word.islen5"] = True
39-
40-
# Features from previous word
41-
if i > 0:
42-
prevword = doc[i - 1][0]
43-
prevpostag = doc[i - 1][1]
44-
prev_features = {
45-
"word.prevword": prevword,
46-
"word.previsspace": prevword.isspace(),
47-
"word.previsthai": isthai(prevword),
48-
"word.prevstopword": _is_stopword(prevword),
49-
"word.prevpostag": prevpostag,
50-
"word.prevwordisdigit": prevword.isdigit(),
51-
}
52-
features.update(prev_features)
53-
else:
54-
features["BOS"] = True # Special "Beginning of Sequence" tag
55-
56-
# Features from next word
57-
if i < len(doc) - 1:
58-
nextword = doc[i + 1][0]
59-
nextpostag = doc[i + 1][1]
60-
next_features = {
61-
"word.nextword": nextword,
62-
"word.nextisspace": nextword.isspace(),
63-
"word.nextpostag": nextpostag,
64-
"word.nextisthai": isthai(nextword),
65-
"word.nextstopword": _is_stopword(nextword),
66-
"word.nextwordisdigit": nextword.isdigit(),
67-
}
68-
features.update(next_features)
69-
else:
70-
features["EOS"] = True # Special "End of Sequence" tag
71-
72-
return features
73-
74-
75-
class ThaiNameTagger:
10+
class NER:
7611
"""
77-
Thai named-entity recognizer.
78-
:param str version: Thai NER version.
79-
It's support Thai NER 1.4 & 1.5.
80-
The defualt value is `1.5`
81-
82-
:Example:
83-
::
12+
Named-entity recognizer class
8413
85-
from pythainlp.tag.named_entity import ThaiNameTagger
14+
:param str engine: Named-entity recognizer engine
15+
:param str corpus: corpus
8616
87-
thainer15 = ThaiNameTagger(version="1.5")
88-
thainer15.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
17+
**Options for engine**
18+
* *thainer* - Thai NER engine
19+
* *wangchanberta* - wangchanberta model
8920
90-
thainer14 = ThaiNameTagger(version="1.4")
91-
thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
21+
**Options for corpus**
22+
* *thainer* - Thai NER corpus
23+
* *lst20* - lst20 corpus (wangchanberta only)
9224
"""
93-
def __init__(self, version: str = "1.5") -> None:
94-
"""
95-
Thai named-entity recognizer.
96-
97-
:param str version: Thai NER version.
98-
It's support Thai NER 1.4 & 1.5.
99-
The defualt value is `1.5`
100-
"""
101-
self.crf = CRFTagger()
102-
103-
if version == "1.4":
104-
self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
105-
self.pos_tag_name = "orchid_ud"
25+
def __init__(self, engine: str, corpus: str = "thainer") -> None:
26+
self.load_engine(engine=engine, corpus=corpus)
27+
28+
def load_engine(self, engine: str, corpus: str) -> None:
29+
self.name_engine = engine
30+
self.engine = None
31+
if engine == "thainer" and corpus == "thainer":
32+
from pythainlp.tag.thainer import ThaiNameTagger
33+
self.engine = ThaiNameTagger()
34+
elif engine == "wangchanberta":
35+
from pythainlp.wangchanberta import ThaiNameTagger
36+
self.engine = ThaiNameTagger(dataset_name=corpus)
10637
else:
107-
self.crf.open(get_corpus_path(_CORPUS_NAME, version="1.5"))
108-
self.pos_tag_name = "lst20"
109-
110-
def get_ner(
111-
self, text: str, pos: bool = True, tag: bool = False
112-
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
38+
raise ValueError(
39+
"NER class not support {0} engine or {1} corpus.".format(
40+
engine,
41+
corpus
42+
)
43+
)
44+
45+
def tag(
46+
self,
47+
text,
48+
pos=True,
49+
tag=False
50+
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
11351
"""
11452
This function tags named entities from text in IOB format.
11553
11654
:param str text: text in Thai to be tagged
117-
:param bool pos: To include POS tags in the results (`True`) or
118-
exclude (`False`). The defualt value is `True`
55+
:param bool pos: output with part-of-speech tag.\
56+
(not supported by wangchanberta)
11957
:param bool tag: output like html tag.
12058
:return: a list of tuple associated with tokenized word, NER tag,
12159
POS tag (if the parameter `pos` is specified as `True`),
12260
and output like html tag (if the parameter `tag` is
12361
specified as `True`).
12462
Otherwise, return a list of tuple associated with tokenized
12563
word and NER tag
126-
:rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str
127-
128-
:Note:
129-
* For the POS tags to be included in the results, this function
130-
uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
131-
and corpus as orchid_ud`.
132-
64+
:rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
13365
:Example:
13466
135-
>>> from pythainlp.tag.named_entity import ThaiNameTagger
136-
>>>
137-
>>> ner = ThaiNameTagger()
138-
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
139-
[('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
140-
('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
141-
('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
142-
('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
143-
('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
144-
('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
145-
('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
146-
('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
147-
('น.', 'NOUN', 'I-TIME')]
67+
>>> from pythainlp.tag import NER
14868
>>>
149-
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
150-
pos=False)
151-
[('วันที่', 'O'), (' ', 'O'),
152-
('15', 'B-DATE'), (' ', 'I-DATE'),
153-
('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
154-
('61', 'I-DATE'), (' ', 'O'),
155-
('ทดสอบ', 'O'), ('ระบบ', 'O'),
156-
('เวลา', 'O'), (' ', 'O'),
157-
('14', 'B-TIME'), (':', 'I-TIME'),
158-
('49', 'I-TIME'), (' ', 'I-TIME'),
159-
('น.', 'I-TIME')]
160-
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
161-
tag=True)
162-
'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
69+
>>> ner = NER("thainer")
70+
>>> ner.tag("ทดสอบยนายวรรณพงษ์ ภัททิยไพบูลย์")
71+
[('ทดสอบ', 'VV', 'O'),
72+
('นาย', 'NN', 'B-PERSON'),
73+
('วรรณ', 'NN', 'I-PERSON'),
74+
('พงษ์', 'NN', 'I-PERSON'),
75+
(' ', 'PU', 'I-PERSON'),
76+
('ภัททิย', 'NN', 'I-PERSON'),
77+
('ไพบูลย์', 'NN', 'I-PERSON')]
78+
>>> ner.tag("ทดสอบยนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
79+
'ทดสอบย<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
16380
"""
164-
tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE)
165-
pos_tags = pos_tag(
166-
tokens,
167-
engine="perceptron",
168-
corpus=self.pos_tag_name
169-
)
170-
x_test = ThaiNameTagger.__extract_features(pos_tags)
171-
y = self.crf.tag(x_test)
172-
173-
sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]
174-
175-
if tag:
176-
temp = ""
177-
sent = ""
178-
for idx, (word, ner) in enumerate(sent_ner):
179-
if ner.startswith("B-") and temp != "":
180-
sent += "</" + temp + ">"
181-
temp = ner[2:]
182-
sent += "<" + temp + ">"
183-
elif ner.startswith("B-"):
184-
temp = ner[2:]
185-
sent += "<" + temp + ">"
186-
elif ner == "O" and temp != "":
187-
sent += "</" + temp + ">"
188-
temp = ""
189-
sent += word
190-
191-
if idx == len(sent_ner) - 1 and temp != "":
192-
sent += "</" + temp + ">"
193-
194-
return sent
195-
196-
if pos:
197-
return [
198-
(pos_tags[i][0], pos_tags[i][1], data)
199-
for i, data in enumerate(y)
200-
]
201-
202-
return sent_ner
203-
204-
@staticmethod
205-
def __extract_features(doc):
206-
return [_doc2features(doc, i) for i in range(len(doc))]
81+
if pos and self.name_engine == "wangchanberta":
82+
warnings.warn(
83+
"""wangchanberta is not support part-of-speech tag.
84+
It have not part-of-speech tag in output."""
85+
)
86+
if self.name_engine == "wangchanberta":
87+
return self.engine.get_ner(text, tag=tag)
88+
else:
89+
return self.engine.get_ner(text, tag=tag, pos=pos)

0 commit comments

Comments
 (0)