Skip to content

Commit 701fb3a

Browse files
committed
Add NER class
1 parent 2e9ac04 commit 701fb3a

File tree

3 files changed

+227
-199
lines changed

3 files changed

+227
-199
lines changed

pythainlp/tag/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111
"pos_tag",
1212
"pos_tag_sents",
1313
"tag_provinces",
14+
"chunk_parse",
15+
"NER",
1416
]
1517

1618
from pythainlp.tag.locations import tag_provinces
1719
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents
1820
from pythainlp.tag._tag_perceptron import PerceptronTagger
1921
from pythainlp.tag.chunk import chunk_parse
22+
from pythainlp.tag.named_entity import NER

pythainlp/tag/named_entity.py

Lines changed: 18 additions & 199 deletions
Original file line numberDiff line numberDiff line change
@@ -2,205 +2,24 @@
22
"""
33
Named-entity recognizer
44
"""
5+
from typing import List, Tuple, Union
56

6-
__all__ = ["ThaiNameTagger"]
7+
class NER:
8+
def __init__(self, engine: str) -> None:
9+
self.load_engine(engine=engine)
710

8-
from typing import Dict, List, Tuple, Union
9-
10-
from pycrfsuite import Tagger as CRFTagger
11-
from pythainlp.corpus import get_corpus_path, thai_stopwords
12-
from pythainlp.tag import pos_tag
13-
from pythainlp.tokenize import word_tokenize
14-
from pythainlp.util import isthai
15-
16-
_CORPUS_NAME = "thainer"
17-
_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data
18-
19-
20-
def _is_stopword(word: str) -> bool: # เช็คว่าเป็นคำฟุ่มเฟือย
21-
return word in thai_stopwords()
22-
23-
24-
def _doc2features(doc, i) -> Dict:
25-
word = doc[i][0]
26-
postag = doc[i][1]
27-
28-
# Features from current word
29-
features = {
30-
"word.word": word,
31-
"word.stopword": _is_stopword(word),
32-
"word.isthai": isthai(word),
33-
"word.isspace": word.isspace(),
34-
"postag": postag,
35-
"word.isdigit": word.isdigit(),
36-
}
37-
if word.isdigit() and len(word) == 5:
38-
features["word.islen5"] = True
39-
40-
# Features from previous word
41-
if i > 0:
42-
prevword = doc[i - 1][0]
43-
prevpostag = doc[i - 1][1]
44-
prev_features = {
45-
"word.prevword": prevword,
46-
"word.previsspace": prevword.isspace(),
47-
"word.previsthai": isthai(prevword),
48-
"word.prevstopword": _is_stopword(prevword),
49-
"word.prevpostag": prevpostag,
50-
"word.prevwordisdigit": prevword.isdigit(),
51-
}
52-
features.update(prev_features)
53-
else:
54-
features["BOS"] = True # Special "Beginning of Sequence" tag
55-
56-
# Features from next word
57-
if i < len(doc) - 1:
58-
nextword = doc[i + 1][0]
59-
nextpostag = doc[i + 1][1]
60-
next_features = {
61-
"word.nextword": nextword,
62-
"word.nextisspace": nextword.isspace(),
63-
"word.nextpostag": nextpostag,
64-
"word.nextisthai": isthai(nextword),
65-
"word.nextstopword": _is_stopword(nextword),
66-
"word.nextwordisdigit": nextword.isdigit(),
67-
}
68-
features.update(next_features)
69-
else:
70-
features["EOS"] = True # Special "End of Sequence" tag
71-
72-
return features
73-
74-
75-
class ThaiNameTagger:
76-
"""
77-
Thai named-entity recognizer.
78-
:param str version: Thai NER version.
79-
It supports Thai NER 1.4 & 1.5.
80-
The default value is `1.5`
81-
82-
:Example:
83-
::
84-
85-
from pythainlp.tag.named_entity import ThaiNameTagger
86-
87-
thainer15 = ThaiNameTagger(version="1.5")
88-
thainer15.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
89-
90-
thainer14 = ThaiNameTagger(version="1.4")
91-
thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
92-
"""
93-
def __init__(self, version: str = "1.5") -> None:
94-
"""
95-
Thai named-entity recognizer.
96-
97-
:param str version: Thai NER version.
98-
It supports Thai NER 1.4 & 1.5.
99-
The default value is `1.5`
100-
"""
101-
self.crf = CRFTagger()
102-
103-
if version == "1.4":
104-
self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
105-
self.pos_tag_name = "orchid_ud"
11+
def load_engine(self, engine: str) -> None:
12+
self.engine = None
13+
if engine == "thainer":
14+
from pythainlp.tag.thainer import ThaiNameTagger
15+
self.engine = ThaiNameTagger()
10616
else:
107-
self.crf.open(get_corpus_path(_CORPUS_NAME, version="1.5"))
108-
self.pos_tag_name = "lst20"
109-
110-
def get_ner(
111-
self, text: str, pos: bool = True, tag: bool = False
112-
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
113-
"""
114-
This function tags named entities in text using the IOB format.
115-
116-
:param str text: text in Thai to be tagged
117-
:param bool pos: To include POS tags in the results (`True`) or
118-
exclude (`False`). The default value is `True`
119-
:param bool tag: output like html tag.
120-
:return: a list of tuple associated with tokenized word, NER tag,
121-
POS tag (if the parameter `pos` is specified as `True`),
122-
and output like html tag (if the parameter `tag` is
123-
specified as `True`).
124-
Otherwise, return a list of tuple associated with tokenized
125-
word and NER tag
126-
:rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str
127-
128-
:Note:
129-
* For the POS tags to be included in the results, this function
130-
uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
131-
and corpus as `orchid_ud`.
132-
133-
:Example:
134-
135-
>>> from pythainlp.tag.named_entity import ThaiNameTagger
136-
>>>
137-
>>> ner = ThaiNameTagger()
138-
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
139-
[('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
140-
('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
141-
('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
142-
('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
143-
('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
144-
('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
145-
('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
146-
('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
147-
('น.', 'NOUN', 'I-TIME')]
148-
>>>
149-
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
150-
pos=False)
151-
[('วันที่', 'O'), (' ', 'O'),
152-
('15', 'B-DATE'), (' ', 'I-DATE'),
153-
('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
154-
('61', 'I-DATE'), (' ', 'O'),
155-
('ทดสอบ', 'O'), ('ระบบ', 'O'),
156-
('เวลา', 'O'), (' ', 'O'),
157-
('14', 'B-TIME'), (':', 'I-TIME'),
158-
('49', 'I-TIME'), (' ', 'I-TIME'),
159-
('น.', 'I-TIME')]
160-
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
161-
tag=True)
162-
'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
163-
"""
164-
tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE)
165-
pos_tags = pos_tag(
166-
tokens,
167-
engine="perceptron",
168-
corpus=self.pos_tag_name
169-
)
170-
x_test = ThaiNameTagger.__extract_features(pos_tags)
171-
y = self.crf.tag(x_test)
172-
173-
sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]
174-
175-
if tag:
176-
temp = ""
177-
sent = ""
178-
for idx, (word, ner) in enumerate(sent_ner):
179-
if ner.startswith("B-") and temp != "":
180-
sent += "</" + temp + ">"
181-
temp = ner[2:]
182-
sent += "<" + temp + ">"
183-
elif ner.startswith("B-"):
184-
temp = ner[2:]
185-
sent += "<" + temp + ">"
186-
elif ner == "O" and temp != "":
187-
sent += "</" + temp + ">"
188-
temp = ""
189-
sent += word
190-
191-
if idx == len(sent_ner) - 1 and temp != "":
192-
sent += "</" + temp + ">"
193-
194-
return sent
195-
196-
if pos:
197-
return [
198-
(pos_tags[i][0], pos_tags[i][1], data)
199-
for i, data in enumerate(y)
200-
]
201-
202-
return sent_ner
203-
204-
@staticmethod
205-
def __extract_features(doc):
206-
return [_doc2features(doc, i) for i in range(len(doc))]
17+
raise ValueError(
18+
"ner class not support {0} engine.".format(engine)
19+
)
20+
21+
def tag(
22+
self,
23+
text
24+
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
25+
return self.engine.get_ner(text)

0 commit comments

Comments
 (0)