|  | 
| 2 | 2 | """ | 
| 3 | 3 | Named-entity recognizer | 
| 4 | 4 | """ | 
|  | 5 | +from typing import List, Tuple, Union | 
| 5 | 6 | 
 | 
| 6 |  | -__all__ = ["ThaiNameTagger"] | 
|  | 7 | +class NER: | 
|  | 8 | +    def __init__(self, engine: str) -> None: | 
|  | 9 | +        self.load_engine(engine=engine) | 
| 7 | 10 | 
 | 
| 8 |  | -from typing import Dict, List, Tuple, Union | 
| 9 |  | - | 
| 10 |  | -from pycrfsuite import Tagger as CRFTagger | 
| 11 |  | -from pythainlp.corpus import get_corpus_path, thai_stopwords | 
| 12 |  | -from pythainlp.tag import pos_tag | 
| 13 |  | -from pythainlp.tokenize import word_tokenize | 
| 14 |  | -from pythainlp.util import isthai | 
| 15 |  | - | 
| 16 |  | -_CORPUS_NAME = "thainer" | 
| 17 |  | -_TOKENIZER_ENGINE = "newmm"  # should be the same as one used in training data | 
| 18 |  | - | 
| 19 |  | - | 
| 20 |  | -def _is_stopword(word: str) -> bool:  # เช็คว่าเป็นคำฟุ่มเฟือย | 
| 21 |  | -    return word in thai_stopwords() | 
| 22 |  | - | 
| 23 |  | - | 
| 24 |  | -def _doc2features(doc, i) -> Dict: | 
| 25 |  | -    word = doc[i][0] | 
| 26 |  | -    postag = doc[i][1] | 
| 27 |  | - | 
| 28 |  | -    # Features from current word | 
| 29 |  | -    features = { | 
| 30 |  | -        "word.word": word, | 
| 31 |  | -        "word.stopword": _is_stopword(word), | 
| 32 |  | -        "word.isthai": isthai(word), | 
| 33 |  | -        "word.isspace": word.isspace(), | 
| 34 |  | -        "postag": postag, | 
| 35 |  | -        "word.isdigit": word.isdigit(), | 
| 36 |  | -    } | 
| 37 |  | -    if word.isdigit() and len(word) == 5: | 
| 38 |  | -        features["word.islen5"] = True | 
| 39 |  | - | 
| 40 |  | -    # Features from previous word | 
| 41 |  | -    if i > 0: | 
| 42 |  | -        prevword = doc[i - 1][0] | 
| 43 |  | -        prevpostag = doc[i - 1][1] | 
| 44 |  | -        prev_features = { | 
| 45 |  | -            "word.prevword": prevword, | 
| 46 |  | -            "word.previsspace": prevword.isspace(), | 
| 47 |  | -            "word.previsthai": isthai(prevword), | 
| 48 |  | -            "word.prevstopword": _is_stopword(prevword), | 
| 49 |  | -            "word.prevpostag": prevpostag, | 
| 50 |  | -            "word.prevwordisdigit": prevword.isdigit(), | 
| 51 |  | -        } | 
| 52 |  | -        features.update(prev_features) | 
| 53 |  | -    else: | 
| 54 |  | -        features["BOS"] = True  # Special "Beginning of Sequence" tag | 
| 55 |  | - | 
| 56 |  | -    # Features from next word | 
| 57 |  | -    if i < len(doc) - 1: | 
| 58 |  | -        nextword = doc[i + 1][0] | 
| 59 |  | -        nextpostag = doc[i + 1][1] | 
| 60 |  | -        next_features = { | 
| 61 |  | -            "word.nextword": nextword, | 
| 62 |  | -            "word.nextisspace": nextword.isspace(), | 
| 63 |  | -            "word.nextpostag": nextpostag, | 
| 64 |  | -            "word.nextisthai": isthai(nextword), | 
| 65 |  | -            "word.nextstopword": _is_stopword(nextword), | 
| 66 |  | -            "word.nextwordisdigit": nextword.isdigit(), | 
| 67 |  | -        } | 
| 68 |  | -        features.update(next_features) | 
| 69 |  | -    else: | 
| 70 |  | -        features["EOS"] = True  # Special "End of Sequence" tag | 
| 71 |  | - | 
| 72 |  | -    return features | 
| 73 |  | - | 
| 74 |  | - | 
| 75 |  | -class ThaiNameTagger: | 
| 76 |  | -    """ | 
| 77 |  | -    Thai named-entity recognizer. | 
| 78 |  | -    :param str version: Thai NER version. | 
| 79 |  | -        It supports Thai NER 1.4 & 1.5. | 
| 80 |  | -        The default value is `1.5` | 
| 81 |  | -     | 
| 82 |  | -    :Example: | 
| 83 |  | -    :: | 
| 84 |  | -
 | 
| 85 |  | -        from pythainlp.tag.named_entity import ThaiNameTagger | 
| 86 |  | -
 | 
| 87 |  | -        thainer15 = ThaiNameTagger(version="1.5") | 
| 88 |  | -        thainer15.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.") | 
| 89 |  | -
 | 
| 90 |  | -        thainer14 = ThaiNameTagger(version="1.4") | 
| 91 |  | -        thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.") | 
| 92 |  | -    """ | 
| 93 |  | -    def __init__(self, version: str = "1.5") -> None: | 
| 94 |  | -        """ | 
| 95 |  | -        Thai named-entity recognizer. | 
| 96 |  | -
 | 
| 97 |  | -        :param str version: Thai NER version. | 
| 98 |  | -                            It supports Thai NER 1.4 & 1.5. | 
| 99 |  | -                            The default value is `1.5` | 
| 100 |  | -        """ | 
| 101 |  | -        self.crf = CRFTagger() | 
| 102 |  | - | 
| 103 |  | -        if version == "1.4": | 
| 104 |  | -            self.crf.open(get_corpus_path("thainer-1.4", version="1.4")) | 
| 105 |  | -            self.pos_tag_name = "orchid_ud" | 
|  | 11 | +    def load_engine(self, engine: str) -> None: | 
|  | 12 | +        self.engine = None | 
|  | 13 | +        if engine == "thainer": | 
|  | 14 | +            from pythainlp.tag.thainer import ThaiNameTagger | 
|  | 15 | +            self.engine = ThaiNameTagger() | 
| 106 | 16 |         else: | 
| 107 |  | -            self.crf.open(get_corpus_path(_CORPUS_NAME, version="1.5")) | 
| 108 |  | -            self.pos_tag_name = "lst20" | 
| 109 |  | - | 
| 110 |  | -    def get_ner( | 
| 111 |  | -        self, text: str, pos: bool = True, tag: bool = False | 
| 112 |  | -    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: | 
| 113 |  | -        """ | 
| 114 |  | -        This function tags named entities in text using the IOB format. | 
| 115 |  | -
 | 
| 116 |  | -        :param str text: text in Thai to be tagged | 
| 117 |  | -        :param bool pos: To include POS tags in the results (`True`) or | 
| 118 |  | -                            exclude (`False`). The default value is `True` | 
| 119 |  | -        :param bool tag: output like html tag. | 
| 120 |  | -        :return: a list of tuple associated with tokenized word, NER tag, | 
| 121 |  | -                 POS tag (if the parameter `pos` is specified as `True`), | 
| 122 |  | -                 and output like html tag (if the parameter `tag` is | 
| 123 |  | -                 specified as `True`). | 
| 124 |  | -                 Otherwise, return a list of tuple associated with tokenized | 
| 125 |  | -                 word and NER tag | 
| 126 |  | -        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str | 
| 127 |  | -
 | 
| 128 |  | -        :Note: | 
| 129 |  | -            * For the POS tags to be included in the results, this function | 
| 130 |  | -              uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron` | 
| 131 |  | -              and corpus as `orchid_ud`. | 
| 132 |  | -
 | 
| 133 |  | -        :Example: | 
| 134 |  | -
 | 
| 135 |  | -            >>> from pythainlp.tag.named_entity import ThaiNameTagger | 
| 136 |  | -            >>> | 
| 137 |  | -            >>> ner = ThaiNameTagger() | 
| 138 |  | -            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.") | 
| 139 |  | -            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), | 
| 140 |  | -            ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'), | 
| 141 |  | -            ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'), | 
| 142 |  | -            ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'), | 
| 143 |  | -            ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'), | 
| 144 |  | -            ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), | 
| 145 |  | -            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), | 
| 146 |  | -            ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'), | 
| 147 |  | -            ('น.', 'NOUN', 'I-TIME')] | 
| 148 |  | -            >>> | 
| 149 |  | -            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", | 
| 150 |  | -                            pos=False) | 
| 151 |  | -            [('วันที่', 'O'), (' ', 'O'), | 
| 152 |  | -            ('15', 'B-DATE'), (' ', 'I-DATE'), | 
| 153 |  | -            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), | 
| 154 |  | -            ('61', 'I-DATE'), (' ', 'O'), | 
| 155 |  | -            ('ทดสอบ', 'O'), ('ระบบ', 'O'), | 
| 156 |  | -            ('เวลา', 'O'), (' ', 'O'), | 
| 157 |  | -            ('14', 'B-TIME'), (':', 'I-TIME'), | 
| 158 |  | -            ('49', 'I-TIME'), (' ', 'I-TIME'), | 
| 159 |  | -            ('น.', 'I-TIME')] | 
| 160 |  | -            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", | 
| 161 |  | -                            tag=True) | 
| 162 |  | -            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>' | 
| 163 |  | -        """ | 
| 164 |  | -        tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE) | 
| 165 |  | -        pos_tags = pos_tag( | 
| 166 |  | -            tokens, | 
| 167 |  | -            engine="perceptron", | 
| 168 |  | -            corpus=self.pos_tag_name | 
| 169 |  | -        ) | 
| 170 |  | -        x_test = ThaiNameTagger.__extract_features(pos_tags) | 
| 171 |  | -        y = self.crf.tag(x_test) | 
| 172 |  | - | 
| 173 |  | -        sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)] | 
| 174 |  | - | 
| 175 |  | -        if tag: | 
| 176 |  | -            temp = "" | 
| 177 |  | -            sent = "" | 
| 178 |  | -            for idx, (word, ner) in enumerate(sent_ner): | 
| 179 |  | -                if ner.startswith("B-") and temp != "": | 
| 180 |  | -                    sent += "</" + temp + ">" | 
| 181 |  | -                    temp = ner[2:] | 
| 182 |  | -                    sent += "<" + temp + ">" | 
| 183 |  | -                elif ner.startswith("B-"): | 
| 184 |  | -                    temp = ner[2:] | 
| 185 |  | -                    sent += "<" + temp + ">" | 
| 186 |  | -                elif ner == "O" and temp != "": | 
| 187 |  | -                    sent += "</" + temp + ">" | 
| 188 |  | -                    temp = "" | 
| 189 |  | -                sent += word | 
| 190 |  | - | 
| 191 |  | -                if idx == len(sent_ner) - 1 and temp != "": | 
| 192 |  | -                    sent += "</" + temp + ">" | 
| 193 |  | - | 
| 194 |  | -            return sent | 
| 195 |  | - | 
| 196 |  | -        if pos: | 
| 197 |  | -            return [ | 
| 198 |  | -                (pos_tags[i][0], pos_tags[i][1], data) | 
| 199 |  | -                for i, data in enumerate(y) | 
| 200 |  | -            ] | 
| 201 |  | - | 
| 202 |  | -        return sent_ner | 
| 203 |  | - | 
| 204 |  | -    @staticmethod | 
| 205 |  | -    def __extract_features(doc): | 
| 206 |  | -        return [_doc2features(doc, i) for i in range(len(doc))] | 
|  | 17 | +            raise ValueError( | 
|  | 18 | +                "ner class not support {0} engine.".format(engine) | 
|  | 19 | +            ) | 
|  | 20 | + | 
|  | 21 | +    def tag( | 
|  | 22 | +        self, | 
|  | 23 | +        text | 
|  | 24 | +    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: | 
|  | 25 | +        return self.engine.get_ner(text) | 
0 commit comments