diff --git a/stanza/utils/datasets/ner/convert_he_iahlt.py b/stanza/utils/datasets/ner/convert_he_iahlt.py index 81b40aaba..b1550b1da 100644 --- a/stanza/utils/datasets/ner/convert_he_iahlt.py +++ b/stanza/utils/datasets/ner/convert_he_iahlt.py @@ -6,73 +6,93 @@ import stanza.utils.default_paths as default_paths from stanza.utils.datasets.ner.utils import write_dataset +_RE_ENTITY_SPLIT = re.compile(r"([()])") + def output_entities(sentence): + # Build a single print call for all entities in the sentence for faster output + entities = [] for word in sentence.words: misc = word.misc if misc is None: continue - - pieces = misc.split("|") - for piece in pieces: + # Avoid repeated split if there's only one piece + if "Entity=" not in misc: + continue + for piece in misc.split("|"): if piece.startswith("Entity="): - entity = piece.split("=", maxsplit=1)[1] - print(" " + entity) + # Use partition for more efficient single split + _, _, entity = piece.partition("=") + entities.append(" " + entity) break + if entities: + print("\n".join(entities)) def extract_single_sentence(sentence): current_entity = [] words = [] + append_word = words.append # Localize method for speed + for word in sentence.words: text = word.text misc = word.misc - if misc is None: - pieces = [] - else: - pieces = misc.split("|") + pieces = misc.split("|") if misc else [] closes = [] first_entity = False + + # Fast path for common case of no entities for piece in pieces: - if piece.startswith("Entity="): - entity = piece.split("=", maxsplit=1)[1] - entity_pieces = re.split(r"([()])", entity) - entity_pieces = [x for x in entity_pieces if x] # remove blanks from re.split - entity_idx = 0 - while entity_idx < len(entity_pieces): - if entity_pieces[entity_idx] == '(': - assert len(entity_pieces) > entity_idx + 1, "Opening an unspecified entity" - if len(current_entity) == 0: - first_entity = True - current_entity.append(entity_pieces[entity_idx + 1]) - entity_idx += 2 - elif entity_pieces[entity_idx] == ')': - assert entity_idx != 0, "Closing an unspecified entity" - closes.append(entity_pieces[entity_idx-1]) - entity_idx += 1 - else: - # the entities themselves get added or removed via the () - entity_idx += 1 - - if len(current_entity) == 0: - entity = 'O' + # Use the faster in-string check to avoid startswith unless needed + if not piece or piece[0] != 'E' or not piece.startswith("Entity="): + continue + # Use partition rather than split for a slight speed gain + _, _, entity = piece.partition("=") + # Avoid repeated regex compilation with the precompiled one + entity_pieces = [x for x in _RE_ENTITY_SPLIT.split(entity) if x] # remove blanks + entity_idx = 0 + entity_pieces_len = len(entity_pieces) + while entity_idx < entity_pieces_len: + piece_value = entity_pieces[entity_idx] + if piece_value == '(': + # Combine assertion into conditional for a very modest gain + if entity_idx + 1 >= entity_pieces_len: + raise AssertionError("Opening an unspecified entity") + if not current_entity: + first_entity = True + current_entity.append(entity_pieces[entity_idx + 1]) + entity_idx += 2 + elif piece_value == ')': + if entity_idx == 0: + raise AssertionError("Closing an unspecified entity") + closes.append(entity_pieces[entity_idx - 1]) + entity_idx += 1 + else: + entity_idx += 1 + + # Avoid unnecessary list operation if current_entity is empty + if not current_entity: + entity_tag = 'O' else: - entity = current_entity[0] - entity = "B-" + entity if first_entity else "I-" + entity - words.append((text, entity)) + entity_tag = current_entity[0] + entity_tag = "B-" + entity_tag if first_entity else "I-" + entity_tag + append_word((text, entity_tag)) - assert len(current_entity) >= len(closes), "Too many closes for the current open entities" + if len(current_entity) < len(closes): + raise AssertionError("Too many closes for the current open entities") for close_entity in closes: - # TODO: check the close is closing the right thing - assert close_entity == current_entity[-1], "Closed the wrong entity: %s vs %s" % (close_entity, current_entity[-1]) - current_entity = current_entity[:-1] + last = current_entity[-1] + if close_entity != last: + raise AssertionError("Closed the wrong entity: %s vs %s" % (close_entity, last)) + current_entity.pop() return words def extract_sentences(doc): sentences = [] + append_sentence = sentences.append # Localize append for small speedup for sentence in doc.sentences: try: words = extract_single_sentence(sentence) - sentences.append(words) + append_sentence(words) except AssertionError as e: print("Skipping sentence %s ... %s" % (sentence.sent_id, str(e))) output_entities(sentence)